xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision 86b3c6f5)
173679edcSLukas Ertl /*-
273679edcSLukas Ertl  * Copyright (c) 2004 Lukas Ertl
373679edcSLukas Ertl  * All rights reserved.
473679edcSLukas Ertl  *
573679edcSLukas Ertl  * Redistribution and use in source and binary forms, with or without
673679edcSLukas Ertl  * modification, are permitted provided that the following conditions
773679edcSLukas Ertl  * are met:
873679edcSLukas Ertl  * 1. Redistributions of source code must retain the above copyright
973679edcSLukas Ertl  *    notice, this list of conditions and the following disclaimer.
1073679edcSLukas Ertl  * 2. Redistributions in binary form must reproduce the above copyright
1173679edcSLukas Ertl  *    notice, this list of conditions and the following disclaimer in the
1273679edcSLukas Ertl  *    documentation and/or other materials provided with the distribution.
1373679edcSLukas Ertl  *
1473679edcSLukas Ertl  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1573679edcSLukas Ertl  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1673679edcSLukas Ertl  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1773679edcSLukas Ertl  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
1873679edcSLukas Ertl  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1973679edcSLukas Ertl  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2073679edcSLukas Ertl  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2173679edcSLukas Ertl  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2273679edcSLukas Ertl  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2373679edcSLukas Ertl  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2473679edcSLukas Ertl  * SUCH DAMAGE.
2573679edcSLukas Ertl  */
2673679edcSLukas Ertl 
2773679edcSLukas Ertl #include <sys/cdefs.h>
2873679edcSLukas Ertl __FBSDID("$FreeBSD$");
2973679edcSLukas Ertl 
3073679edcSLukas Ertl #include <sys/param.h>
3173679edcSLukas Ertl #include <sys/bio.h>
3273679edcSLukas Ertl #include <sys/conf.h>
3373679edcSLukas Ertl #include <sys/errno.h>
3473679edcSLukas Ertl #include <sys/kernel.h>
3573679edcSLukas Ertl #include <sys/kthread.h>
3673679edcSLukas Ertl #include <sys/libkern.h>
3773679edcSLukas Ertl #include <sys/lock.h>
3873679edcSLukas Ertl #include <sys/malloc.h>
3973679edcSLukas Ertl #include <sys/mutex.h>
4073679edcSLukas Ertl #include <sys/systm.h>
4173679edcSLukas Ertl 
4273679edcSLukas Ertl #include <geom/geom.h>
4373679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h>
4473679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h>
4573679edcSLukas Ertl #include <geom/vinum/geom_vinum.h>
4673679edcSLukas Ertl 
47fb4e65d0SLukas Ertl int	gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48fb4e65d0SLukas Ertl 	    int *, int *);
49fb4e65d0SLukas Ertl 
5073679edcSLukas Ertl /*
5173679edcSLukas Ertl  * Check if the stripe that the work packet wants is already being used by
5273679edcSLukas Ertl  * some other work packet.
5373679edcSLukas Ertl  */
5473679edcSLukas Ertl int
5567e3ab6eSLukas Ertl gv_stripe_active(struct gv_plex *p, struct bio *bp)
5673679edcSLukas Ertl {
5767e3ab6eSLukas Ertl 	struct gv_raid5_packet *wp, *owp;
5867e3ab6eSLukas Ertl 	int overlap;
5973679edcSLukas Ertl 
6067e3ab6eSLukas Ertl 	wp = bp->bio_driver1;
6167e3ab6eSLukas Ertl 	if (wp->lockbase == -1)
6273679edcSLukas Ertl 		return (0);
6373679edcSLukas Ertl 
6467e3ab6eSLukas Ertl 	overlap = 0;
6567e3ab6eSLukas Ertl 	TAILQ_FOREACH(owp, &p->packets, list) {
6667e3ab6eSLukas Ertl 		if (owp == wp)
6773679edcSLukas Ertl 			break;
6867e3ab6eSLukas Ertl 		if ((wp->lockbase >= owp->lockbase) &&
6967e3ab6eSLukas Ertl 		    (wp->lockbase <= owp->lockbase + owp->length)) {
7067e3ab6eSLukas Ertl 			overlap++;
7173679edcSLukas Ertl 			break;
7273679edcSLukas Ertl 		}
7367e3ab6eSLukas Ertl 		if ((wp->lockbase <= owp->lockbase) &&
7467e3ab6eSLukas Ertl 		    (wp->lockbase + wp->length >= owp->lockbase)) {
7567e3ab6eSLukas Ertl 			overlap++;
7667e3ab6eSLukas Ertl 			break;
7767e3ab6eSLukas Ertl 		}
7867e3ab6eSLukas Ertl 	}
7973679edcSLukas Ertl 
8067e3ab6eSLukas Ertl 	return (overlap);
8173679edcSLukas Ertl }
8273679edcSLukas Ertl 
83c3aadfb9SLukas Ertl int
84fb5885afSLukas Ertl gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
85fb5885afSLukas Ertl     caddr_t addr, off_t boff, off_t bcount)
86fb5885afSLukas Ertl {
87fb5885afSLukas Ertl 	struct gv_sd *parity, *s;
88fb5885afSLukas Ertl 	struct gv_bioq *bq;
89fb5885afSLukas Ertl 	struct bio *cbp, *pbp;
90fb5885afSLukas Ertl 	int i, psdno;
91fb5885afSLukas Ertl 	off_t real_len, real_off;
92fb5885afSLukas Ertl 
93fb5885afSLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
94fb5885afSLukas Ertl 		return (ENXIO);
95fb5885afSLukas Ertl 
96fb5885afSLukas Ertl 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);
97fb5885afSLukas Ertl 
98fb5885afSLukas Ertl 	/* Find the right subdisk. */
99fb5885afSLukas Ertl 	parity = NULL;
100fb5885afSLukas Ertl 	i = 0;
101fb5885afSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
102fb5885afSLukas Ertl 		if (i == psdno) {
103fb5885afSLukas Ertl 			parity = s;
104fb5885afSLukas Ertl 			break;
105fb5885afSLukas Ertl 		}
106fb5885afSLukas Ertl 		i++;
107fb5885afSLukas Ertl 	}
108fb5885afSLukas Ertl 
109fb5885afSLukas Ertl 	/* Parity stripe not found. */
110fb5885afSLukas Ertl 	if (parity == NULL)
111fb5885afSLukas Ertl 		return (ENXIO);
112fb5885afSLukas Ertl 
113fb5885afSLukas Ertl 	if (parity->state != GV_SD_UP)
114fb5885afSLukas Ertl 		return (ENXIO);
115fb5885afSLukas Ertl 
116fb5885afSLukas Ertl 	wp->length = real_len;
117fb5885afSLukas Ertl 	wp->data = addr;
118fb5885afSLukas Ertl 	wp->lockbase = real_off;
119fb5885afSLukas Ertl 
120fb5885afSLukas Ertl 	/* Read all subdisks. */
121fb5885afSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
122fb5885afSLukas Ertl 		/* Skip the parity subdisk. */
123fb5885afSLukas Ertl 		if (s == parity)
124fb5885afSLukas Ertl 			continue;
125fb5885afSLukas Ertl 
126fb5885afSLukas Ertl 		cbp = g_clone_bio(bp);
127fb5885afSLukas Ertl 		if (cbp == NULL)
128fb5885afSLukas Ertl 			return (ENOMEM);
129fb5885afSLukas Ertl 		cbp->bio_cmd = BIO_READ;
130fb5885afSLukas Ertl 		cbp->bio_data = g_malloc(real_len, M_WAITOK);
131fb5885afSLukas Ertl 		cbp->bio_cflags |= GV_BIO_MALLOC;
132fb5885afSLukas Ertl 		cbp->bio_offset = real_off;
133fb5885afSLukas Ertl 		cbp->bio_length = real_len;
134fb5885afSLukas Ertl 		cbp->bio_done = gv_plex_done;
135fb5885afSLukas Ertl 		cbp->bio_caller2 = s->consumer;
136fb5885afSLukas Ertl 		cbp->bio_driver1 = wp;
137fb5885afSLukas Ertl 
138fb5885afSLukas Ertl 		GV_ENQUEUE(bp, cbp, pbp);
139fb5885afSLukas Ertl 
140fb5885afSLukas Ertl 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
141fb5885afSLukas Ertl 		bq->bp = cbp;
142fb5885afSLukas Ertl 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
143fb5885afSLukas Ertl 	}
144fb5885afSLukas Ertl 
145fb5885afSLukas Ertl 	/* Read the parity data. */
146fb5885afSLukas Ertl 	cbp = g_clone_bio(bp);
147fb5885afSLukas Ertl 	if (cbp == NULL)
148fb5885afSLukas Ertl 		return (ENOMEM);
149fb5885afSLukas Ertl 	cbp->bio_cmd = BIO_READ;
150fb5885afSLukas Ertl 	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
151fb5885afSLukas Ertl 	cbp->bio_cflags |= GV_BIO_MALLOC;
152fb5885afSLukas Ertl 	cbp->bio_offset = real_off;
153fb5885afSLukas Ertl 	cbp->bio_length = real_len;
154fb5885afSLukas Ertl 	cbp->bio_done = gv_plex_done;
155fb5885afSLukas Ertl 	cbp->bio_caller2 = parity->consumer;
156fb5885afSLukas Ertl 	cbp->bio_driver1 = wp;
157fb5885afSLukas Ertl 	wp->waiting = cbp;
158fb5885afSLukas Ertl 
159fb5885afSLukas Ertl 	/*
160fb5885afSLukas Ertl 	 * In case we want to rebuild the parity, create an extra BIO to write
161fb5885afSLukas Ertl 	 * it out.  It also acts as buffer for the XOR operations.
162fb5885afSLukas Ertl 	 */
163fb5885afSLukas Ertl 	cbp = g_clone_bio(bp);
164fb5885afSLukas Ertl 	if (cbp == NULL)
165fb5885afSLukas Ertl 		return (ENOMEM);
166fb5885afSLukas Ertl 	cbp->bio_data = addr;
167fb5885afSLukas Ertl 	cbp->bio_offset = real_off;
168fb5885afSLukas Ertl 	cbp->bio_length = real_len;
169fb5885afSLukas Ertl 	cbp->bio_done = gv_plex_done;
170fb5885afSLukas Ertl 	cbp->bio_caller2 = parity->consumer;
171fb5885afSLukas Ertl 	cbp->bio_driver1 = wp;
172fb5885afSLukas Ertl 	wp->parity = cbp;
173fb5885afSLukas Ertl 
174fb5885afSLukas Ertl 	return (0);
175fb5885afSLukas Ertl }
176fb5885afSLukas Ertl 
177fb5885afSLukas Ertl /* Rebuild a degraded RAID5 plex. */
178fb5885afSLukas Ertl int
179c3aadfb9SLukas Ertl gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180c3aadfb9SLukas Ertl     caddr_t addr, off_t boff, off_t bcount)
181c3aadfb9SLukas Ertl {
182c3aadfb9SLukas Ertl 	struct gv_sd *broken, *s;
183c3aadfb9SLukas Ertl 	struct gv_bioq *bq;
184c3aadfb9SLukas Ertl 	struct bio *cbp, *pbp;
185fb4e65d0SLukas Ertl 	off_t real_len, real_off;
186c3aadfb9SLukas Ertl 
187c3aadfb9SLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
188c3aadfb9SLukas Ertl 		return (ENXIO);
189c3aadfb9SLukas Ertl 
190fb4e65d0SLukas Ertl 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);
191c3aadfb9SLukas Ertl 
192c3aadfb9SLukas Ertl 	/* Find the right subdisk. */
193c3aadfb9SLukas Ertl 	broken = NULL;
194c3aadfb9SLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
195c3aadfb9SLukas Ertl 		if (s->state != GV_SD_UP)
196c3aadfb9SLukas Ertl 			broken = s;
197c3aadfb9SLukas Ertl 	}
198c3aadfb9SLukas Ertl 
199fb5885afSLukas Ertl 	/* Broken stripe not found. */
200c3aadfb9SLukas Ertl 	if (broken == NULL)
201c3aadfb9SLukas Ertl 		return (ENXIO);
202c3aadfb9SLukas Ertl 
203c3aadfb9SLukas Ertl 	switch (broken->state) {
204c3aadfb9SLukas Ertl 	case GV_SD_UP:
205c3aadfb9SLukas Ertl 		return (EINVAL);
206c3aadfb9SLukas Ertl 
207c3aadfb9SLukas Ertl 	case GV_SD_STALE:
208c3aadfb9SLukas Ertl 		if (!(bp->bio_cflags & GV_BIO_REBUILD))
209c3aadfb9SLukas Ertl 			return (ENXIO);
210c3aadfb9SLukas Ertl 
21186b3c6f5SUlf Lilleengen 		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
212c3aadfb9SLukas Ertl 		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
213c3aadfb9SLukas Ertl 		break;
214c3aadfb9SLukas Ertl 
215c3aadfb9SLukas Ertl 	case GV_SD_REVIVING:
216c3aadfb9SLukas Ertl 		break;
217c3aadfb9SLukas Ertl 
218c3aadfb9SLukas Ertl 	default:
219c3aadfb9SLukas Ertl 		/* All other subdisk states mean it's not accessible. */
220c3aadfb9SLukas Ertl 		return (ENXIO);
221c3aadfb9SLukas Ertl 	}
222c3aadfb9SLukas Ertl 
223c3aadfb9SLukas Ertl 	wp->length = real_len;
224c3aadfb9SLukas Ertl 	wp->data = addr;
225c3aadfb9SLukas Ertl 	wp->lockbase = real_off;
226c3aadfb9SLukas Ertl 
227fb4e65d0SLukas Ertl 	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
228c3aadfb9SLukas Ertl 
229c3aadfb9SLukas Ertl 	/* Read all subdisks. */
230c3aadfb9SLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
231c3aadfb9SLukas Ertl 		/* Skip the broken subdisk. */
232c3aadfb9SLukas Ertl 		if (s == broken)
233c3aadfb9SLukas Ertl 			continue;
234c3aadfb9SLukas Ertl 
235c3aadfb9SLukas Ertl 		cbp = g_clone_bio(bp);
236c3aadfb9SLukas Ertl 		if (cbp == NULL)
237c3aadfb9SLukas Ertl 			return (ENOMEM);
238c3aadfb9SLukas Ertl 		cbp->bio_cmd = BIO_READ;
239c3aadfb9SLukas Ertl 		cbp->bio_data = g_malloc(real_len, M_WAITOK);
240c3aadfb9SLukas Ertl 		cbp->bio_cflags |= GV_BIO_MALLOC;
241c3aadfb9SLukas Ertl 		cbp->bio_offset = real_off;
242c3aadfb9SLukas Ertl 		cbp->bio_length = real_len;
243c3aadfb9SLukas Ertl 		cbp->bio_done = gv_plex_done;
244c3aadfb9SLukas Ertl 		cbp->bio_caller2 = s->consumer;
245c3aadfb9SLukas Ertl 		cbp->bio_driver1 = wp;
246c3aadfb9SLukas Ertl 
247c3aadfb9SLukas Ertl 		GV_ENQUEUE(bp, cbp, pbp);
248c3aadfb9SLukas Ertl 
249c3aadfb9SLukas Ertl 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
250c3aadfb9SLukas Ertl 		bq->bp = cbp;
251c3aadfb9SLukas Ertl 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
252c3aadfb9SLukas Ertl 	}
253c3aadfb9SLukas Ertl 
254c3aadfb9SLukas Ertl 	/* Write the parity data. */
255c3aadfb9SLukas Ertl 	cbp = g_clone_bio(bp);
256c3aadfb9SLukas Ertl 	if (cbp == NULL)
257c3aadfb9SLukas Ertl 		return (ENOMEM);
258c3aadfb9SLukas Ertl 	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
259c3aadfb9SLukas Ertl 	cbp->bio_cflags |= GV_BIO_MALLOC;
260c3aadfb9SLukas Ertl 	cbp->bio_offset = real_off;
261c3aadfb9SLukas Ertl 	cbp->bio_length = real_len;
262c3aadfb9SLukas Ertl 	cbp->bio_done = gv_plex_done;
263c3aadfb9SLukas Ertl 	cbp->bio_caller2 = broken->consumer;
264c3aadfb9SLukas Ertl 	cbp->bio_driver1 = wp;
265c3aadfb9SLukas Ertl 	cbp->bio_cflags |= GV_BIO_REBUILD;
266c3aadfb9SLukas Ertl 	wp->parity = cbp;
267c3aadfb9SLukas Ertl 
268c3aadfb9SLukas Ertl 	p->synced = boff;
269c3aadfb9SLukas Ertl 
270c3aadfb9SLukas Ertl 	return (0);
271c3aadfb9SLukas Ertl }
272c3aadfb9SLukas Ertl 
27373679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */
27473679edcSLukas Ertl int
27567e3ab6eSLukas Ertl gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
27667e3ab6eSLukas Ertl     struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
27773679edcSLukas Ertl {
27873679edcSLukas Ertl 	struct g_geom *gp;
27973679edcSLukas Ertl 	struct gv_sd *broken, *original, *parity, *s;
28067e3ab6eSLukas Ertl 	struct gv_bioq *bq;
28167e3ab6eSLukas Ertl 	struct bio *cbp, *pbp;
28267e3ab6eSLukas Ertl 	int i, psdno, sdno, type;
283fb4e65d0SLukas Ertl 	off_t real_len, real_off;
28473679edcSLukas Ertl 
28573679edcSLukas Ertl 	gp = bp->bio_to->geom;
28673679edcSLukas Ertl 
28773679edcSLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
28873679edcSLukas Ertl 		return (ENXIO);
28973679edcSLukas Ertl 
29073679edcSLukas Ertl 	/* We are optimistic and assume that this request will be OK. */
29167e3ab6eSLukas Ertl #define	REQ_TYPE_NORMAL		0
29267e3ab6eSLukas Ertl #define	REQ_TYPE_DEGRADED	1
29367e3ab6eSLukas Ertl #define	REQ_TYPE_NOPARITY	2
29467e3ab6eSLukas Ertl 
29567e3ab6eSLukas Ertl 	type = REQ_TYPE_NORMAL;
29673679edcSLukas Ertl 	original = parity = broken = NULL;
29773679edcSLukas Ertl 
298fb4e65d0SLukas Ertl 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
29973679edcSLukas Ertl 
30073679edcSLukas Ertl 	/* Find the right subdisks. */
30173679edcSLukas Ertl 	i = 0;
30273679edcSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
30373679edcSLukas Ertl 		if (i == sdno)
30473679edcSLukas Ertl 			original = s;
30573679edcSLukas Ertl 		if (i == psdno)
30673679edcSLukas Ertl 			parity = s;
30773679edcSLukas Ertl 		if (s->state != GV_SD_UP)
30873679edcSLukas Ertl 			broken = s;
30973679edcSLukas Ertl 		i++;
31073679edcSLukas Ertl 	}
31173679edcSLukas Ertl 
31273679edcSLukas Ertl 	if ((original == NULL) || (parity == NULL))
31373679edcSLukas Ertl 		return (ENXIO);
31473679edcSLukas Ertl 
31573679edcSLukas Ertl 	/* Our data stripe is missing. */
31673679edcSLukas Ertl 	if (original->state != GV_SD_UP)
31767e3ab6eSLukas Ertl 		type = REQ_TYPE_DEGRADED;
31873679edcSLukas Ertl 	/* Our parity stripe is missing. */
31973679edcSLukas Ertl 	if (parity->state != GV_SD_UP) {
32073679edcSLukas Ertl 		/* We cannot take another failure if we're already degraded. */
32167e3ab6eSLukas Ertl 		if (type != REQ_TYPE_NORMAL)
32273679edcSLukas Ertl 			return (ENXIO);
32373679edcSLukas Ertl 		else
32467e3ab6eSLukas Ertl 			type = REQ_TYPE_NOPARITY;
32573679edcSLukas Ertl 	}
32673679edcSLukas Ertl 
32767e3ab6eSLukas Ertl 	wp->length = real_len;
32873679edcSLukas Ertl 	wp->data = addr;
32967e3ab6eSLukas Ertl 	wp->lockbase = real_off;
33073679edcSLukas Ertl 
33173679edcSLukas Ertl 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
33273679edcSLukas Ertl 
333c3aadfb9SLukas Ertl 	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
334c3aadfb9SLukas Ertl 		type = REQ_TYPE_NORMAL;
335c3aadfb9SLukas Ertl 
33673679edcSLukas Ertl 	switch (bp->bio_cmd) {
33773679edcSLukas Ertl 	case BIO_READ:
33873679edcSLukas Ertl 		/*
33973679edcSLukas Ertl 		 * For a degraded read we need to read in all stripes except
34073679edcSLukas Ertl 		 * the broken one plus the parity stripe and then recalculate
34173679edcSLukas Ertl 		 * the desired data.
34273679edcSLukas Ertl 		 */
34367e3ab6eSLukas Ertl 		if (type == REQ_TYPE_DEGRADED) {
34467e3ab6eSLukas Ertl 			bzero(wp->data, wp->length);
34573679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
34673679edcSLukas Ertl 				/* Skip the broken subdisk. */
34773679edcSLukas Ertl 				if (s == broken)
34873679edcSLukas Ertl 					continue;
34967e3ab6eSLukas Ertl 				cbp = g_clone_bio(bp);
35067e3ab6eSLukas Ertl 				if (cbp == NULL)
35173679edcSLukas Ertl 					return (ENOMEM);
35267e3ab6eSLukas Ertl 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
35367e3ab6eSLukas Ertl 				cbp->bio_cflags |= GV_BIO_MALLOC;
35467e3ab6eSLukas Ertl 				cbp->bio_offset = real_off;
35567e3ab6eSLukas Ertl 				cbp->bio_length = real_len;
35667e3ab6eSLukas Ertl 				cbp->bio_done = gv_plex_done;
35767e3ab6eSLukas Ertl 				cbp->bio_caller2 = s->consumer;
35867e3ab6eSLukas Ertl 				cbp->bio_driver1 = wp;
35967e3ab6eSLukas Ertl 
36067e3ab6eSLukas Ertl 				GV_ENQUEUE(bp, cbp, pbp);
36167e3ab6eSLukas Ertl 
36267e3ab6eSLukas Ertl 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
36367e3ab6eSLukas Ertl 				bq->bp = cbp;
36467e3ab6eSLukas Ertl 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
36573679edcSLukas Ertl 			}
36673679edcSLukas Ertl 
36773679edcSLukas Ertl 		/* A normal read can be fulfilled with the original subdisk. */
36873679edcSLukas Ertl 		} else {
36967e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
37067e3ab6eSLukas Ertl 			if (cbp == NULL)
37173679edcSLukas Ertl 				return (ENOMEM);
37267e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
37367e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
37467e3ab6eSLukas Ertl 			cbp->bio_data = addr;
37567e3ab6eSLukas Ertl 			cbp->bio_done = g_std_done;
37667e3ab6eSLukas Ertl 			cbp->bio_caller2 = original->consumer;
37767e3ab6eSLukas Ertl 
37867e3ab6eSLukas Ertl 			GV_ENQUEUE(bp, cbp, pbp);
37973679edcSLukas Ertl 		}
38073679edcSLukas Ertl 		wp->lockbase = -1;
38167e3ab6eSLukas Ertl 
38273679edcSLukas Ertl 		break;
38373679edcSLukas Ertl 
38473679edcSLukas Ertl 	case BIO_WRITE:
38573679edcSLukas Ertl 		/*
38673679edcSLukas Ertl 		 * A degraded write means we cannot write to the original data
38773679edcSLukas Ertl 		 * subdisk.  Thus we need to read in all valid stripes,
38873679edcSLukas Ertl 		 * recalculate the parity from the original data, and then
38973679edcSLukas Ertl 		 * write the parity stripe back out.
39073679edcSLukas Ertl 		 */
39167e3ab6eSLukas Ertl 		if (type == REQ_TYPE_DEGRADED) {
39267e3ab6eSLukas Ertl 			/* Read all subdisks. */
39373679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
39473679edcSLukas Ertl 				/* Skip the broken and the parity subdisk. */
39567e3ab6eSLukas Ertl 				if ((s == broken) || (s == parity))
39673679edcSLukas Ertl 					continue;
39773679edcSLukas Ertl 
39867e3ab6eSLukas Ertl 				cbp = g_clone_bio(bp);
39967e3ab6eSLukas Ertl 				if (cbp == NULL)
40073679edcSLukas Ertl 					return (ENOMEM);
40167e3ab6eSLukas Ertl 				cbp->bio_cmd = BIO_READ;
40267e3ab6eSLukas Ertl 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
40367e3ab6eSLukas Ertl 				cbp->bio_cflags |= GV_BIO_MALLOC;
40467e3ab6eSLukas Ertl 				cbp->bio_offset = real_off;
40567e3ab6eSLukas Ertl 				cbp->bio_length = real_len;
40667e3ab6eSLukas Ertl 				cbp->bio_done = gv_plex_done;
40767e3ab6eSLukas Ertl 				cbp->bio_caller2 = s->consumer;
40867e3ab6eSLukas Ertl 				cbp->bio_driver1 = wp;
40967e3ab6eSLukas Ertl 
41067e3ab6eSLukas Ertl 				GV_ENQUEUE(bp, cbp, pbp);
41167e3ab6eSLukas Ertl 
41267e3ab6eSLukas Ertl 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
41367e3ab6eSLukas Ertl 				bq->bp = cbp;
41467e3ab6eSLukas Ertl 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
41573679edcSLukas Ertl 			}
41673679edcSLukas Ertl 
41767e3ab6eSLukas Ertl 			/* Write the parity data. */
41867e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
41967e3ab6eSLukas Ertl 			if (cbp == NULL)
42073679edcSLukas Ertl 				return (ENOMEM);
42167e3ab6eSLukas Ertl 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
42267e3ab6eSLukas Ertl 			cbp->bio_cflags |= GV_BIO_MALLOC;
42367e3ab6eSLukas Ertl 			bcopy(addr, cbp->bio_data, real_len);
42467e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
42567e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
42667e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
42767e3ab6eSLukas Ertl 			cbp->bio_caller2 = parity->consumer;
42867e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
42967e3ab6eSLukas Ertl 			wp->parity = cbp;
43073679edcSLukas Ertl 
43173679edcSLukas Ertl 		/*
43267e3ab6eSLukas Ertl 		 * When the parity stripe is missing we just write out the data.
43373679edcSLukas Ertl 		 */
43467e3ab6eSLukas Ertl 		} else if (type == REQ_TYPE_NOPARITY) {
43567e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
43667e3ab6eSLukas Ertl 			if (cbp == NULL)
437291cb0acSLukas Ertl 				return (ENOMEM);
43867e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
43967e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
44067e3ab6eSLukas Ertl 			cbp->bio_data = addr;
44167e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
44267e3ab6eSLukas Ertl 			cbp->bio_caller2 = original->consumer;
44367e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
44473679edcSLukas Ertl 
44567e3ab6eSLukas Ertl 			GV_ENQUEUE(bp, cbp, pbp);
44673679edcSLukas Ertl 
44767e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
44867e3ab6eSLukas Ertl 			bq->bp = cbp;
44967e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
45073679edcSLukas Ertl 
45173679edcSLukas Ertl 		/*
45273679edcSLukas Ertl 		 * A normal write request goes to the original subdisk, then we
45373679edcSLukas Ertl 		 * read in all other stripes, recalculate the parity and write
45473679edcSLukas Ertl 		 * out the parity again.
45573679edcSLukas Ertl 		 */
45673679edcSLukas Ertl 		} else {
45767e3ab6eSLukas Ertl 			/* Read old parity. */
45867e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
45967e3ab6eSLukas Ertl 			if (cbp == NULL)
460291cb0acSLukas Ertl 				return (ENOMEM);
46167e3ab6eSLukas Ertl 			cbp->bio_cmd = BIO_READ;
46267e3ab6eSLukas Ertl 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
46367e3ab6eSLukas Ertl 			cbp->bio_cflags |= GV_BIO_MALLOC;
46467e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
46567e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
46667e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
46767e3ab6eSLukas Ertl 			cbp->bio_caller2 = parity->consumer;
46867e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
46973679edcSLukas Ertl 
47067e3ab6eSLukas Ertl 			GV_ENQUEUE(bp, cbp, pbp);
47167e3ab6eSLukas Ertl 
47267e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
47367e3ab6eSLukas Ertl 			bq->bp = cbp;
47467e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
47567e3ab6eSLukas Ertl 
47667e3ab6eSLukas Ertl 			/* Read old data. */
47767e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
47867e3ab6eSLukas Ertl 			if (cbp == NULL)
47973679edcSLukas Ertl 				return (ENOMEM);
48067e3ab6eSLukas Ertl 			cbp->bio_cmd = BIO_READ;
48167e3ab6eSLukas Ertl 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
48267e3ab6eSLukas Ertl 			cbp->bio_cflags |= GV_BIO_MALLOC;
48367e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
48467e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
48567e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
48667e3ab6eSLukas Ertl 			cbp->bio_caller2 = original->consumer;
48767e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
48867e3ab6eSLukas Ertl 
48967e3ab6eSLukas Ertl 			GV_ENQUEUE(bp, cbp, pbp);
49067e3ab6eSLukas Ertl 
49167e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
49267e3ab6eSLukas Ertl 			bq->bp = cbp;
49367e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
49467e3ab6eSLukas Ertl 
49567e3ab6eSLukas Ertl 			/* Write new data. */
49667e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
49767e3ab6eSLukas Ertl 			if (cbp == NULL)
49867e3ab6eSLukas Ertl 				return (ENOMEM);
49967e3ab6eSLukas Ertl 			cbp->bio_data = addr;
50067e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
50167e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
50267e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
50367e3ab6eSLukas Ertl 			cbp->bio_caller2 = original->consumer;
50467e3ab6eSLukas Ertl 
50567e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
50667e3ab6eSLukas Ertl 
50773679edcSLukas Ertl 			/*
50867e3ab6eSLukas Ertl 			 * We must not write the new data until the old data
50967e3ab6eSLukas Ertl 			 * was read, so hold this BIO back until we're ready
51067e3ab6eSLukas Ertl 			 * for it.
51173679edcSLukas Ertl 			 */
51267e3ab6eSLukas Ertl 			wp->waiting = cbp;
51367e3ab6eSLukas Ertl 
51467e3ab6eSLukas Ertl 			/* The final bio for the parity. */
51567e3ab6eSLukas Ertl 			cbp = g_clone_bio(bp);
51667e3ab6eSLukas Ertl 			if (cbp == NULL)
517291cb0acSLukas Ertl 				return (ENOMEM);
51867e3ab6eSLukas Ertl 			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
51967e3ab6eSLukas Ertl 			cbp->bio_cflags |= GV_BIO_MALLOC;
52067e3ab6eSLukas Ertl 			cbp->bio_offset = real_off;
52167e3ab6eSLukas Ertl 			cbp->bio_length = real_len;
52267e3ab6eSLukas Ertl 			cbp->bio_done = gv_plex_done;
52367e3ab6eSLukas Ertl 			cbp->bio_caller2 = parity->consumer;
52467e3ab6eSLukas Ertl 			cbp->bio_driver1 = wp;
52567e3ab6eSLukas Ertl 
52667e3ab6eSLukas Ertl 			/* Remember that this is the BIO for the parity data. */
52767e3ab6eSLukas Ertl 			wp->parity = cbp;
52873679edcSLukas Ertl 		}
52973679edcSLukas Ertl 		break;
53067e3ab6eSLukas Ertl 
53173679edcSLukas Ertl 	default:
53273679edcSLukas Ertl 		return (EINVAL);
53373679edcSLukas Ertl 	}
53473679edcSLukas Ertl 
53573679edcSLukas Ertl 	return (0);
53673679edcSLukas Ertl }
537fb4e65d0SLukas Ertl 
538fb4e65d0SLukas Ertl /* Calculate the offsets in the various subdisks for a RAID5 request. */
539fb4e65d0SLukas Ertl int
540fb4e65d0SLukas Ertl gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541fb4e65d0SLukas Ertl     off_t *real_len, int *sdno, int *psdno)
542fb4e65d0SLukas Ertl {
543fb4e65d0SLukas Ertl 	int sd, psd;
544fb4e65d0SLukas Ertl 	off_t len_left, stripeend, stripeoff, stripestart;
545fb4e65d0SLukas Ertl 
546fb4e65d0SLukas Ertl 	/* The number of the subdisk containing the parity stripe. */
547fb4e65d0SLukas Ertl 	psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
548fb4e65d0SLukas Ertl 	    p->sdcount;
549fb4e65d0SLukas Ertl 	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
550fb4e65d0SLukas Ertl 
551fb4e65d0SLukas Ertl 	/* Offset of the start address from the start of the stripe. */
552fb4e65d0SLukas Ertl 	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
553fb4e65d0SLukas Ertl 	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
554fb4e65d0SLukas Ertl 
555fb4e65d0SLukas Ertl 	/* The number of the subdisk where the stripe resides. */
556fb4e65d0SLukas Ertl 	sd = stripeoff / p->stripesize;
557fb4e65d0SLukas Ertl 	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
558fb4e65d0SLukas Ertl 
559fb4e65d0SLukas Ertl 	/* At or past parity subdisk. */
560fb4e65d0SLukas Ertl 	if (sd >= psd)
561fb4e65d0SLukas Ertl 		sd++;
562fb4e65d0SLukas Ertl 
563fb4e65d0SLukas Ertl 	/* The offset of the stripe on this subdisk. */
564fb4e65d0SLukas Ertl 	stripestart = (boff - stripeoff) / (p->sdcount - 1);
565fb4e65d0SLukas Ertl 	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
566fb4e65d0SLukas Ertl 
567fb4e65d0SLukas Ertl 	stripeoff %= p->stripesize;
568fb4e65d0SLukas Ertl 
569fb4e65d0SLukas Ertl 	/* The offset of the request on this subdisk. */
570fb4e65d0SLukas Ertl 	*real_off = stripestart + stripeoff;
571fb4e65d0SLukas Ertl 
572fb4e65d0SLukas Ertl 	stripeend = stripestart + p->stripesize;
573fb4e65d0SLukas Ertl 	len_left = stripeend - *real_off;
574fb4e65d0SLukas Ertl 	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
575fb4e65d0SLukas Ertl 
576fb4e65d0SLukas Ertl 	*real_len = (bcount <= len_left) ? bcount : len_left;
577fb4e65d0SLukas Ertl 
578fb4e65d0SLukas Ertl 	if (sdno != NULL)
579fb4e65d0SLukas Ertl 		*sdno = sd;
580fb4e65d0SLukas Ertl 	if (psdno != NULL)
581fb4e65d0SLukas Ertl 		*psdno = psd;
582fb4e65d0SLukas Ertl 
583fb4e65d0SLukas Ertl 	return (0);
584fb4e65d0SLukas Ertl }
585