xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision 73679edc)
173679edcSLukas Ertl /*-
273679edcSLukas Ertl  * Copyright (c) 2004 Lukas Ertl
373679edcSLukas Ertl  * All rights reserved.
473679edcSLukas Ertl  *
573679edcSLukas Ertl  * Redistribution and use in source and binary forms, with or without
673679edcSLukas Ertl  * modification, are permitted provided that the following conditions
773679edcSLukas Ertl  * are met:
873679edcSLukas Ertl  * 1. Redistributions of source code must retain the above copyright
973679edcSLukas Ertl  *    notice, this list of conditions and the following disclaimer.
1073679edcSLukas Ertl  * 2. Redistributions in binary form must reproduce the above copyright
1173679edcSLukas Ertl  *    notice, this list of conditions and the following disclaimer in the
1273679edcSLukas Ertl  *    documentation and/or other materials provided with the distribution.
1373679edcSLukas Ertl  *
1473679edcSLukas Ertl  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1573679edcSLukas Ertl  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1673679edcSLukas Ertl  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1773679edcSLukas Ertl  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
1873679edcSLukas Ertl  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1973679edcSLukas Ertl  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2073679edcSLukas Ertl  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2173679edcSLukas Ertl  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2273679edcSLukas Ertl  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2373679edcSLukas Ertl  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2473679edcSLukas Ertl  * SUCH DAMAGE.
2573679edcSLukas Ertl  */
2673679edcSLukas Ertl 
2773679edcSLukas Ertl #include <sys/cdefs.h>
2873679edcSLukas Ertl __FBSDID("$FreeBSD$");
2973679edcSLukas Ertl 
3073679edcSLukas Ertl #include <sys/param.h>
3173679edcSLukas Ertl #include <sys/bio.h>
3273679edcSLukas Ertl #include <sys/conf.h>
3373679edcSLukas Ertl #include <sys/errno.h>
3473679edcSLukas Ertl #include <sys/kernel.h>
3573679edcSLukas Ertl #include <sys/kthread.h>
3673679edcSLukas Ertl #include <sys/libkern.h>
3773679edcSLukas Ertl #include <sys/lock.h>
3873679edcSLukas Ertl #include <sys/malloc.h>
3973679edcSLukas Ertl #include <sys/mutex.h>
4073679edcSLukas Ertl #include <sys/systm.h>
4173679edcSLukas Ertl 
4273679edcSLukas Ertl #include <geom/geom.h>
4373679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h>
4473679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h>
4573679edcSLukas Ertl #include <geom/vinum/geom_vinum.h>
4673679edcSLukas Ertl 
4773679edcSLukas Ertl int	gv_raid5_parity(struct gv_raid5_packet *);
4873679edcSLukas Ertl int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
4973679edcSLukas Ertl 
5073679edcSLukas Ertl struct gv_raid5_bit *
5173679edcSLukas Ertl gv_new_raid5_bit(void)
5273679edcSLukas Ertl {
5373679edcSLukas Ertl 	struct gv_raid5_bit *r;
5473679edcSLukas Ertl 	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
5573679edcSLukas Ertl 	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
5673679edcSLukas Ertl 	return (r);
5773679edcSLukas Ertl }
5873679edcSLukas Ertl 
5973679edcSLukas Ertl struct gv_raid5_packet *
6073679edcSLukas Ertl gv_new_raid5_packet(void)
6173679edcSLukas Ertl {
6273679edcSLukas Ertl 	struct gv_raid5_packet *wp;
6373679edcSLukas Ertl 
6473679edcSLukas Ertl 	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
6573679edcSLukas Ertl 	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
6673679edcSLukas Ertl 	wp->state = SETUP;
6773679edcSLukas Ertl 	wp->type = JUNK;
6873679edcSLukas Ertl 	TAILQ_INIT(&wp->bits);
6973679edcSLukas Ertl 
7073679edcSLukas Ertl 	return (wp);
7173679edcSLukas Ertl }
7273679edcSLukas Ertl 
7373679edcSLukas Ertl /*
7473679edcSLukas Ertl  * Check if the stripe that the work packet wants is already being used by
7573679edcSLukas Ertl  * some other work packet.
7673679edcSLukas Ertl  */
7773679edcSLukas Ertl int
7873679edcSLukas Ertl gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
7973679edcSLukas Ertl {
8073679edcSLukas Ertl 	struct gv_raid5_packet *wpa;
8173679edcSLukas Ertl 
8273679edcSLukas Ertl 	TAILQ_FOREACH(wpa, &sc->worklist, list) {
8373679edcSLukas Ertl 		if (wpa->lockbase == wp->lockbase) {
8473679edcSLukas Ertl 			if (wpa->bio == wp->bio)
8573679edcSLukas Ertl 				return (0);
8673679edcSLukas Ertl 			return (1);
8773679edcSLukas Ertl 		}
8873679edcSLukas Ertl 	}
8973679edcSLukas Ertl 	return (0);
9073679edcSLukas Ertl }
9173679edcSLukas Ertl 
9273679edcSLukas Ertl /*
9373679edcSLukas Ertl  * The "worker" thread that runs through the worklist and fires off the
9473679edcSLukas Ertl  * "subrequests" needed to fulfill a RAID5 read or write request.
9573679edcSLukas Ertl  */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	/* arg is the plex geom; its softc is our gv_plex. */
	gp = arg;
	p = gp->softc;

	/* worklist_mtx protects p->worklist for the whole scan loop. */
	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		/*
		 * _SAFE variant because FINISH handling below removes wp
		 * from the list while iterating.
		 */
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				/*
				 * Mark the packet busy before dropping the
				 * mutex so a concurrent scan skips it.
				 * NOTE(review): the list may be mutated by
				 * gv_raid5_done() callbacks while the mutex
				 * is dropped; wpt from the outer _SAFE loop
				 * may then be stale — presumably the
				 * `continue` plus restart logic papers over
				 * this, but confirm.
				 */
				wp->state = IO;
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				/* Account this packet's slice of the bio. */
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				/* Tear down the completed packet. */
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				/* Force another full scan of the list. */
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			/*
			 * Poll roughly every hz/100 ticks; wakeup(p) also
			 * ends the sleep early.  NOTE(review): the msleep
			 * return value is assigned but never checked —
			 * timeouts (EWOULDBLOCK) are expected here, but the
			 * unused `error` should be dropped or inspected.
			 */
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}
16973679edcSLukas Ertl 
17073679edcSLukas Ertl /* Final bio transaction to write out the parity data. */
17173679edcSLukas Ertl int
17273679edcSLukas Ertl gv_raid5_parity(struct gv_raid5_packet *wp)
17373679edcSLukas Ertl {
17473679edcSLukas Ertl 	struct bio *bp;
17573679edcSLukas Ertl 
17673679edcSLukas Ertl 	bp = g_new_bio();
17773679edcSLukas Ertl 	if (bp == NULL)
17873679edcSLukas Ertl 		return (ENOMEM);
17973679edcSLukas Ertl 
18073679edcSLukas Ertl 	wp->type = ISPARITY;
18173679edcSLukas Ertl 	bp->bio_cmd = BIO_WRITE;
18273679edcSLukas Ertl 	bp->bio_data = wp->buf;
18373679edcSLukas Ertl 	bp->bio_offset = wp->offset;
18473679edcSLukas Ertl 	bp->bio_length = wp->length;
18573679edcSLukas Ertl 	bp->bio_done = gv_raid5_done;
18673679edcSLukas Ertl 	bp->bio_caller1 = wp;
18773679edcSLukas Ertl 	bp->bio_caller2 = NULL;
18873679edcSLukas Ertl 	g_io_request(bp, wp->parity);
18973679edcSLukas Ertl 
19073679edcSLukas Ertl 	return (0);
19173679edcSLukas Ertl }
19273679edcSLukas Ertl 
/*
 * Completion handler for every subrequest bio of a RAID5 work packet.
 * Folds read data into the parity/reconstruction buffer as needed, frees
 * the per-subrequest state, and flips the packet to FINISH once the last
 * subrequest has completed (the worker thread then delivers the original
 * bio).  May run from g_io completion context.
 */
void
gv_raid5_done(struct bio *bp)
{
	struct bio *obp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct gv_raid5_bit *rbp;
	off_t i;
	int error;

	/* caller1/caller2 were set up in gv_build_raid5_req()/parity(). */
	wp = bp->bio_caller1;
	rbp = bp->bio_caller2;	/* NULL for the parity bio. */
	obp = wp->bio;		/* The original request from above. */
	gp = bp->bio_from->geom;
	p = gp->softc;

	/* One less active subrequest. */
	wp->active--;

	switch (obp->bio_cmd) {
	case BIO_READ:
		/* Degraded reads need to handle parity data. */
		if (wp->type == DEGRADED) {
			/* XOR this stripe into the reconstruction buffer. */
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* When we're finished copy back the data we want. */
			if (wp->active == 0)
				bcopy(wp->buf, wp->data, wp->length);
		}

		break;

	case BIO_WRITE:
		/* Handle the parity data, if needed. */
		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
			/* XOR this subrequest's data into the parity. */
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* Write out the parity data we calculated. */
			if (wp->active == 0) {
				/*
				 * Bump active for the parity bio before
				 * issuing it, so the packet cannot FINISH
				 * until the parity write also completes.
				 * NOTE(review): `error` is assigned but
				 * never checked — if gv_raid5_parity()
				 * returns ENOMEM, active stays at 1 forever
				 * and this packet never finishes.  Needs a
				 * failure path; confirm intended handling.
				 */
				wp->active++;
				error = gv_raid5_parity(wp);
			}
		}
		break;
	}

	g_destroy_bio(bp);

	/* Release the per-subrequest bookkeeping (parity bio has none). */
	if (rbp != NULL) {
		if (rbp->malloc == 1)
			g_free(rbp->buf);
		TAILQ_REMOVE(&wp->bits, rbp, list);
		g_free(rbp);
	}

	/* This request group is done. */
	if (wp->active == 0)
		wp->state = FINISH;
}
25573679edcSLukas Ertl }
25673679edcSLukas Ertl 
25773679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */
25873679edcSLukas Ertl int
25973679edcSLukas Ertl gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
26073679edcSLukas Ertl     long bcount, off_t boff)
26173679edcSLukas Ertl {
26273679edcSLukas Ertl 	struct g_geom *gp;
26373679edcSLukas Ertl 	struct gv_plex *p;
26473679edcSLukas Ertl 	struct gv_raid5_bit *rbp;
26573679edcSLukas Ertl 	struct gv_sd *broken, *original, *parity, *s;
26673679edcSLukas Ertl 	int i, psdno, sdno;
26773679edcSLukas Ertl 	off_t len_left, real_off, stripeend, stripeoff, stripestart;
26873679edcSLukas Ertl 
26973679edcSLukas Ertl 	gp = bp->bio_to->geom;
27073679edcSLukas Ertl 	p = gp->softc;
27173679edcSLukas Ertl 
27273679edcSLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
27373679edcSLukas Ertl 		return (ENXIO);
27473679edcSLukas Ertl 
27573679edcSLukas Ertl 	/* We are optimistic and assume that this request will be OK. */
27673679edcSLukas Ertl 	wp->type = NORMAL;
27773679edcSLukas Ertl 	original = parity = broken = NULL;
27873679edcSLukas Ertl 
27973679edcSLukas Ertl 	/* The number of the subdisk containing the parity stripe. */
28073679edcSLukas Ertl 	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
28173679edcSLukas Ertl 	    p->sdcount;
28273679edcSLukas Ertl 	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));
28373679edcSLukas Ertl 
28473679edcSLukas Ertl 	/* Offset of the start address from the start of the stripe. */
28573679edcSLukas Ertl 	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
28673679edcSLukas Ertl 	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));
28773679edcSLukas Ertl 
28873679edcSLukas Ertl 	/* The number of the subdisk where the stripe resides. */
28973679edcSLukas Ertl 	sdno = stripeoff / p->stripesize;
29073679edcSLukas Ertl 	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));
29173679edcSLukas Ertl 
29273679edcSLukas Ertl 	/* At or past parity subdisk. */
29373679edcSLukas Ertl 	if (sdno >= psdno)
29473679edcSLukas Ertl 		sdno++;
29573679edcSLukas Ertl 
29673679edcSLukas Ertl 	/* The offset of the stripe on this subdisk. */
29773679edcSLukas Ertl 	stripestart = (boff - stripeoff) / (p->sdcount - 1);
29873679edcSLukas Ertl 	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));
29973679edcSLukas Ertl 
30073679edcSLukas Ertl 	if (stripeoff >= p->stripesize)
30173679edcSLukas Ertl 		stripeoff -= p->stripesize;
30273679edcSLukas Ertl 
30373679edcSLukas Ertl 	/* The offset of the request on this subdisk. */
30473679edcSLukas Ertl 	real_off = stripestart + stripeoff;
30573679edcSLukas Ertl 
30673679edcSLukas Ertl 	stripeend = stripestart + p->stripesize;
30773679edcSLukas Ertl 	len_left = stripeend - real_off;
30873679edcSLukas Ertl 	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));
30973679edcSLukas Ertl 
31073679edcSLukas Ertl 	/* Find the right subdisks. */
31173679edcSLukas Ertl 	i = 0;
31273679edcSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
31373679edcSLukas Ertl 		if (i == sdno)
31473679edcSLukas Ertl 			original = s;
31573679edcSLukas Ertl 		if (i == psdno)
31673679edcSLukas Ertl 			parity = s;
31773679edcSLukas Ertl 		if (s->state != GV_SD_UP)
31873679edcSLukas Ertl 			broken = s;
31973679edcSLukas Ertl 		i++;
32073679edcSLukas Ertl 	}
32173679edcSLukas Ertl 
32273679edcSLukas Ertl 	if ((original == NULL) || (parity == NULL))
32373679edcSLukas Ertl 		return (ENXIO);
32473679edcSLukas Ertl 
32573679edcSLukas Ertl 	/* Our data stripe is missing. */
32673679edcSLukas Ertl 	if (original->state != GV_SD_UP)
32773679edcSLukas Ertl 		wp->type = DEGRADED;
32873679edcSLukas Ertl 	/* Our parity stripe is missing. */
32973679edcSLukas Ertl 	if (parity->state != GV_SD_UP) {
33073679edcSLukas Ertl 		/* We cannot take another failure if we're already degraded. */
33173679edcSLukas Ertl 		if (wp->type != NORMAL)
33273679edcSLukas Ertl 			return (ENXIO);
33373679edcSLukas Ertl 		else
33473679edcSLukas Ertl 			wp->type = NOPARITY;
33573679edcSLukas Ertl 	}
33673679edcSLukas Ertl 
33773679edcSLukas Ertl 	/*
33873679edcSLukas Ertl 	 * A combined write is necessary when the original data subdisk and the
33973679edcSLukas Ertl 	 * parity subdisk are both up, but one of the other subdisks isn't.
34073679edcSLukas Ertl 	 */
34173679edcSLukas Ertl 	if ((broken != NULL) && (broken != parity) && (broken != original))
34273679edcSLukas Ertl 		wp->type = COMBINED;
34373679edcSLukas Ertl 
34473679edcSLukas Ertl 	wp->offset = real_off;
34573679edcSLukas Ertl 	wp->length = (bcount <= len_left) ? bcount : len_left;
34673679edcSLukas Ertl 	wp->data = addr;
34773679edcSLukas Ertl 	wp->original = original->consumer;
34873679edcSLukas Ertl 	wp->parity = parity->consumer;
34973679edcSLukas Ertl 	wp->lockbase = stripestart;
35073679edcSLukas Ertl 
35173679edcSLukas Ertl 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
35273679edcSLukas Ertl 
35373679edcSLukas Ertl 	switch (bp->bio_cmd) {
35473679edcSLukas Ertl 	case BIO_READ:
35573679edcSLukas Ertl 		/*
35673679edcSLukas Ertl 		 * For a degraded read we need to read in all stripes except
35773679edcSLukas Ertl 		 * the broken one plus the parity stripe and then recalculate
35873679edcSLukas Ertl 		 * the desired data.
35973679edcSLukas Ertl 		 */
36073679edcSLukas Ertl 		if (wp->type == DEGRADED) {
36173679edcSLukas Ertl 			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
36273679edcSLukas Ertl 			wp->bufmalloc = 1;
36373679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
36473679edcSLukas Ertl 				/* Skip the broken subdisk. */
36573679edcSLukas Ertl 				if (s == broken)
36673679edcSLukas Ertl 					continue;
36773679edcSLukas Ertl 				rbp = gv_new_raid5_bit();
36873679edcSLukas Ertl 				rbp->consumer = s->consumer;
36973679edcSLukas Ertl 				rbp->bio = g_new_bio();
37073679edcSLukas Ertl 				if (rbp->bio == NULL)
37173679edcSLukas Ertl 					return (ENOMEM);
37273679edcSLukas Ertl 				rbp->buf = g_malloc(wp->length,
37373679edcSLukas Ertl 					M_WAITOK | M_ZERO);
37473679edcSLukas Ertl 				rbp->malloc = 1;
37573679edcSLukas Ertl 				rbp->bio->bio_cmd = BIO_READ;
37673679edcSLukas Ertl 				rbp->bio->bio_offset = wp->offset;
37773679edcSLukas Ertl 				rbp->bio->bio_length = wp->length;
37873679edcSLukas Ertl 				rbp->bio->bio_data = rbp->buf;
37973679edcSLukas Ertl 				rbp->bio->bio_done = gv_raid5_done;
38073679edcSLukas Ertl 				rbp->bio->bio_caller1 = wp;
38173679edcSLukas Ertl 				rbp->bio->bio_caller2 = rbp;
38273679edcSLukas Ertl 				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
38373679edcSLukas Ertl 				wp->active++;
38473679edcSLukas Ertl 				wp->rqcount++;
38573679edcSLukas Ertl 			}
38673679edcSLukas Ertl 
38773679edcSLukas Ertl 		/* A normal read can be fulfilled with the original subdisk. */
38873679edcSLukas Ertl 		} else {
38973679edcSLukas Ertl 			rbp = gv_new_raid5_bit();
39073679edcSLukas Ertl 			rbp->consumer = wp->original;
39173679edcSLukas Ertl 			rbp->bio = g_new_bio();
39273679edcSLukas Ertl 			if (rbp->bio == NULL)
39373679edcSLukas Ertl 				return (ENOMEM);
39473679edcSLukas Ertl 			rbp->bio->bio_cmd = BIO_READ;
39573679edcSLukas Ertl 			rbp->bio->bio_offset = wp->offset;
39673679edcSLukas Ertl 			rbp->bio->bio_length = wp->length;
39773679edcSLukas Ertl 			rbp->buf = addr;
39873679edcSLukas Ertl 			rbp->bio->bio_data = rbp->buf;
39973679edcSLukas Ertl 			rbp->bio->bio_done = gv_raid5_done;
40073679edcSLukas Ertl 			rbp->bio->bio_caller1 = wp;
40173679edcSLukas Ertl 			rbp->bio->bio_caller2 = rbp;
40273679edcSLukas Ertl 			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
40373679edcSLukas Ertl 			wp->active++;
40473679edcSLukas Ertl 			wp->rqcount++;
40573679edcSLukas Ertl 		}
40673679edcSLukas Ertl 		if (wp->type != COMBINED)
40773679edcSLukas Ertl 			wp->lockbase = -1;
40873679edcSLukas Ertl 		break;
40973679edcSLukas Ertl 
41073679edcSLukas Ertl 	case BIO_WRITE:
41173679edcSLukas Ertl 		/*
41273679edcSLukas Ertl 		 * A degraded write means we cannot write to the original data
41373679edcSLukas Ertl 		 * subdisk.  Thus we need to read in all valid stripes,
41473679edcSLukas Ertl 		 * recalculate the parity from the original data, and then
41573679edcSLukas Ertl 		 * write the parity stripe back out.
41673679edcSLukas Ertl 		 */
41773679edcSLukas Ertl 		if (wp->type == DEGRADED) {
41873679edcSLukas Ertl 			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
41973679edcSLukas Ertl 			wp->bufmalloc = 1;
42073679edcSLukas Ertl 
42173679edcSLukas Ertl 			/* Copy the original data. */
42273679edcSLukas Ertl 			bcopy(wp->data, wp->buf, wp->length);
42373679edcSLukas Ertl 
42473679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
42573679edcSLukas Ertl 				/* Skip the broken and the parity subdisk. */
42673679edcSLukas Ertl 				if ((s == broken) ||
42773679edcSLukas Ertl 				    (s->consumer == wp->parity))
42873679edcSLukas Ertl 					continue;
42973679edcSLukas Ertl 
43073679edcSLukas Ertl 				rbp = gv_new_raid5_bit();
43173679edcSLukas Ertl 				rbp->consumer = s->consumer;
43273679edcSLukas Ertl 				rbp->bio = g_new_bio();
43373679edcSLukas Ertl 				if (rbp->bio == NULL)
43473679edcSLukas Ertl 					return (ENOMEM);
43573679edcSLukas Ertl 				rbp->buf = g_malloc(wp->length,
43673679edcSLukas Ertl 				    M_WAITOK | M_ZERO);
43773679edcSLukas Ertl 				rbp->malloc = 1;
43873679edcSLukas Ertl 				rbp->bio->bio_cmd = BIO_READ;
43973679edcSLukas Ertl 				rbp->bio->bio_data = rbp->buf;
44073679edcSLukas Ertl 				rbp->bio->bio_offset = wp->offset;
44173679edcSLukas Ertl 				rbp->bio->bio_length = wp->length;
44273679edcSLukas Ertl 				rbp->bio->bio_done = gv_raid5_done;
44373679edcSLukas Ertl 				rbp->bio->bio_caller1 = wp;
44473679edcSLukas Ertl 				rbp->bio->bio_caller2 = rbp;
44573679edcSLukas Ertl 				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
44673679edcSLukas Ertl 				wp->active++;
44773679edcSLukas Ertl 				wp->rqcount++;
44873679edcSLukas Ertl 			}
44973679edcSLukas Ertl 
45073679edcSLukas Ertl 		/*
45173679edcSLukas Ertl 		 * When we don't have the parity stripe we just write out the
45273679edcSLukas Ertl 		 * data.
45373679edcSLukas Ertl 		 */
45473679edcSLukas Ertl 		} else if (wp->type == NOPARITY) {
45573679edcSLukas Ertl 			rbp = gv_new_raid5_bit();
45673679edcSLukas Ertl 			rbp->consumer = wp->original;
45773679edcSLukas Ertl 			rbp->bio = g_new_bio();
45873679edcSLukas Ertl 			if (rbp->bio == NULL)
45973679edcSLukas Ertl 				return (ENOMEM);
46073679edcSLukas Ertl 			rbp->bio->bio_cmd = BIO_WRITE;
46173679edcSLukas Ertl 			rbp->bio->bio_offset = wp->offset;
46273679edcSLukas Ertl 			rbp->bio->bio_length = wp->length;
46373679edcSLukas Ertl 			rbp->bio->bio_data = addr;
46473679edcSLukas Ertl 			rbp->bio->bio_done = gv_raid5_done;
46573679edcSLukas Ertl 			rbp->bio->bio_caller1 = wp;
46673679edcSLukas Ertl 			rbp->bio->bio_caller2 = rbp;
46773679edcSLukas Ertl 			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
46873679edcSLukas Ertl 			wp->active++;
46973679edcSLukas Ertl 			wp->rqcount++;
47073679edcSLukas Ertl 
47173679edcSLukas Ertl 		/*
47273679edcSLukas Ertl 		 * A combined write means that our data subdisk and the parity
47373679edcSLukas Ertl 		 * subdisks are both up, but another subdisk isn't.  We need to
47473679edcSLukas Ertl 		 * read all valid stripes including the parity to recalculate
47573679edcSLukas Ertl 		 * the data of the stripe that is missing.  Then we write our
47673679edcSLukas Ertl 		 * original data, and together with the other data stripes
47773679edcSLukas Ertl 		 * recalculate the parity again.
47873679edcSLukas Ertl 		 */
47973679edcSLukas Ertl 		} else if (wp->type == COMBINED) {
48073679edcSLukas Ertl 			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
48173679edcSLukas Ertl 			wp->bufmalloc = 1;
48273679edcSLukas Ertl 
48373679edcSLukas Ertl 			/* Get the data from all subdisks. */
48473679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
48573679edcSLukas Ertl 				/* Skip the broken subdisk. */
48673679edcSLukas Ertl 				if (s == broken)
48773679edcSLukas Ertl 					continue;
48873679edcSLukas Ertl 
48973679edcSLukas Ertl 				rbp = gv_new_raid5_bit();
49073679edcSLukas Ertl 				rbp->consumer = s->consumer;
49173679edcSLukas Ertl 				rbp->bio = g_new_bio();
49273679edcSLukas Ertl 				if (rbp->bio == NULL)
49373679edcSLukas Ertl 					return (ENOMEM);
49473679edcSLukas Ertl 				rbp->bio->bio_cmd = BIO_READ;
49573679edcSLukas Ertl 				rbp->buf = g_malloc(wp->length,
49673679edcSLukas Ertl 				    M_WAITOK | M_ZERO);
49773679edcSLukas Ertl 				rbp->malloc = 1;
49873679edcSLukas Ertl 				rbp->bio->bio_data = rbp->buf;
49973679edcSLukas Ertl 				rbp->bio->bio_offset = wp->offset;
50073679edcSLukas Ertl 				rbp->bio->bio_length = wp->length;
50173679edcSLukas Ertl 				rbp->bio->bio_done = gv_raid5_done;
50273679edcSLukas Ertl 				rbp->bio->bio_caller1 = wp;
50373679edcSLukas Ertl 				rbp->bio->bio_caller2 = rbp;
50473679edcSLukas Ertl 				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
50573679edcSLukas Ertl 				wp->active++;
50673679edcSLukas Ertl 				wp->rqcount++;
50773679edcSLukas Ertl 			}
50873679edcSLukas Ertl 
50973679edcSLukas Ertl 			/* Write the original data. */
51073679edcSLukas Ertl 			rbp = gv_new_raid5_bit();
51173679edcSLukas Ertl 			rbp->consumer = wp->original;
51273679edcSLukas Ertl 			rbp->buf = addr;
51373679edcSLukas Ertl 			rbp->bio = g_new_bio();
51473679edcSLukas Ertl 			if (rbp->bio == NULL)
51573679edcSLukas Ertl 				return (ENOMEM);
51673679edcSLukas Ertl 			rbp->bio->bio_cmd = BIO_WRITE;
51773679edcSLukas Ertl 			rbp->bio->bio_data = rbp->buf;
51873679edcSLukas Ertl 			rbp->bio->bio_offset = wp->offset;
51973679edcSLukas Ertl 			rbp->bio->bio_length = wp->length;
52073679edcSLukas Ertl 			rbp->bio->bio_done = gv_raid5_done;
52173679edcSLukas Ertl 			rbp->bio->bio_caller1 = wp;
52273679edcSLukas Ertl 			rbp->bio->bio_caller2 = rbp;
52373679edcSLukas Ertl 			/*
52473679edcSLukas Ertl 			 * Insert at the tail, because we want to read the old
52573679edcSLukas Ertl 			 * data first.
52673679edcSLukas Ertl 			 */
52773679edcSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
52873679edcSLukas Ertl 			wp->active++;
52973679edcSLukas Ertl 			wp->rqcount++;
53073679edcSLukas Ertl 
53173679edcSLukas Ertl 			/* Get the rest of the data again. */
53273679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
53373679edcSLukas Ertl 				/*
53473679edcSLukas Ertl 				 * Skip the broken subdisk, the parity, and the
53573679edcSLukas Ertl 				 * one we just wrote.
53673679edcSLukas Ertl 				 */
53773679edcSLukas Ertl 				if ((s == broken) ||
53873679edcSLukas Ertl 				    (s->consumer == wp->parity) ||
53973679edcSLukas Ertl 				    (s->consumer == wp->original))
54073679edcSLukas Ertl 					continue;
54173679edcSLukas Ertl 				rbp = gv_new_raid5_bit();
54273679edcSLukas Ertl 				rbp->consumer = s->consumer;
54373679edcSLukas Ertl 				rbp->bio = g_new_bio();
54473679edcSLukas Ertl 				if (rbp->bio == NULL)
54573679edcSLukas Ertl 					return (ENOMEM);
54673679edcSLukas Ertl 				rbp->bio->bio_cmd = BIO_READ;
54773679edcSLukas Ertl 				rbp->buf = g_malloc(wp->length,
54873679edcSLukas Ertl 				    M_WAITOK | M_ZERO);
54973679edcSLukas Ertl 				rbp->malloc = 1;
55073679edcSLukas Ertl 				rbp->bio->bio_data = rbp->buf;
55173679edcSLukas Ertl 				rbp->bio->bio_offset = wp->offset;
55273679edcSLukas Ertl 				rbp->bio->bio_length = wp->length;
55373679edcSLukas Ertl 				rbp->bio->bio_done = gv_raid5_done;
55473679edcSLukas Ertl 				rbp->bio->bio_caller1 = wp;
55573679edcSLukas Ertl 				rbp->bio->bio_caller2 = rbp;
55673679edcSLukas Ertl 				/*
55773679edcSLukas Ertl 				 * Again, insert at the tail to keep correct
55873679edcSLukas Ertl 				 * order.
55973679edcSLukas Ertl 				 */
56073679edcSLukas Ertl 				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
56173679edcSLukas Ertl 				wp->active++;
56273679edcSLukas Ertl 				wp->rqcount++;
56373679edcSLukas Ertl 			}
56473679edcSLukas Ertl 
56573679edcSLukas Ertl 
56673679edcSLukas Ertl 		/*
56773679edcSLukas Ertl 		 * A normal write request goes to the original subdisk, then we
56873679edcSLukas Ertl 		 * read in all other stripes, recalculate the parity and write
56973679edcSLukas Ertl 		 * out the parity again.
57073679edcSLukas Ertl 		 */
57173679edcSLukas Ertl 		} else {
57273679edcSLukas Ertl 			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
57373679edcSLukas Ertl 			wp->bufmalloc = 1;
57473679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
57573679edcSLukas Ertl 				/* Skip the parity stripe. */
57673679edcSLukas Ertl 				if (s->consumer == wp->parity)
57773679edcSLukas Ertl 					continue;
57873679edcSLukas Ertl 
57973679edcSLukas Ertl 				rbp = gv_new_raid5_bit();
58073679edcSLukas Ertl 				rbp->consumer = s->consumer;
58173679edcSLukas Ertl 				rbp->bio = g_new_bio();
58273679edcSLukas Ertl 				if (rbp->bio == NULL)
58373679edcSLukas Ertl 					return (ENOMEM);
58473679edcSLukas Ertl 				/*
58573679edcSLukas Ertl 				 * The data for the original stripe is written,
58673679edcSLukas Ertl 				 * the others need to be read in for the parity
58773679edcSLukas Ertl 				 * calculation.
58873679edcSLukas Ertl 				 */
58973679edcSLukas Ertl 				if (s->consumer == wp->original) {
59073679edcSLukas Ertl 					rbp->bio->bio_cmd = BIO_WRITE;
59173679edcSLukas Ertl 					rbp->buf = addr;
59273679edcSLukas Ertl 				} else {
59373679edcSLukas Ertl 					rbp->bio->bio_cmd = BIO_READ;
59473679edcSLukas Ertl 					rbp->buf = g_malloc(wp->length,
59573679edcSLukas Ertl 					    M_WAITOK | M_ZERO);
59673679edcSLukas Ertl 					rbp->malloc = 1;
59773679edcSLukas Ertl 				}
59873679edcSLukas Ertl 				rbp->bio->bio_data = rbp->buf;
59973679edcSLukas Ertl 				rbp->bio->bio_offset = wp->offset;
60073679edcSLukas Ertl 				rbp->bio->bio_length = wp->length;
60173679edcSLukas Ertl 				rbp->bio->bio_done = gv_raid5_done;
60273679edcSLukas Ertl 				rbp->bio->bio_caller1 = wp;
60373679edcSLukas Ertl 				rbp->bio->bio_caller2 = rbp;
60473679edcSLukas Ertl 				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
60573679edcSLukas Ertl 				wp->active++;
60673679edcSLukas Ertl 				wp->rqcount++;
60773679edcSLukas Ertl 			}
60873679edcSLukas Ertl 		}
60973679edcSLukas Ertl 		break;
61073679edcSLukas Ertl 	default:
61173679edcSLukas Ertl 		return (EINVAL);
61273679edcSLukas Ertl 	}
61373679edcSLukas Ertl 
61473679edcSLukas Ertl 	wp->state = VALID;
61573679edcSLukas Ertl 	return (0);
61673679edcSLukas Ertl }
617