xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision 87bb53cb)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2007 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
2873679edcSLukas Ertl 
2973679edcSLukas Ertl #include <sys/cdefs.h>
3073679edcSLukas Ertl __FBSDID("$FreeBSD$");
3173679edcSLukas Ertl 
3273679edcSLukas Ertl #include <sys/param.h>
3373679edcSLukas Ertl #include <sys/bio.h>
3473679edcSLukas Ertl #include <sys/lock.h>
3573679edcSLukas Ertl #include <sys/malloc.h>
3673679edcSLukas Ertl #include <sys/systm.h>
3773679edcSLukas Ertl 
3873679edcSLukas Ertl #include <geom/geom.h>
39ac03832eSConrad Meyer #include <geom/geom_dbg.h>
4073679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h>
4173679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h>
4273679edcSLukas Ertl #include <geom/vinum/geom_vinum.h>
4373679edcSLukas Ertl 
44c0b9797aSUlf Lilleengen static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
45c0b9797aSUlf Lilleengen 			    off_t *, off_t *, int *, int *, int);
46c0b9797aSUlf Lilleengen static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
47c0b9797aSUlf Lilleengen 			    struct gv_raid5_packet *, caddr_t, int);
48c0b9797aSUlf Lilleengen static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
49c0b9797aSUlf Lilleengen 		    struct bio *, caddr_t, off_t, off_t, int *);
50c0b9797aSUlf Lilleengen static int	gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
51c0b9797aSUlf Lilleengen 		    struct bio *, caddr_t, off_t, off_t);
52c0b9797aSUlf Lilleengen static int	gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
53c0b9797aSUlf Lilleengen 		    struct bio *, caddr_t, off_t, off_t);
54c0b9797aSUlf Lilleengen 
55c0b9797aSUlf Lilleengen struct gv_raid5_packet *
56c0b9797aSUlf Lilleengen gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
57c0b9797aSUlf Lilleengen     off_t bcount)
58c0b9797aSUlf Lilleengen {
59c0b9797aSUlf Lilleengen 	struct bio *cbp;
60c0b9797aSUlf Lilleengen 	struct gv_raid5_packet *wp, *wp2;
61c0b9797aSUlf Lilleengen 	struct gv_bioq *bq, *bq2;
62c0b9797aSUlf Lilleengen 	int err, delay;
63c0b9797aSUlf Lilleengen 
64c0b9797aSUlf Lilleengen 	delay = 0;
65c0b9797aSUlf Lilleengen 	wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
66c0b9797aSUlf Lilleengen 	wp->bio = bp;
67c0b9797aSUlf Lilleengen 	wp->waiting = NULL;
68c0b9797aSUlf Lilleengen 	wp->parity = NULL;
69c0b9797aSUlf Lilleengen 	TAILQ_INIT(&wp->bits);
70c0b9797aSUlf Lilleengen 
71d8d015cdSUlf Lilleengen 	if (bp->bio_pflags & GV_BIO_REBUILD)
72c0b9797aSUlf Lilleengen 		err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
73d8d015cdSUlf Lilleengen 	else if (bp->bio_pflags & GV_BIO_CHECK)
74c0b9797aSUlf Lilleengen 		err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
75c0b9797aSUlf Lilleengen 	else
76c0b9797aSUlf Lilleengen 		err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
77c0b9797aSUlf Lilleengen 
78c0b9797aSUlf Lilleengen 	/* Means we have a delayed request. */
79c0b9797aSUlf Lilleengen 	if (delay) {
80c0b9797aSUlf Lilleengen 		g_free(wp);
81c0b9797aSUlf Lilleengen 		return (NULL);
82c0b9797aSUlf Lilleengen 	}
83c0b9797aSUlf Lilleengen 
84c0b9797aSUlf Lilleengen 	/*
85c0b9797aSUlf Lilleengen 	 * Building the sub-request failed, we probably need to clean up a lot.
86c0b9797aSUlf Lilleengen 	 */
87c0b9797aSUlf Lilleengen 	if (err) {
88c0b9797aSUlf Lilleengen 		G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
89c0b9797aSUlf Lilleengen 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
90c0b9797aSUlf Lilleengen 			TAILQ_REMOVE(&wp->bits, bq, queue);
91c0b9797aSUlf Lilleengen 			g_free(bq);
92c0b9797aSUlf Lilleengen 		}
93c0b9797aSUlf Lilleengen 		if (wp->waiting != NULL) {
94c0b9797aSUlf Lilleengen 			if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
95c0b9797aSUlf Lilleengen 				g_free(wp->waiting->bio_data);
96a29df733SAlexander Motin 			gv_drive_done(wp->waiting->bio_caller1);
97c0b9797aSUlf Lilleengen 			g_destroy_bio(wp->waiting);
98c0b9797aSUlf Lilleengen 		}
99c0b9797aSUlf Lilleengen 		if (wp->parity != NULL) {
100c0b9797aSUlf Lilleengen 			if (wp->parity->bio_cflags & GV_BIO_MALLOC)
101c0b9797aSUlf Lilleengen 				g_free(wp->parity->bio_data);
102a29df733SAlexander Motin 			gv_drive_done(wp->parity->bio_caller1);
103c0b9797aSUlf Lilleengen 			g_destroy_bio(wp->parity);
104c0b9797aSUlf Lilleengen 		}
105c0b9797aSUlf Lilleengen 		g_free(wp);
106c0b9797aSUlf Lilleengen 
107c0b9797aSUlf Lilleengen 		TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
108c0b9797aSUlf Lilleengen 			if (wp->bio != bp)
109c0b9797aSUlf Lilleengen 				continue;
110c0b9797aSUlf Lilleengen 
111c0b9797aSUlf Lilleengen 			TAILQ_REMOVE(&p->packets, wp, list);
112c0b9797aSUlf Lilleengen 			TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
113c0b9797aSUlf Lilleengen 				TAILQ_REMOVE(&wp->bits, bq, queue);
114c0b9797aSUlf Lilleengen 				g_free(bq);
115c0b9797aSUlf Lilleengen 			}
116c0b9797aSUlf Lilleengen 			g_free(wp);
117c0b9797aSUlf Lilleengen 		}
118c0b9797aSUlf Lilleengen 
119c0b9797aSUlf Lilleengen 		cbp = bioq_takefirst(p->bqueue);
120c0b9797aSUlf Lilleengen 		while (cbp != NULL) {
121c0b9797aSUlf Lilleengen 			if (cbp->bio_cflags & GV_BIO_MALLOC)
122c0b9797aSUlf Lilleengen 				g_free(cbp->bio_data);
123a29df733SAlexander Motin 			gv_drive_done(cbp->bio_caller1);
124c0b9797aSUlf Lilleengen 			g_destroy_bio(cbp);
125c0b9797aSUlf Lilleengen 			cbp = bioq_takefirst(p->bqueue);
126c0b9797aSUlf Lilleengen 		}
127c0b9797aSUlf Lilleengen 
128c0b9797aSUlf Lilleengen 		/* If internal, stop and reset state. */
129d8d015cdSUlf Lilleengen 		if (bp->bio_pflags & GV_BIO_INTERNAL) {
130d8d015cdSUlf Lilleengen 			if (bp->bio_pflags & GV_BIO_MALLOC)
1311d8dfc60SUlf Lilleengen 				g_free(bp->bio_data);
132c0b9797aSUlf Lilleengen 			g_destroy_bio(bp);
133c0b9797aSUlf Lilleengen 			/* Reset flags. */
134c0b9797aSUlf Lilleengen 			p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
135c0b9797aSUlf Lilleengen 			    GV_PLEX_GROWING);
136c0b9797aSUlf Lilleengen 			return (NULL);
137c0b9797aSUlf Lilleengen 		}
138c0b9797aSUlf Lilleengen 		g_io_deliver(bp, err);
139c0b9797aSUlf Lilleengen 		return (NULL);
140c0b9797aSUlf Lilleengen 	}
141c0b9797aSUlf Lilleengen 
142c0b9797aSUlf Lilleengen 	return (wp);
143c0b9797aSUlf Lilleengen }
144fb4e65d0SLukas Ertl 
14573679edcSLukas Ertl /*
14673679edcSLukas Ertl  * Check if the stripe that the work packet wants is already being used by
14773679edcSLukas Ertl  * some other work packet.
14873679edcSLukas Ertl  */
14973679edcSLukas Ertl int
15067e3ab6eSLukas Ertl gv_stripe_active(struct gv_plex *p, struct bio *bp)
15173679edcSLukas Ertl {
15267e3ab6eSLukas Ertl 	struct gv_raid5_packet *wp, *owp;
15367e3ab6eSLukas Ertl 	int overlap;
15473679edcSLukas Ertl 
155c0b9797aSUlf Lilleengen 	wp = bp->bio_caller2;
15667e3ab6eSLukas Ertl 	if (wp->lockbase == -1)
15773679edcSLukas Ertl 		return (0);
15873679edcSLukas Ertl 
15967e3ab6eSLukas Ertl 	overlap = 0;
16067e3ab6eSLukas Ertl 	TAILQ_FOREACH(owp, &p->packets, list) {
16167e3ab6eSLukas Ertl 		if (owp == wp)
16273679edcSLukas Ertl 			break;
16367e3ab6eSLukas Ertl 		if ((wp->lockbase >= owp->lockbase) &&
16467e3ab6eSLukas Ertl 		    (wp->lockbase <= owp->lockbase + owp->length)) {
16567e3ab6eSLukas Ertl 			overlap++;
16673679edcSLukas Ertl 			break;
16773679edcSLukas Ertl 		}
16867e3ab6eSLukas Ertl 		if ((wp->lockbase <= owp->lockbase) &&
16967e3ab6eSLukas Ertl 		    (wp->lockbase + wp->length >= owp->lockbase)) {
17067e3ab6eSLukas Ertl 			overlap++;
17167e3ab6eSLukas Ertl 			break;
17267e3ab6eSLukas Ertl 		}
17367e3ab6eSLukas Ertl 	}
17473679edcSLukas Ertl 
17567e3ab6eSLukas Ertl 	return (overlap);
17673679edcSLukas Ertl }
17773679edcSLukas Ertl 
178c0b9797aSUlf Lilleengen static int
179c0b9797aSUlf Lilleengen gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180fb5885afSLukas Ertl     caddr_t addr, off_t boff, off_t bcount)
181fb5885afSLukas Ertl {
182fb5885afSLukas Ertl 	struct gv_sd *parity, *s;
183fb5885afSLukas Ertl 	struct gv_bioq *bq;
184c0b9797aSUlf Lilleengen 	struct bio *cbp;
185fb5885afSLukas Ertl 	int i, psdno;
186fb5885afSLukas Ertl 	off_t real_len, real_off;
187fb5885afSLukas Ertl 
188fb5885afSLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
189fb5885afSLukas Ertl 		return (ENXIO);
190fb5885afSLukas Ertl 
191c0b9797aSUlf Lilleengen 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
192fb5885afSLukas Ertl 
193fb5885afSLukas Ertl 	/* Find the right subdisk. */
194fb5885afSLukas Ertl 	parity = NULL;
195fb5885afSLukas Ertl 	i = 0;
196fb5885afSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
197fb5885afSLukas Ertl 		if (i == psdno) {
198fb5885afSLukas Ertl 			parity = s;
199fb5885afSLukas Ertl 			break;
200fb5885afSLukas Ertl 		}
201fb5885afSLukas Ertl 		i++;
202fb5885afSLukas Ertl 	}
203fb5885afSLukas Ertl 
204fb5885afSLukas Ertl 	/* Parity stripe not found. */
205fb5885afSLukas Ertl 	if (parity == NULL)
206fb5885afSLukas Ertl 		return (ENXIO);
207fb5885afSLukas Ertl 
208fb5885afSLukas Ertl 	if (parity->state != GV_SD_UP)
209fb5885afSLukas Ertl 		return (ENXIO);
210fb5885afSLukas Ertl 
211fb5885afSLukas Ertl 	wp->length = real_len;
212fb5885afSLukas Ertl 	wp->data = addr;
213fb5885afSLukas Ertl 	wp->lockbase = real_off;
214fb5885afSLukas Ertl 
215fb5885afSLukas Ertl 	/* Read all subdisks. */
216fb5885afSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
217fb5885afSLukas Ertl 		/* Skip the parity subdisk. */
218fb5885afSLukas Ertl 		if (s == parity)
219fb5885afSLukas Ertl 			continue;
220c0b9797aSUlf Lilleengen 		/* Skip growing subdisks. */
221c0b9797aSUlf Lilleengen 		if (s->flags & GV_SD_GROW)
222c0b9797aSUlf Lilleengen 			continue;
223fb5885afSLukas Ertl 
224c0b9797aSUlf Lilleengen 		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
225fb5885afSLukas Ertl 		if (cbp == NULL)
226fb5885afSLukas Ertl 			return (ENOMEM);
227fb5885afSLukas Ertl 		cbp->bio_cmd = BIO_READ;
228fb5885afSLukas Ertl 
229c0b9797aSUlf Lilleengen 		bioq_insert_tail(p->bqueue, cbp);
230fb5885afSLukas Ertl 
231fb5885afSLukas Ertl 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
232fb5885afSLukas Ertl 		bq->bp = cbp;
233fb5885afSLukas Ertl 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
234fb5885afSLukas Ertl 	}
235fb5885afSLukas Ertl 
236fb5885afSLukas Ertl 	/* Read the parity data. */
237c0b9797aSUlf Lilleengen 	cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
238fb5885afSLukas Ertl 	if (cbp == NULL)
239fb5885afSLukas Ertl 		return (ENOMEM);
240fb5885afSLukas Ertl 	cbp->bio_cmd = BIO_READ;
241fb5885afSLukas Ertl 	wp->waiting = cbp;
242fb5885afSLukas Ertl 
243fb5885afSLukas Ertl 	/*
244fb5885afSLukas Ertl 	 * In case we want to rebuild the parity, create an extra BIO to write
245fb5885afSLukas Ertl 	 * it out.  It also acts as buffer for the XOR operations.
246fb5885afSLukas Ertl 	 */
247c0b9797aSUlf Lilleengen 	cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
248fb5885afSLukas Ertl 	if (cbp == NULL)
249fb5885afSLukas Ertl 		return (ENOMEM);
250fb5885afSLukas Ertl 	wp->parity = cbp;
251fb5885afSLukas Ertl 
252fb5885afSLukas Ertl 	return (0);
253fb5885afSLukas Ertl }
254fb5885afSLukas Ertl 
255fb5885afSLukas Ertl /* Rebuild a degraded RAID5 plex. */
256c0b9797aSUlf Lilleengen static int
257c0b9797aSUlf Lilleengen gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
258c3aadfb9SLukas Ertl     caddr_t addr, off_t boff, off_t bcount)
259c3aadfb9SLukas Ertl {
260c3aadfb9SLukas Ertl 	struct gv_sd *broken, *s;
261c3aadfb9SLukas Ertl 	struct gv_bioq *bq;
262c0b9797aSUlf Lilleengen 	struct bio *cbp;
263fb4e65d0SLukas Ertl 	off_t real_len, real_off;
264c3aadfb9SLukas Ertl 
265c3aadfb9SLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
266c3aadfb9SLukas Ertl 		return (ENXIO);
267c3aadfb9SLukas Ertl 
268c0b9797aSUlf Lilleengen 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
269c3aadfb9SLukas Ertl 
270c3aadfb9SLukas Ertl 	/* Find the right subdisk. */
271c3aadfb9SLukas Ertl 	broken = NULL;
272c3aadfb9SLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
273c3aadfb9SLukas Ertl 		if (s->state != GV_SD_UP)
274c3aadfb9SLukas Ertl 			broken = s;
275c3aadfb9SLukas Ertl 	}
276c3aadfb9SLukas Ertl 
277fb5885afSLukas Ertl 	/* Broken stripe not found. */
278c3aadfb9SLukas Ertl 	if (broken == NULL)
279c3aadfb9SLukas Ertl 		return (ENXIO);
280c3aadfb9SLukas Ertl 
281c3aadfb9SLukas Ertl 	switch (broken->state) {
282c3aadfb9SLukas Ertl 	case GV_SD_UP:
283c3aadfb9SLukas Ertl 		return (EINVAL);
284c3aadfb9SLukas Ertl 
285c3aadfb9SLukas Ertl 	case GV_SD_STALE:
286d8d015cdSUlf Lilleengen 		if (!(bp->bio_pflags & GV_BIO_REBUILD))
287c3aadfb9SLukas Ertl 			return (ENXIO);
288c3aadfb9SLukas Ertl 
28986b3c6f5SUlf Lilleengen 		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
290c3aadfb9SLukas Ertl 		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
291c0b9797aSUlf Lilleengen 		/* Set this bit now, but should be set at end. */
292c0b9797aSUlf Lilleengen 		broken->flags |= GV_SD_CANGOUP;
293c3aadfb9SLukas Ertl 		break;
294c3aadfb9SLukas Ertl 
295c3aadfb9SLukas Ertl 	case GV_SD_REVIVING:
296c3aadfb9SLukas Ertl 		break;
297c3aadfb9SLukas Ertl 
298c3aadfb9SLukas Ertl 	default:
299c3aadfb9SLukas Ertl 		/* All other subdisk states mean it's not accessible. */
300c3aadfb9SLukas Ertl 		return (ENXIO);
301c3aadfb9SLukas Ertl 	}
302c3aadfb9SLukas Ertl 
303c3aadfb9SLukas Ertl 	wp->length = real_len;
304c3aadfb9SLukas Ertl 	wp->data = addr;
305c3aadfb9SLukas Ertl 	wp->lockbase = real_off;
306c3aadfb9SLukas Ertl 
307fb4e65d0SLukas Ertl 	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
308c3aadfb9SLukas Ertl 
309c3aadfb9SLukas Ertl 	/* Read all subdisks. */
310c3aadfb9SLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
311c3aadfb9SLukas Ertl 		/* Skip the broken subdisk. */
312c3aadfb9SLukas Ertl 		if (s == broken)
313c3aadfb9SLukas Ertl 			continue;
314c3aadfb9SLukas Ertl 
315c0b9797aSUlf Lilleengen 		/* Skip growing subdisks. */
316c0b9797aSUlf Lilleengen 		if (s->flags & GV_SD_GROW)
317c0b9797aSUlf Lilleengen 			continue;
318c0b9797aSUlf Lilleengen 
319c0b9797aSUlf Lilleengen 		cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
320c3aadfb9SLukas Ertl 		if (cbp == NULL)
321c3aadfb9SLukas Ertl 			return (ENOMEM);
322c3aadfb9SLukas Ertl 		cbp->bio_cmd = BIO_READ;
323c3aadfb9SLukas Ertl 
324c0b9797aSUlf Lilleengen 		bioq_insert_tail(p->bqueue, cbp);
325c3aadfb9SLukas Ertl 
326c3aadfb9SLukas Ertl 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
327c3aadfb9SLukas Ertl 		bq->bp = cbp;
328c3aadfb9SLukas Ertl 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
329c3aadfb9SLukas Ertl 	}
330c3aadfb9SLukas Ertl 
331c3aadfb9SLukas Ertl 	/* Write the parity data. */
332c0b9797aSUlf Lilleengen 	cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
333c3aadfb9SLukas Ertl 	if (cbp == NULL)
334c3aadfb9SLukas Ertl 		return (ENOMEM);
335c3aadfb9SLukas Ertl 	wp->parity = cbp;
336c3aadfb9SLukas Ertl 
337c3aadfb9SLukas Ertl 	p->synced = boff;
338c3aadfb9SLukas Ertl 
339c0b9797aSUlf Lilleengen 	/* Post notification that we're finished. */
340c3aadfb9SLukas Ertl 	return (0);
341c3aadfb9SLukas Ertl }
342c3aadfb9SLukas Ertl 
34373679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */
344c0b9797aSUlf Lilleengen static int
345c0b9797aSUlf Lilleengen gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
346c0b9797aSUlf Lilleengen     struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
34773679edcSLukas Ertl {
34873679edcSLukas Ertl 	struct gv_sd *broken, *original, *parity, *s;
34967e3ab6eSLukas Ertl 	struct gv_bioq *bq;
350c0b9797aSUlf Lilleengen 	struct bio *cbp;
351c0b9797aSUlf Lilleengen 	int i, psdno, sdno, type, grow;
352fb4e65d0SLukas Ertl 	off_t real_len, real_off;
35373679edcSLukas Ertl 
35473679edcSLukas Ertl 	if (p == NULL || LIST_EMPTY(&p->subdisks))
35573679edcSLukas Ertl 		return (ENXIO);
35673679edcSLukas Ertl 
35773679edcSLukas Ertl 	/* We are optimistic and assume that this request will be OK. */
35867e3ab6eSLukas Ertl #define	REQ_TYPE_NORMAL		0
35967e3ab6eSLukas Ertl #define	REQ_TYPE_DEGRADED	1
36067e3ab6eSLukas Ertl #define	REQ_TYPE_NOPARITY	2
36167e3ab6eSLukas Ertl 
36267e3ab6eSLukas Ertl 	type = REQ_TYPE_NORMAL;
36373679edcSLukas Ertl 	original = parity = broken = NULL;
36473679edcSLukas Ertl 
365c0b9797aSUlf Lilleengen 	/* XXX: The resize won't crash with rebuild or sync, but we should still
366c0b9797aSUlf Lilleengen 	 * be aware of it. Also this should perhaps be done on rebuild/check as
367c0b9797aSUlf Lilleengen 	 * well?
368c0b9797aSUlf Lilleengen 	 */
369c0b9797aSUlf Lilleengen 	/* If we're over, we must use the old. */
370c0b9797aSUlf Lilleengen 	if (boff >= p->synced) {
371c0b9797aSUlf Lilleengen 		grow = 1;
372c0b9797aSUlf Lilleengen 	/* Or if over the resized offset, we use all drives. */
373c0b9797aSUlf Lilleengen 	} else if (boff + bcount <= p->synced) {
374c0b9797aSUlf Lilleengen 		grow = 0;
375c0b9797aSUlf Lilleengen 	/* Else, we're in the middle, and must wait a bit. */
376c0b9797aSUlf Lilleengen 	} else {
377c0b9797aSUlf Lilleengen 		bioq_disksort(p->rqueue, bp);
378c0b9797aSUlf Lilleengen 		*delay = 1;
379c0b9797aSUlf Lilleengen 		return (0);
380c0b9797aSUlf Lilleengen 	}
381c0b9797aSUlf Lilleengen 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
382c0b9797aSUlf Lilleengen 	    &sdno, &psdno, grow);
38373679edcSLukas Ertl 
38473679edcSLukas Ertl 	/* Find the right subdisks. */
38573679edcSLukas Ertl 	i = 0;
38673679edcSLukas Ertl 	LIST_FOREACH(s, &p->subdisks, in_plex) {
38773679edcSLukas Ertl 		if (i == sdno)
38873679edcSLukas Ertl 			original = s;
38973679edcSLukas Ertl 		if (i == psdno)
39073679edcSLukas Ertl 			parity = s;
39173679edcSLukas Ertl 		if (s->state != GV_SD_UP)
39273679edcSLukas Ertl 			broken = s;
39373679edcSLukas Ertl 		i++;
39473679edcSLukas Ertl 	}
39573679edcSLukas Ertl 
39673679edcSLukas Ertl 	if ((original == NULL) || (parity == NULL))
39773679edcSLukas Ertl 		return (ENXIO);
39873679edcSLukas Ertl 
39973679edcSLukas Ertl 	/* Our data stripe is missing. */
40073679edcSLukas Ertl 	if (original->state != GV_SD_UP)
40167e3ab6eSLukas Ertl 		type = REQ_TYPE_DEGRADED;
402c0b9797aSUlf Lilleengen 
403c0b9797aSUlf Lilleengen 	/* If synchronizing request, just write it if disks are stale. */
404c0b9797aSUlf Lilleengen 	if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
405d8d015cdSUlf Lilleengen 	    bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
406c0b9797aSUlf Lilleengen 		type = REQ_TYPE_NORMAL;
40773679edcSLukas Ertl 	/* Our parity stripe is missing. */
408c0b9797aSUlf Lilleengen 	} else if (parity->state != GV_SD_UP) {
40973679edcSLukas Ertl 		/* We cannot take another failure if we're already degraded. */
41067e3ab6eSLukas Ertl 		if (type != REQ_TYPE_NORMAL)
41173679edcSLukas Ertl 			return (ENXIO);
41273679edcSLukas Ertl 		else
41367e3ab6eSLukas Ertl 			type = REQ_TYPE_NOPARITY;
41473679edcSLukas Ertl 	}
41573679edcSLukas Ertl 
41667e3ab6eSLukas Ertl 	wp->length = real_len;
41773679edcSLukas Ertl 	wp->data = addr;
41867e3ab6eSLukas Ertl 	wp->lockbase = real_off;
41973679edcSLukas Ertl 
42073679edcSLukas Ertl 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
42173679edcSLukas Ertl 
422c0b9797aSUlf Lilleengen 	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
423c3aadfb9SLukas Ertl 		type = REQ_TYPE_NORMAL;
424c3aadfb9SLukas Ertl 
425c0b9797aSUlf Lilleengen 	if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
426c0b9797aSUlf Lilleengen 		bioq_disksort(p->rqueue, bp);
427c0b9797aSUlf Lilleengen 		*delay = 1;
428c0b9797aSUlf Lilleengen 		return (0);
429c0b9797aSUlf Lilleengen 	}
430c0b9797aSUlf Lilleengen 
43173679edcSLukas Ertl 	switch (bp->bio_cmd) {
43273679edcSLukas Ertl 	case BIO_READ:
43373679edcSLukas Ertl 		/*
43473679edcSLukas Ertl 		 * For a degraded read we need to read in all stripes except
43573679edcSLukas Ertl 		 * the broken one plus the parity stripe and then recalculate
43673679edcSLukas Ertl 		 * the desired data.
43773679edcSLukas Ertl 		 */
43867e3ab6eSLukas Ertl 		if (type == REQ_TYPE_DEGRADED) {
43967e3ab6eSLukas Ertl 			bzero(wp->data, wp->length);
44073679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
44173679edcSLukas Ertl 				/* Skip the broken subdisk. */
44273679edcSLukas Ertl 				if (s == broken)
44373679edcSLukas Ertl 					continue;
444c0b9797aSUlf Lilleengen 				/* Skip growing if within offset. */
445c0b9797aSUlf Lilleengen 				if (grow && s->flags & GV_SD_GROW)
446c0b9797aSUlf Lilleengen 					continue;
447c0b9797aSUlf Lilleengen 				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
44867e3ab6eSLukas Ertl 				if (cbp == NULL)
44973679edcSLukas Ertl 					return (ENOMEM);
45067e3ab6eSLukas Ertl 
451c0b9797aSUlf Lilleengen 				bioq_insert_tail(p->bqueue, cbp);
45267e3ab6eSLukas Ertl 
45367e3ab6eSLukas Ertl 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
45467e3ab6eSLukas Ertl 				bq->bp = cbp;
45567e3ab6eSLukas Ertl 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
45673679edcSLukas Ertl 			}
45773679edcSLukas Ertl 
45873679edcSLukas Ertl 		/* A normal read can be fulfilled with the original subdisk. */
45973679edcSLukas Ertl 		} else {
460c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
46167e3ab6eSLukas Ertl 			if (cbp == NULL)
46273679edcSLukas Ertl 				return (ENOMEM);
46367e3ab6eSLukas Ertl 
464c0b9797aSUlf Lilleengen 			bioq_insert_tail(p->bqueue, cbp);
46573679edcSLukas Ertl 		}
46673679edcSLukas Ertl 		wp->lockbase = -1;
46767e3ab6eSLukas Ertl 
46873679edcSLukas Ertl 		break;
46973679edcSLukas Ertl 
47073679edcSLukas Ertl 	case BIO_WRITE:
47173679edcSLukas Ertl 		/*
47273679edcSLukas Ertl 		 * A degraded write means we cannot write to the original data
47373679edcSLukas Ertl 		 * subdisk.  Thus we need to read in all valid stripes,
47473679edcSLukas Ertl 		 * recalculate the parity from the original data, and then
47573679edcSLukas Ertl 		 * write the parity stripe back out.
47673679edcSLukas Ertl 		 */
47767e3ab6eSLukas Ertl 		if (type == REQ_TYPE_DEGRADED) {
47867e3ab6eSLukas Ertl 			/* Read all subdisks. */
47973679edcSLukas Ertl 			LIST_FOREACH(s, &p->subdisks, in_plex) {
48073679edcSLukas Ertl 				/* Skip the broken and the parity subdisk. */
48167e3ab6eSLukas Ertl 				if ((s == broken) || (s == parity))
48273679edcSLukas Ertl 					continue;
483c0b9797aSUlf Lilleengen 				/* Skip growing if within offset. */
484c0b9797aSUlf Lilleengen 				if (grow && s->flags & GV_SD_GROW)
485c0b9797aSUlf Lilleengen 					continue;
48673679edcSLukas Ertl 
487c0b9797aSUlf Lilleengen 				cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
48867e3ab6eSLukas Ertl 				if (cbp == NULL)
48973679edcSLukas Ertl 					return (ENOMEM);
49067e3ab6eSLukas Ertl 				cbp->bio_cmd = BIO_READ;
49167e3ab6eSLukas Ertl 
492c0b9797aSUlf Lilleengen 				bioq_insert_tail(p->bqueue, cbp);
49367e3ab6eSLukas Ertl 
49467e3ab6eSLukas Ertl 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
49567e3ab6eSLukas Ertl 				bq->bp = cbp;
49667e3ab6eSLukas Ertl 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
49773679edcSLukas Ertl 			}
49873679edcSLukas Ertl 
49967e3ab6eSLukas Ertl 			/* Write the parity data. */
500c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
50167e3ab6eSLukas Ertl 			if (cbp == NULL)
50273679edcSLukas Ertl 				return (ENOMEM);
503c0b9797aSUlf Lilleengen 			bcopy(addr, cbp->bio_data, wp->length);
50467e3ab6eSLukas Ertl 			wp->parity = cbp;
50573679edcSLukas Ertl 
50673679edcSLukas Ertl 		/*
50767e3ab6eSLukas Ertl 		 * When the parity stripe is missing we just write out the data.
50873679edcSLukas Ertl 		 */
50967e3ab6eSLukas Ertl 		} else if (type == REQ_TYPE_NOPARITY) {
510c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
51167e3ab6eSLukas Ertl 			if (cbp == NULL)
512291cb0acSLukas Ertl 				return (ENOMEM);
51373679edcSLukas Ertl 
514c0b9797aSUlf Lilleengen 			bioq_insert_tail(p->bqueue, cbp);
51573679edcSLukas Ertl 
51667e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
51767e3ab6eSLukas Ertl 			bq->bp = cbp;
51867e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
51973679edcSLukas Ertl 
52073679edcSLukas Ertl 		/*
52173679edcSLukas Ertl 		 * A normal write request goes to the original subdisk, then we
52273679edcSLukas Ertl 		 * read in all other stripes, recalculate the parity and write
52373679edcSLukas Ertl 		 * out the parity again.
52473679edcSLukas Ertl 		 */
52573679edcSLukas Ertl 		} else {
52667e3ab6eSLukas Ertl 			/* Read old parity. */
527c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
52867e3ab6eSLukas Ertl 			if (cbp == NULL)
529291cb0acSLukas Ertl 				return (ENOMEM);
53067e3ab6eSLukas Ertl 			cbp->bio_cmd = BIO_READ;
53173679edcSLukas Ertl 
532c0b9797aSUlf Lilleengen 			bioq_insert_tail(p->bqueue, cbp);
53367e3ab6eSLukas Ertl 
53467e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
53567e3ab6eSLukas Ertl 			bq->bp = cbp;
53667e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
53767e3ab6eSLukas Ertl 
53867e3ab6eSLukas Ertl 			/* Read old data. */
539c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
54067e3ab6eSLukas Ertl 			if (cbp == NULL)
54173679edcSLukas Ertl 				return (ENOMEM);
54267e3ab6eSLukas Ertl 			cbp->bio_cmd = BIO_READ;
54367e3ab6eSLukas Ertl 
544c0b9797aSUlf Lilleengen 			bioq_insert_tail(p->bqueue, cbp);
54567e3ab6eSLukas Ertl 
54667e3ab6eSLukas Ertl 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
54767e3ab6eSLukas Ertl 			bq->bp = cbp;
54867e3ab6eSLukas Ertl 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
54967e3ab6eSLukas Ertl 
55067e3ab6eSLukas Ertl 			/* Write new data. */
551c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
55267e3ab6eSLukas Ertl 			if (cbp == NULL)
55367e3ab6eSLukas Ertl 				return (ENOMEM);
55467e3ab6eSLukas Ertl 
55573679edcSLukas Ertl 			/*
55667e3ab6eSLukas Ertl 			 * We must not write the new data until the old data
55767e3ab6eSLukas Ertl 			 * was read, so hold this BIO back until we're ready
55867e3ab6eSLukas Ertl 			 * for it.
55973679edcSLukas Ertl 			 */
56067e3ab6eSLukas Ertl 			wp->waiting = cbp;
56167e3ab6eSLukas Ertl 
56267e3ab6eSLukas Ertl 			/* The final bio for the parity. */
563c0b9797aSUlf Lilleengen 			cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
56467e3ab6eSLukas Ertl 			if (cbp == NULL)
565291cb0acSLukas Ertl 				return (ENOMEM);
56667e3ab6eSLukas Ertl 
56767e3ab6eSLukas Ertl 			/* Remember that this is the BIO for the parity data. */
56867e3ab6eSLukas Ertl 			wp->parity = cbp;
56973679edcSLukas Ertl 		}
57073679edcSLukas Ertl 		break;
57167e3ab6eSLukas Ertl 
57273679edcSLukas Ertl 	default:
57373679edcSLukas Ertl 		return (EINVAL);
57473679edcSLukas Ertl 	}
57573679edcSLukas Ertl 
57673679edcSLukas Ertl 	return (0);
57773679edcSLukas Ertl }
578fb4e65d0SLukas Ertl 
579c0b9797aSUlf Lilleengen /*
580c0b9797aSUlf Lilleengen  * Calculate the offsets in the various subdisks for a RAID5 request. Also take
581c0b9797aSUlf Lilleengen  * care of new subdisks in an expanded RAID5 array.
582c0b9797aSUlf Lilleengen  * XXX: This assumes that the new subdisks are inserted after the others (which
583c0b9797aSUlf Lilleengen  * is okay as long as plex_offset is larger). If subdisks are inserted into the
584c0b9797aSUlf Lilleengen  * plexlist before, we get problems.
585c0b9797aSUlf Lilleengen  */
586c0b9797aSUlf Lilleengen static int
587fb4e65d0SLukas Ertl gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
588c0b9797aSUlf Lilleengen     off_t *real_len, int *sdno, int *psdno, int growing)
589fb4e65d0SLukas Ertl {
590c0b9797aSUlf Lilleengen 	struct gv_sd *s;
591c0b9797aSUlf Lilleengen 	int sd, psd, sdcount;
592fb4e65d0SLukas Ertl 	off_t len_left, stripeend, stripeoff, stripestart;
593fb4e65d0SLukas Ertl 
594c0b9797aSUlf Lilleengen 	sdcount = p->sdcount;
595c0b9797aSUlf Lilleengen 	if (growing) {
596c0b9797aSUlf Lilleengen 		LIST_FOREACH(s, &p->subdisks, in_plex) {
597c0b9797aSUlf Lilleengen 			if (s->flags & GV_SD_GROW)
598c0b9797aSUlf Lilleengen 				sdcount--;
599c0b9797aSUlf Lilleengen 		}
600c0b9797aSUlf Lilleengen 	}
601c0b9797aSUlf Lilleengen 
602fb4e65d0SLukas Ertl 	/* The number of the subdisk containing the parity stripe. */
603c0b9797aSUlf Lilleengen 	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
604c0b9797aSUlf Lilleengen 	    sdcount;
60587bb53cbSEd Maste 	KASSERT(psd >= 0, ("gv_raid5_offset: psdno < 0"));
606fb4e65d0SLukas Ertl 
607fb4e65d0SLukas Ertl 	/* Offset of the start address from the start of the stripe. */
608c0b9797aSUlf Lilleengen 	stripeoff = boff % (p->stripesize * (sdcount - 1));
609fb4e65d0SLukas Ertl 	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
610fb4e65d0SLukas Ertl 
611fb4e65d0SLukas Ertl 	/* The number of the subdisk where the stripe resides. */
612fb4e65d0SLukas Ertl 	sd = stripeoff / p->stripesize;
61387bb53cbSEd Maste 	KASSERT(sd >= 0, ("gv_raid5_offset: sdno < 0"));
614fb4e65d0SLukas Ertl 
615fb4e65d0SLukas Ertl 	/* At or past parity subdisk. */
616fb4e65d0SLukas Ertl 	if (sd >= psd)
617fb4e65d0SLukas Ertl 		sd++;
618fb4e65d0SLukas Ertl 
619fb4e65d0SLukas Ertl 	/* The offset of the stripe on this subdisk. */
620c0b9797aSUlf Lilleengen 	stripestart = (boff - stripeoff) / (sdcount - 1);
621fb4e65d0SLukas Ertl 	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
622fb4e65d0SLukas Ertl 
623fb4e65d0SLukas Ertl 	stripeoff %= p->stripesize;
624fb4e65d0SLukas Ertl 
625fb4e65d0SLukas Ertl 	/* The offset of the request on this subdisk. */
626fb4e65d0SLukas Ertl 	*real_off = stripestart + stripeoff;
627fb4e65d0SLukas Ertl 
628fb4e65d0SLukas Ertl 	stripeend = stripestart + p->stripesize;
629fb4e65d0SLukas Ertl 	len_left = stripeend - *real_off;
630fb4e65d0SLukas Ertl 	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
631fb4e65d0SLukas Ertl 
632fb4e65d0SLukas Ertl 	*real_len = (bcount <= len_left) ? bcount : len_left;
633fb4e65d0SLukas Ertl 
634fb4e65d0SLukas Ertl 	if (sdno != NULL)
635fb4e65d0SLukas Ertl 		*sdno = sd;
636fb4e65d0SLukas Ertl 	if (psdno != NULL)
637fb4e65d0SLukas Ertl 		*psdno = psd;
638fb4e65d0SLukas Ertl 
639fb4e65d0SLukas Ertl 	return (0);
640fb4e65d0SLukas Ertl }
641c0b9797aSUlf Lilleengen 
642c0b9797aSUlf Lilleengen static struct bio *
643c0b9797aSUlf Lilleengen gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
644c0b9797aSUlf Lilleengen     caddr_t addr, int use_wp)
645c0b9797aSUlf Lilleengen {
646c0b9797aSUlf Lilleengen 	struct bio *cbp;
647c0b9797aSUlf Lilleengen 
648c0b9797aSUlf Lilleengen 	cbp = g_clone_bio(bp);
649c0b9797aSUlf Lilleengen 	if (cbp == NULL)
650c0b9797aSUlf Lilleengen 		return (NULL);
651c0b9797aSUlf Lilleengen 	if (addr == NULL) {
652c0b9797aSUlf Lilleengen 		cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
653c0b9797aSUlf Lilleengen 		cbp->bio_cflags |= GV_BIO_MALLOC;
654c0b9797aSUlf Lilleengen 	} else
655c0b9797aSUlf Lilleengen 		cbp->bio_data = addr;
656c0b9797aSUlf Lilleengen 	cbp->bio_offset = wp->lockbase + s->drive_offset;
657c0b9797aSUlf Lilleengen 	cbp->bio_length = wp->length;
658c0b9797aSUlf Lilleengen 	cbp->bio_done = gv_done;
659c0b9797aSUlf Lilleengen 	cbp->bio_caller1 = s;
660a29df733SAlexander Motin 	s->drive_sc->active++;
661c0b9797aSUlf Lilleengen 	if (use_wp)
662c0b9797aSUlf Lilleengen 		cbp->bio_caller2 = wp;
663c0b9797aSUlf Lilleengen 
664c0b9797aSUlf Lilleengen 	return (cbp);
665c0b9797aSUlf Lilleengen }
666