173679edcSLukas Ertl /*-
24d846d26SWarner Losh * SPDX-License-Identifier: BSD-2-Clause
33728855aSPedro F. Giffuni *
4c0b9797aSUlf Lilleengen * Copyright (c) 2004, 2007 Lukas Ertl
573679edcSLukas Ertl * All rights reserved.
673679edcSLukas Ertl *
773679edcSLukas Ertl * Redistribution and use in source and binary forms, with or without
873679edcSLukas Ertl * modification, are permitted provided that the following conditions
973679edcSLukas Ertl * are met:
1073679edcSLukas Ertl * 1. Redistributions of source code must retain the above copyright
1173679edcSLukas Ertl * notice, this list of conditions and the following disclaimer.
1273679edcSLukas Ertl * 2. Redistributions in binary form must reproduce the above copyright
1373679edcSLukas Ertl * notice, this list of conditions and the following disclaimer in the
1473679edcSLukas Ertl * documentation and/or other materials provided with the distribution.
1573679edcSLukas Ertl *
1673679edcSLukas Ertl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1773679edcSLukas Ertl * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1873679edcSLukas Ertl * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1973679edcSLukas Ertl * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2073679edcSLukas Ertl * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2173679edcSLukas Ertl * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2273679edcSLukas Ertl * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2373679edcSLukas Ertl * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2473679edcSLukas Ertl * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2573679edcSLukas Ertl * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2673679edcSLukas Ertl * SUCH DAMAGE.
2773679edcSLukas Ertl */
2873679edcSLukas Ertl
2973679edcSLukas Ertl #include <sys/param.h>
3073679edcSLukas Ertl #include <sys/bio.h>
3173679edcSLukas Ertl #include <sys/lock.h>
3273679edcSLukas Ertl #include <sys/malloc.h>
3373679edcSLukas Ertl #include <sys/systm.h>
3473679edcSLukas Ertl
3573679edcSLukas Ertl #include <geom/geom.h>
36ac03832eSConrad Meyer #include <geom/geom_dbg.h>
3773679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h>
3873679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h>
3973679edcSLukas Ertl #include <geom/vinum/geom_vinum.h>
4073679edcSLukas Ertl
41c0b9797aSUlf Lilleengen static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
42c0b9797aSUlf Lilleengen off_t *, off_t *, int *, int *, int);
43c0b9797aSUlf Lilleengen static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
44c0b9797aSUlf Lilleengen struct gv_raid5_packet *, caddr_t, int);
45c0b9797aSUlf Lilleengen static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
46c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t, int *);
47c0b9797aSUlf Lilleengen static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
48c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t);
49c0b9797aSUlf Lilleengen static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
50c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t);
51c0b9797aSUlf Lilleengen
52c0b9797aSUlf Lilleengen struct gv_raid5_packet *
gv_raid5_start(struct gv_plex * p,struct bio * bp,caddr_t addr,off_t boff,off_t bcount)53c0b9797aSUlf Lilleengen gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
54c0b9797aSUlf Lilleengen off_t bcount)
55c0b9797aSUlf Lilleengen {
56c0b9797aSUlf Lilleengen struct bio *cbp;
57c0b9797aSUlf Lilleengen struct gv_raid5_packet *wp, *wp2;
58c0b9797aSUlf Lilleengen struct gv_bioq *bq, *bq2;
59c0b9797aSUlf Lilleengen int err, delay;
60c0b9797aSUlf Lilleengen
61c0b9797aSUlf Lilleengen delay = 0;
62c0b9797aSUlf Lilleengen wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
63c0b9797aSUlf Lilleengen wp->bio = bp;
64c0b9797aSUlf Lilleengen wp->waiting = NULL;
65c0b9797aSUlf Lilleengen wp->parity = NULL;
66c0b9797aSUlf Lilleengen TAILQ_INIT(&wp->bits);
67c0b9797aSUlf Lilleengen
68d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_REBUILD)
69c0b9797aSUlf Lilleengen err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
70d8d015cdSUlf Lilleengen else if (bp->bio_pflags & GV_BIO_CHECK)
71c0b9797aSUlf Lilleengen err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
72c0b9797aSUlf Lilleengen else
73c0b9797aSUlf Lilleengen err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
74c0b9797aSUlf Lilleengen
75c0b9797aSUlf Lilleengen /* Means we have a delayed request. */
76c0b9797aSUlf Lilleengen if (delay) {
77c0b9797aSUlf Lilleengen g_free(wp);
78c0b9797aSUlf Lilleengen return (NULL);
79c0b9797aSUlf Lilleengen }
80c0b9797aSUlf Lilleengen
81c0b9797aSUlf Lilleengen /*
82c0b9797aSUlf Lilleengen * Building the sub-request failed, we probably need to clean up a lot.
83c0b9797aSUlf Lilleengen */
84c0b9797aSUlf Lilleengen if (err) {
85c0b9797aSUlf Lilleengen G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
86c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
87c0b9797aSUlf Lilleengen TAILQ_REMOVE(&wp->bits, bq, queue);
88c0b9797aSUlf Lilleengen g_free(bq);
89c0b9797aSUlf Lilleengen }
90c0b9797aSUlf Lilleengen if (wp->waiting != NULL) {
91c0b9797aSUlf Lilleengen if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
92c0b9797aSUlf Lilleengen g_free(wp->waiting->bio_data);
93a29df733SAlexander Motin gv_drive_done(wp->waiting->bio_caller1);
94c0b9797aSUlf Lilleengen g_destroy_bio(wp->waiting);
95c0b9797aSUlf Lilleengen }
96c0b9797aSUlf Lilleengen if (wp->parity != NULL) {
97c0b9797aSUlf Lilleengen if (wp->parity->bio_cflags & GV_BIO_MALLOC)
98c0b9797aSUlf Lilleengen g_free(wp->parity->bio_data);
99a29df733SAlexander Motin gv_drive_done(wp->parity->bio_caller1);
100c0b9797aSUlf Lilleengen g_destroy_bio(wp->parity);
101c0b9797aSUlf Lilleengen }
102c0b9797aSUlf Lilleengen g_free(wp);
103c0b9797aSUlf Lilleengen
104c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
105c0b9797aSUlf Lilleengen if (wp->bio != bp)
106c0b9797aSUlf Lilleengen continue;
107c0b9797aSUlf Lilleengen
108c0b9797aSUlf Lilleengen TAILQ_REMOVE(&p->packets, wp, list);
109c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
110c0b9797aSUlf Lilleengen TAILQ_REMOVE(&wp->bits, bq, queue);
111c0b9797aSUlf Lilleengen g_free(bq);
112c0b9797aSUlf Lilleengen }
113c0b9797aSUlf Lilleengen g_free(wp);
114c0b9797aSUlf Lilleengen }
115c0b9797aSUlf Lilleengen
116c0b9797aSUlf Lilleengen cbp = bioq_takefirst(p->bqueue);
117c0b9797aSUlf Lilleengen while (cbp != NULL) {
118c0b9797aSUlf Lilleengen if (cbp->bio_cflags & GV_BIO_MALLOC)
119c0b9797aSUlf Lilleengen g_free(cbp->bio_data);
120a29df733SAlexander Motin gv_drive_done(cbp->bio_caller1);
121c0b9797aSUlf Lilleengen g_destroy_bio(cbp);
122c0b9797aSUlf Lilleengen cbp = bioq_takefirst(p->bqueue);
123c0b9797aSUlf Lilleengen }
124c0b9797aSUlf Lilleengen
125c0b9797aSUlf Lilleengen /* If internal, stop and reset state. */
126d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_INTERNAL) {
127d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_MALLOC)
1281d8dfc60SUlf Lilleengen g_free(bp->bio_data);
129c0b9797aSUlf Lilleengen g_destroy_bio(bp);
130c0b9797aSUlf Lilleengen /* Reset flags. */
131c0b9797aSUlf Lilleengen p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
132c0b9797aSUlf Lilleengen GV_PLEX_GROWING);
133c0b9797aSUlf Lilleengen return (NULL);
134c0b9797aSUlf Lilleengen }
135c0b9797aSUlf Lilleengen g_io_deliver(bp, err);
136c0b9797aSUlf Lilleengen return (NULL);
137c0b9797aSUlf Lilleengen }
138c0b9797aSUlf Lilleengen
139c0b9797aSUlf Lilleengen return (wp);
140c0b9797aSUlf Lilleengen }
141fb4e65d0SLukas Ertl
14273679edcSLukas Ertl /*
14373679edcSLukas Ertl * Check if the stripe that the work packet wants is already being used by
14473679edcSLukas Ertl * some other work packet.
14573679edcSLukas Ertl */
14673679edcSLukas Ertl int
gv_stripe_active(struct gv_plex * p,struct bio * bp)14767e3ab6eSLukas Ertl gv_stripe_active(struct gv_plex *p, struct bio *bp)
14873679edcSLukas Ertl {
14967e3ab6eSLukas Ertl struct gv_raid5_packet *wp, *owp;
15067e3ab6eSLukas Ertl int overlap;
15173679edcSLukas Ertl
152c0b9797aSUlf Lilleengen wp = bp->bio_caller2;
15367e3ab6eSLukas Ertl if (wp->lockbase == -1)
15473679edcSLukas Ertl return (0);
15573679edcSLukas Ertl
15667e3ab6eSLukas Ertl overlap = 0;
15767e3ab6eSLukas Ertl TAILQ_FOREACH(owp, &p->packets, list) {
15867e3ab6eSLukas Ertl if (owp == wp)
15973679edcSLukas Ertl break;
16067e3ab6eSLukas Ertl if ((wp->lockbase >= owp->lockbase) &&
16167e3ab6eSLukas Ertl (wp->lockbase <= owp->lockbase + owp->length)) {
16267e3ab6eSLukas Ertl overlap++;
16373679edcSLukas Ertl break;
16473679edcSLukas Ertl }
16567e3ab6eSLukas Ertl if ((wp->lockbase <= owp->lockbase) &&
16667e3ab6eSLukas Ertl (wp->lockbase + wp->length >= owp->lockbase)) {
16767e3ab6eSLukas Ertl overlap++;
16867e3ab6eSLukas Ertl break;
16967e3ab6eSLukas Ertl }
17067e3ab6eSLukas Ertl }
17173679edcSLukas Ertl
17267e3ab6eSLukas Ertl return (overlap);
17373679edcSLukas Ertl }
17473679edcSLukas Ertl
175c0b9797aSUlf Lilleengen static int
gv_raid5_check(struct gv_plex * p,struct gv_raid5_packet * wp,struct bio * bp,caddr_t addr,off_t boff,off_t bcount)176c0b9797aSUlf Lilleengen gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
177fb5885afSLukas Ertl caddr_t addr, off_t boff, off_t bcount)
178fb5885afSLukas Ertl {
179fb5885afSLukas Ertl struct gv_sd *parity, *s;
180fb5885afSLukas Ertl struct gv_bioq *bq;
181c0b9797aSUlf Lilleengen struct bio *cbp;
182fb5885afSLukas Ertl int i, psdno;
183fb5885afSLukas Ertl off_t real_len, real_off;
184fb5885afSLukas Ertl
185fb5885afSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks))
186fb5885afSLukas Ertl return (ENXIO);
187fb5885afSLukas Ertl
188c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
189fb5885afSLukas Ertl
190fb5885afSLukas Ertl /* Find the right subdisk. */
191fb5885afSLukas Ertl parity = NULL;
192fb5885afSLukas Ertl i = 0;
193fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
194fb5885afSLukas Ertl if (i == psdno) {
195fb5885afSLukas Ertl parity = s;
196fb5885afSLukas Ertl break;
197fb5885afSLukas Ertl }
198fb5885afSLukas Ertl i++;
199fb5885afSLukas Ertl }
200fb5885afSLukas Ertl
201fb5885afSLukas Ertl /* Parity stripe not found. */
202fb5885afSLukas Ertl if (parity == NULL)
203fb5885afSLukas Ertl return (ENXIO);
204fb5885afSLukas Ertl
205fb5885afSLukas Ertl if (parity->state != GV_SD_UP)
206fb5885afSLukas Ertl return (ENXIO);
207fb5885afSLukas Ertl
208fb5885afSLukas Ertl wp->length = real_len;
209fb5885afSLukas Ertl wp->data = addr;
210fb5885afSLukas Ertl wp->lockbase = real_off;
211fb5885afSLukas Ertl
212fb5885afSLukas Ertl /* Read all subdisks. */
213fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
214fb5885afSLukas Ertl /* Skip the parity subdisk. */
215fb5885afSLukas Ertl if (s == parity)
216fb5885afSLukas Ertl continue;
217c0b9797aSUlf Lilleengen /* Skip growing subdisks. */
218c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW)
219c0b9797aSUlf Lilleengen continue;
220fb5885afSLukas Ertl
221c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
222fb5885afSLukas Ertl if (cbp == NULL)
223fb5885afSLukas Ertl return (ENOMEM);
224fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ;
225fb5885afSLukas Ertl
226c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
227fb5885afSLukas Ertl
228fb5885afSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
229fb5885afSLukas Ertl bq->bp = cbp;
230fb5885afSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
231fb5885afSLukas Ertl }
232fb5885afSLukas Ertl
233fb5885afSLukas Ertl /* Read the parity data. */
234c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
235fb5885afSLukas Ertl if (cbp == NULL)
236fb5885afSLukas Ertl return (ENOMEM);
237fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ;
238fb5885afSLukas Ertl wp->waiting = cbp;
239fb5885afSLukas Ertl
240fb5885afSLukas Ertl /*
241fb5885afSLukas Ertl * In case we want to rebuild the parity, create an extra BIO to write
242fb5885afSLukas Ertl * it out. It also acts as buffer for the XOR operations.
243fb5885afSLukas Ertl */
244c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
245fb5885afSLukas Ertl if (cbp == NULL)
246fb5885afSLukas Ertl return (ENOMEM);
247fb5885afSLukas Ertl wp->parity = cbp;
248fb5885afSLukas Ertl
249fb5885afSLukas Ertl return (0);
250fb5885afSLukas Ertl }
251fb5885afSLukas Ertl
252fb5885afSLukas Ertl /* Rebuild a degraded RAID5 plex. */
253c0b9797aSUlf Lilleengen static int
gv_raid5_rebuild(struct gv_plex * p,struct gv_raid5_packet * wp,struct bio * bp,caddr_t addr,off_t boff,off_t bcount)254c0b9797aSUlf Lilleengen gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
255c3aadfb9SLukas Ertl caddr_t addr, off_t boff, off_t bcount)
256c3aadfb9SLukas Ertl {
257c3aadfb9SLukas Ertl struct gv_sd *broken, *s;
258c3aadfb9SLukas Ertl struct gv_bioq *bq;
259c0b9797aSUlf Lilleengen struct bio *cbp;
260fb4e65d0SLukas Ertl off_t real_len, real_off;
261c3aadfb9SLukas Ertl
262c3aadfb9SLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks))
263c3aadfb9SLukas Ertl return (ENXIO);
264c3aadfb9SLukas Ertl
265c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
266c3aadfb9SLukas Ertl
267c3aadfb9SLukas Ertl /* Find the right subdisk. */
268c3aadfb9SLukas Ertl broken = NULL;
269c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
270c3aadfb9SLukas Ertl if (s->state != GV_SD_UP)
271c3aadfb9SLukas Ertl broken = s;
272c3aadfb9SLukas Ertl }
273c3aadfb9SLukas Ertl
274fb5885afSLukas Ertl /* Broken stripe not found. */
275c3aadfb9SLukas Ertl if (broken == NULL)
276c3aadfb9SLukas Ertl return (ENXIO);
277c3aadfb9SLukas Ertl
278c3aadfb9SLukas Ertl switch (broken->state) {
279c3aadfb9SLukas Ertl case GV_SD_UP:
280c3aadfb9SLukas Ertl return (EINVAL);
281c3aadfb9SLukas Ertl
282c3aadfb9SLukas Ertl case GV_SD_STALE:
283d8d015cdSUlf Lilleengen if (!(bp->bio_pflags & GV_BIO_REBUILD))
284c3aadfb9SLukas Ertl return (ENXIO);
285c3aadfb9SLukas Ertl
28686b3c6f5SUlf Lilleengen G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
287c3aadfb9SLukas Ertl gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
288c0b9797aSUlf Lilleengen /* Set this bit now, but should be set at end. */
289c0b9797aSUlf Lilleengen broken->flags |= GV_SD_CANGOUP;
290c3aadfb9SLukas Ertl break;
291c3aadfb9SLukas Ertl
292c3aadfb9SLukas Ertl case GV_SD_REVIVING:
293c3aadfb9SLukas Ertl break;
294c3aadfb9SLukas Ertl
295c3aadfb9SLukas Ertl default:
296c3aadfb9SLukas Ertl /* All other subdisk states mean it's not accessible. */
297c3aadfb9SLukas Ertl return (ENXIO);
298c3aadfb9SLukas Ertl }
299c3aadfb9SLukas Ertl
300c3aadfb9SLukas Ertl wp->length = real_len;
301c3aadfb9SLukas Ertl wp->data = addr;
302c3aadfb9SLukas Ertl wp->lockbase = real_off;
303c3aadfb9SLukas Ertl
304fb4e65d0SLukas Ertl KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
305c3aadfb9SLukas Ertl
306c3aadfb9SLukas Ertl /* Read all subdisks. */
307c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
308c3aadfb9SLukas Ertl /* Skip the broken subdisk. */
309c3aadfb9SLukas Ertl if (s == broken)
310c3aadfb9SLukas Ertl continue;
311c3aadfb9SLukas Ertl
312c0b9797aSUlf Lilleengen /* Skip growing subdisks. */
313c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW)
314c0b9797aSUlf Lilleengen continue;
315c0b9797aSUlf Lilleengen
316c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
317c3aadfb9SLukas Ertl if (cbp == NULL)
318c3aadfb9SLukas Ertl return (ENOMEM);
319c3aadfb9SLukas Ertl cbp->bio_cmd = BIO_READ;
320c3aadfb9SLukas Ertl
321c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
322c3aadfb9SLukas Ertl
323c3aadfb9SLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
324c3aadfb9SLukas Ertl bq->bp = cbp;
325c3aadfb9SLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
326c3aadfb9SLukas Ertl }
327c3aadfb9SLukas Ertl
328c3aadfb9SLukas Ertl /* Write the parity data. */
329c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
330c3aadfb9SLukas Ertl if (cbp == NULL)
331c3aadfb9SLukas Ertl return (ENOMEM);
332c3aadfb9SLukas Ertl wp->parity = cbp;
333c3aadfb9SLukas Ertl
334c3aadfb9SLukas Ertl p->synced = boff;
335c3aadfb9SLukas Ertl
336c0b9797aSUlf Lilleengen /* Post notification that we're finished. */
337c3aadfb9SLukas Ertl return (0);
338c3aadfb9SLukas Ertl }
339c3aadfb9SLukas Ertl
34073679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */
341c0b9797aSUlf Lilleengen static int
gv_raid5_request(struct gv_plex * p,struct gv_raid5_packet * wp,struct bio * bp,caddr_t addr,off_t boff,off_t bcount,int * delay)342c0b9797aSUlf Lilleengen gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
343c0b9797aSUlf Lilleengen struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
34473679edcSLukas Ertl {
34573679edcSLukas Ertl struct gv_sd *broken, *original, *parity, *s;
34667e3ab6eSLukas Ertl struct gv_bioq *bq;
347c0b9797aSUlf Lilleengen struct bio *cbp;
348c0b9797aSUlf Lilleengen int i, psdno, sdno, type, grow;
349fb4e65d0SLukas Ertl off_t real_len, real_off;
35073679edcSLukas Ertl
35173679edcSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks))
35273679edcSLukas Ertl return (ENXIO);
35373679edcSLukas Ertl
35473679edcSLukas Ertl /* We are optimistic and assume that this request will be OK. */
35567e3ab6eSLukas Ertl #define REQ_TYPE_NORMAL 0
35667e3ab6eSLukas Ertl #define REQ_TYPE_DEGRADED 1
35767e3ab6eSLukas Ertl #define REQ_TYPE_NOPARITY 2
35867e3ab6eSLukas Ertl
35967e3ab6eSLukas Ertl type = REQ_TYPE_NORMAL;
36073679edcSLukas Ertl original = parity = broken = NULL;
36173679edcSLukas Ertl
362c0b9797aSUlf Lilleengen /* XXX: The resize won't crash with rebuild or sync, but we should still
363c0b9797aSUlf Lilleengen * be aware of it. Also this should perhaps be done on rebuild/check as
364c0b9797aSUlf Lilleengen * well?
365c0b9797aSUlf Lilleengen */
366c0b9797aSUlf Lilleengen /* If we're over, we must use the old. */
367c0b9797aSUlf Lilleengen if (boff >= p->synced) {
368c0b9797aSUlf Lilleengen grow = 1;
369c0b9797aSUlf Lilleengen /* Or if over the resized offset, we use all drives. */
370c0b9797aSUlf Lilleengen } else if (boff + bcount <= p->synced) {
371c0b9797aSUlf Lilleengen grow = 0;
372c0b9797aSUlf Lilleengen /* Else, we're in the middle, and must wait a bit. */
373c0b9797aSUlf Lilleengen } else {
374c0b9797aSUlf Lilleengen bioq_disksort(p->rqueue, bp);
375c0b9797aSUlf Lilleengen *delay = 1;
376c0b9797aSUlf Lilleengen return (0);
377c0b9797aSUlf Lilleengen }
378c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
379c0b9797aSUlf Lilleengen &sdno, &psdno, grow);
38073679edcSLukas Ertl
38173679edcSLukas Ertl /* Find the right subdisks. */
38273679edcSLukas Ertl i = 0;
38373679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
38473679edcSLukas Ertl if (i == sdno)
38573679edcSLukas Ertl original = s;
38673679edcSLukas Ertl if (i == psdno)
38773679edcSLukas Ertl parity = s;
38873679edcSLukas Ertl if (s->state != GV_SD_UP)
38973679edcSLukas Ertl broken = s;
39073679edcSLukas Ertl i++;
39173679edcSLukas Ertl }
39273679edcSLukas Ertl
39373679edcSLukas Ertl if ((original == NULL) || (parity == NULL))
39473679edcSLukas Ertl return (ENXIO);
39573679edcSLukas Ertl
39673679edcSLukas Ertl /* Our data stripe is missing. */
39773679edcSLukas Ertl if (original->state != GV_SD_UP)
39867e3ab6eSLukas Ertl type = REQ_TYPE_DEGRADED;
399c0b9797aSUlf Lilleengen
400c0b9797aSUlf Lilleengen /* If synchronizing request, just write it if disks are stale. */
401c0b9797aSUlf Lilleengen if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
402d8d015cdSUlf Lilleengen bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
403c0b9797aSUlf Lilleengen type = REQ_TYPE_NORMAL;
40473679edcSLukas Ertl /* Our parity stripe is missing. */
405c0b9797aSUlf Lilleengen } else if (parity->state != GV_SD_UP) {
40673679edcSLukas Ertl /* We cannot take another failure if we're already degraded. */
40767e3ab6eSLukas Ertl if (type != REQ_TYPE_NORMAL)
40873679edcSLukas Ertl return (ENXIO);
40973679edcSLukas Ertl else
41067e3ab6eSLukas Ertl type = REQ_TYPE_NOPARITY;
41173679edcSLukas Ertl }
41273679edcSLukas Ertl
41367e3ab6eSLukas Ertl wp->length = real_len;
41473679edcSLukas Ertl wp->data = addr;
41567e3ab6eSLukas Ertl wp->lockbase = real_off;
41673679edcSLukas Ertl
41773679edcSLukas Ertl KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
41873679edcSLukas Ertl
419c0b9797aSUlf Lilleengen if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
420c3aadfb9SLukas Ertl type = REQ_TYPE_NORMAL;
421c3aadfb9SLukas Ertl
422c0b9797aSUlf Lilleengen if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
423c0b9797aSUlf Lilleengen bioq_disksort(p->rqueue, bp);
424c0b9797aSUlf Lilleengen *delay = 1;
425c0b9797aSUlf Lilleengen return (0);
426c0b9797aSUlf Lilleengen }
427c0b9797aSUlf Lilleengen
42873679edcSLukas Ertl switch (bp->bio_cmd) {
42973679edcSLukas Ertl case BIO_READ:
43073679edcSLukas Ertl /*
43173679edcSLukas Ertl * For a degraded read we need to read in all stripes except
43273679edcSLukas Ertl * the broken one plus the parity stripe and then recalculate
43373679edcSLukas Ertl * the desired data.
43473679edcSLukas Ertl */
43567e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) {
43667e3ab6eSLukas Ertl bzero(wp->data, wp->length);
43773679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
43873679edcSLukas Ertl /* Skip the broken subdisk. */
43973679edcSLukas Ertl if (s == broken)
44073679edcSLukas Ertl continue;
441c0b9797aSUlf Lilleengen /* Skip growing if within offset. */
442c0b9797aSUlf Lilleengen if (grow && s->flags & GV_SD_GROW)
443c0b9797aSUlf Lilleengen continue;
444c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
44567e3ab6eSLukas Ertl if (cbp == NULL)
44673679edcSLukas Ertl return (ENOMEM);
44767e3ab6eSLukas Ertl
448c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
44967e3ab6eSLukas Ertl
45067e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
45167e3ab6eSLukas Ertl bq->bp = cbp;
45267e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
45373679edcSLukas Ertl }
45473679edcSLukas Ertl
45573679edcSLukas Ertl /* A normal read can be fulfilled with the original subdisk. */
45673679edcSLukas Ertl } else {
457c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
45867e3ab6eSLukas Ertl if (cbp == NULL)
45973679edcSLukas Ertl return (ENOMEM);
46067e3ab6eSLukas Ertl
461c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
46273679edcSLukas Ertl }
46373679edcSLukas Ertl wp->lockbase = -1;
46467e3ab6eSLukas Ertl
46573679edcSLukas Ertl break;
46673679edcSLukas Ertl
46773679edcSLukas Ertl case BIO_WRITE:
46873679edcSLukas Ertl /*
46973679edcSLukas Ertl * A degraded write means we cannot write to the original data
47073679edcSLukas Ertl * subdisk. Thus we need to read in all valid stripes,
47173679edcSLukas Ertl * recalculate the parity from the original data, and then
47273679edcSLukas Ertl * write the parity stripe back out.
47373679edcSLukas Ertl */
47467e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) {
47567e3ab6eSLukas Ertl /* Read all subdisks. */
47673679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) {
47773679edcSLukas Ertl /* Skip the broken and the parity subdisk. */
47867e3ab6eSLukas Ertl if ((s == broken) || (s == parity))
47973679edcSLukas Ertl continue;
480c0b9797aSUlf Lilleengen /* Skip growing if within offset. */
481c0b9797aSUlf Lilleengen if (grow && s->flags & GV_SD_GROW)
482c0b9797aSUlf Lilleengen continue;
48373679edcSLukas Ertl
484c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
48567e3ab6eSLukas Ertl if (cbp == NULL)
48673679edcSLukas Ertl return (ENOMEM);
48767e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ;
48867e3ab6eSLukas Ertl
489c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
49067e3ab6eSLukas Ertl
49167e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
49267e3ab6eSLukas Ertl bq->bp = cbp;
49367e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
49473679edcSLukas Ertl }
49573679edcSLukas Ertl
49667e3ab6eSLukas Ertl /* Write the parity data. */
497c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
49867e3ab6eSLukas Ertl if (cbp == NULL)
49973679edcSLukas Ertl return (ENOMEM);
500c0b9797aSUlf Lilleengen bcopy(addr, cbp->bio_data, wp->length);
50167e3ab6eSLukas Ertl wp->parity = cbp;
50273679edcSLukas Ertl
50373679edcSLukas Ertl /*
50467e3ab6eSLukas Ertl * When the parity stripe is missing we just write out the data.
50573679edcSLukas Ertl */
50667e3ab6eSLukas Ertl } else if (type == REQ_TYPE_NOPARITY) {
507c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
50867e3ab6eSLukas Ertl if (cbp == NULL)
509291cb0acSLukas Ertl return (ENOMEM);
51073679edcSLukas Ertl
511c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
51273679edcSLukas Ertl
51367e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
51467e3ab6eSLukas Ertl bq->bp = cbp;
51567e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
51673679edcSLukas Ertl
51773679edcSLukas Ertl /*
51873679edcSLukas Ertl * A normal write request goes to the original subdisk, then we
51973679edcSLukas Ertl * read in all other stripes, recalculate the parity and write
52073679edcSLukas Ertl * out the parity again.
52173679edcSLukas Ertl */
52273679edcSLukas Ertl } else {
52367e3ab6eSLukas Ertl /* Read old parity. */
524c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
52567e3ab6eSLukas Ertl if (cbp == NULL)
526291cb0acSLukas Ertl return (ENOMEM);
52767e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ;
52873679edcSLukas Ertl
529c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
53067e3ab6eSLukas Ertl
53167e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
53267e3ab6eSLukas Ertl bq->bp = cbp;
53367e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
53467e3ab6eSLukas Ertl
53567e3ab6eSLukas Ertl /* Read old data. */
536c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
53767e3ab6eSLukas Ertl if (cbp == NULL)
53873679edcSLukas Ertl return (ENOMEM);
53967e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ;
54067e3ab6eSLukas Ertl
541c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp);
54267e3ab6eSLukas Ertl
54367e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
54467e3ab6eSLukas Ertl bq->bp = cbp;
54567e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
54667e3ab6eSLukas Ertl
54767e3ab6eSLukas Ertl /* Write new data. */
548c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
54967e3ab6eSLukas Ertl if (cbp == NULL)
55067e3ab6eSLukas Ertl return (ENOMEM);
55167e3ab6eSLukas Ertl
55273679edcSLukas Ertl /*
55367e3ab6eSLukas Ertl * We must not write the new data until the old data
55467e3ab6eSLukas Ertl * was read, so hold this BIO back until we're ready
55567e3ab6eSLukas Ertl * for it.
55673679edcSLukas Ertl */
55767e3ab6eSLukas Ertl wp->waiting = cbp;
55867e3ab6eSLukas Ertl
55967e3ab6eSLukas Ertl /* The final bio for the parity. */
560c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
56167e3ab6eSLukas Ertl if (cbp == NULL)
562291cb0acSLukas Ertl return (ENOMEM);
56367e3ab6eSLukas Ertl
56467e3ab6eSLukas Ertl /* Remember that this is the BIO for the parity data. */
56567e3ab6eSLukas Ertl wp->parity = cbp;
56673679edcSLukas Ertl }
56773679edcSLukas Ertl break;
56867e3ab6eSLukas Ertl
56973679edcSLukas Ertl default:
57073679edcSLukas Ertl return (EINVAL);
57173679edcSLukas Ertl }
57273679edcSLukas Ertl
57373679edcSLukas Ertl return (0);
57473679edcSLukas Ertl }
575fb4e65d0SLukas Ertl
576c0b9797aSUlf Lilleengen /*
577c0b9797aSUlf Lilleengen * Calculate the offsets in the various subdisks for a RAID5 request. Also take
578c0b9797aSUlf Lilleengen * care of new subdisks in an expanded RAID5 array.
579c0b9797aSUlf Lilleengen * XXX: This assumes that the new subdisks are inserted after the others (which
580c0b9797aSUlf Lilleengen * is okay as long as plex_offset is larger). If subdisks are inserted into the
581c0b9797aSUlf Lilleengen * plexlist before, we get problems.
582c0b9797aSUlf Lilleengen */
583c0b9797aSUlf Lilleengen static int
gv_raid5_offset(struct gv_plex * p,off_t boff,off_t bcount,off_t * real_off,off_t * real_len,int * sdno,int * psdno,int growing)584fb4e65d0SLukas Ertl gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
585c0b9797aSUlf Lilleengen off_t *real_len, int *sdno, int *psdno, int growing)
586fb4e65d0SLukas Ertl {
587c0b9797aSUlf Lilleengen struct gv_sd *s;
588c0b9797aSUlf Lilleengen int sd, psd, sdcount;
589fb4e65d0SLukas Ertl off_t len_left, stripeend, stripeoff, stripestart;
590fb4e65d0SLukas Ertl
591c0b9797aSUlf Lilleengen sdcount = p->sdcount;
592c0b9797aSUlf Lilleengen if (growing) {
593c0b9797aSUlf Lilleengen LIST_FOREACH(s, &p->subdisks, in_plex) {
594c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW)
595c0b9797aSUlf Lilleengen sdcount--;
596c0b9797aSUlf Lilleengen }
597c0b9797aSUlf Lilleengen }
598c0b9797aSUlf Lilleengen
599fb4e65d0SLukas Ertl /* The number of the subdisk containing the parity stripe. */
600c0b9797aSUlf Lilleengen psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
601c0b9797aSUlf Lilleengen sdcount;
60287bb53cbSEd Maste KASSERT(psd >= 0, ("gv_raid5_offset: psdno < 0"));
603fb4e65d0SLukas Ertl
604fb4e65d0SLukas Ertl /* Offset of the start address from the start of the stripe. */
605c0b9797aSUlf Lilleengen stripeoff = boff % (p->stripesize * (sdcount - 1));
606fb4e65d0SLukas Ertl KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
607fb4e65d0SLukas Ertl
608fb4e65d0SLukas Ertl /* The number of the subdisk where the stripe resides. */
609fb4e65d0SLukas Ertl sd = stripeoff / p->stripesize;
61087bb53cbSEd Maste KASSERT(sd >= 0, ("gv_raid5_offset: sdno < 0"));
611fb4e65d0SLukas Ertl
612fb4e65d0SLukas Ertl /* At or past parity subdisk. */
613fb4e65d0SLukas Ertl if (sd >= psd)
614fb4e65d0SLukas Ertl sd++;
615fb4e65d0SLukas Ertl
616fb4e65d0SLukas Ertl /* The offset of the stripe on this subdisk. */
617c0b9797aSUlf Lilleengen stripestart = (boff - stripeoff) / (sdcount - 1);
618fb4e65d0SLukas Ertl KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
619fb4e65d0SLukas Ertl
620fb4e65d0SLukas Ertl stripeoff %= p->stripesize;
621fb4e65d0SLukas Ertl
622fb4e65d0SLukas Ertl /* The offset of the request on this subdisk. */
623fb4e65d0SLukas Ertl *real_off = stripestart + stripeoff;
624fb4e65d0SLukas Ertl
625fb4e65d0SLukas Ertl stripeend = stripestart + p->stripesize;
626fb4e65d0SLukas Ertl len_left = stripeend - *real_off;
627fb4e65d0SLukas Ertl KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
628fb4e65d0SLukas Ertl
629fb4e65d0SLukas Ertl *real_len = (bcount <= len_left) ? bcount : len_left;
630fb4e65d0SLukas Ertl
631fb4e65d0SLukas Ertl if (sdno != NULL)
632fb4e65d0SLukas Ertl *sdno = sd;
633fb4e65d0SLukas Ertl if (psdno != NULL)
634fb4e65d0SLukas Ertl *psdno = psd;
635fb4e65d0SLukas Ertl
636fb4e65d0SLukas Ertl return (0);
637fb4e65d0SLukas Ertl }
638c0b9797aSUlf Lilleengen
639c0b9797aSUlf Lilleengen static struct bio *
gv_raid5_clone_bio(struct bio * bp,struct gv_sd * s,struct gv_raid5_packet * wp,caddr_t addr,int use_wp)640c0b9797aSUlf Lilleengen gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
641c0b9797aSUlf Lilleengen caddr_t addr, int use_wp)
642c0b9797aSUlf Lilleengen {
643c0b9797aSUlf Lilleengen struct bio *cbp;
644c0b9797aSUlf Lilleengen
645c0b9797aSUlf Lilleengen cbp = g_clone_bio(bp);
646c0b9797aSUlf Lilleengen if (cbp == NULL)
647c0b9797aSUlf Lilleengen return (NULL);
648c0b9797aSUlf Lilleengen if (addr == NULL) {
649c0b9797aSUlf Lilleengen cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
650c0b9797aSUlf Lilleengen cbp->bio_cflags |= GV_BIO_MALLOC;
651c0b9797aSUlf Lilleengen } else
652c0b9797aSUlf Lilleengen cbp->bio_data = addr;
653c0b9797aSUlf Lilleengen cbp->bio_offset = wp->lockbase + s->drive_offset;
654c0b9797aSUlf Lilleengen cbp->bio_length = wp->length;
655c0b9797aSUlf Lilleengen cbp->bio_done = gv_done;
656c0b9797aSUlf Lilleengen cbp->bio_caller1 = s;
657a29df733SAlexander Motin s->drive_sc->active++;
658c0b9797aSUlf Lilleengen if (use_wp)
659c0b9797aSUlf Lilleengen cbp->bio_caller2 = wp;
660c0b9797aSUlf Lilleengen
661c0b9797aSUlf Lilleengen return (cbp);
662c0b9797aSUlf Lilleengen }
663