173679edcSLukas Ertl /*- 23728855aSPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 33728855aSPedro F. Giffuni * 4c0b9797aSUlf Lilleengen * Copyright (c) 2004, 2007 Lukas Ertl 573679edcSLukas Ertl * All rights reserved. 673679edcSLukas Ertl * 773679edcSLukas Ertl * Redistribution and use in source and binary forms, with or without 873679edcSLukas Ertl * modification, are permitted provided that the following conditions 973679edcSLukas Ertl * are met: 1073679edcSLukas Ertl * 1. Redistributions of source code must retain the above copyright 1173679edcSLukas Ertl * notice, this list of conditions and the following disclaimer. 1273679edcSLukas Ertl * 2. Redistributions in binary form must reproduce the above copyright 1373679edcSLukas Ertl * notice, this list of conditions and the following disclaimer in the 1473679edcSLukas Ertl * documentation and/or other materials provided with the distribution. 1573679edcSLukas Ertl * 1673679edcSLukas Ertl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1773679edcSLukas Ertl * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1873679edcSLukas Ertl * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1973679edcSLukas Ertl * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 2073679edcSLukas Ertl * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2173679edcSLukas Ertl * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2273679edcSLukas Ertl * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2373679edcSLukas Ertl * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2473679edcSLukas Ertl * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2573679edcSLukas Ertl * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2673679edcSLukas Ertl * SUCH DAMAGE. 2773679edcSLukas Ertl */ 2873679edcSLukas Ertl 2973679edcSLukas Ertl #include <sys/cdefs.h> 3073679edcSLukas Ertl __FBSDID("$FreeBSD$"); 3173679edcSLukas Ertl 3273679edcSLukas Ertl #include <sys/param.h> 3373679edcSLukas Ertl #include <sys/bio.h> 3473679edcSLukas Ertl #include <sys/lock.h> 3573679edcSLukas Ertl #include <sys/malloc.h> 3673679edcSLukas Ertl #include <sys/systm.h> 3773679edcSLukas Ertl 3873679edcSLukas Ertl #include <geom/geom.h> 39ac03832eSConrad Meyer #include <geom/geom_dbg.h> 4073679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h> 4173679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h> 4273679edcSLukas Ertl #include <geom/vinum/geom_vinum.h> 4373679edcSLukas Ertl 44c0b9797aSUlf Lilleengen static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 45c0b9797aSUlf Lilleengen off_t *, off_t *, int *, int *, int); 46c0b9797aSUlf Lilleengen static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 47c0b9797aSUlf Lilleengen struct gv_raid5_packet *, caddr_t, int); 48c0b9797aSUlf Lilleengen static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 49c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t, int *); 50c0b9797aSUlf Lilleengen static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 51c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t); 52c0b9797aSUlf Lilleengen static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 53c0b9797aSUlf Lilleengen struct bio *, caddr_t, off_t, off_t); 54c0b9797aSUlf Lilleengen 55c0b9797aSUlf Lilleengen struct gv_raid5_packet * 56c0b9797aSUlf Lilleengen gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 57c0b9797aSUlf Lilleengen off_t bcount) 58c0b9797aSUlf Lilleengen { 59c0b9797aSUlf Lilleengen struct bio *cbp; 60c0b9797aSUlf Lilleengen struct gv_raid5_packet *wp, *wp2; 61c0b9797aSUlf Lilleengen struct gv_bioq *bq, *bq2; 62c0b9797aSUlf Lilleengen int err, delay; 63c0b9797aSUlf Lilleengen 64c0b9797aSUlf Lilleengen delay = 0; 65c0b9797aSUlf Lilleengen wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 66c0b9797aSUlf Lilleengen wp->bio = bp; 67c0b9797aSUlf Lilleengen wp->waiting = NULL; 68c0b9797aSUlf Lilleengen wp->parity = NULL; 69c0b9797aSUlf Lilleengen TAILQ_INIT(&wp->bits); 70c0b9797aSUlf Lilleengen 71d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_REBUILD) 72c0b9797aSUlf Lilleengen err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 73d8d015cdSUlf Lilleengen else if (bp->bio_pflags & GV_BIO_CHECK) 74c0b9797aSUlf Lilleengen err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 75c0b9797aSUlf Lilleengen else 76c0b9797aSUlf Lilleengen err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 77c0b9797aSUlf Lilleengen 78c0b9797aSUlf Lilleengen /* Means we have a delayed request. */ 79c0b9797aSUlf Lilleengen if (delay) { 80c0b9797aSUlf Lilleengen g_free(wp); 81c0b9797aSUlf Lilleengen return (NULL); 82c0b9797aSUlf Lilleengen } 83c0b9797aSUlf Lilleengen 84c0b9797aSUlf Lilleengen /* 85c0b9797aSUlf Lilleengen * Building the sub-request failed, we probably need to clean up a lot. 86c0b9797aSUlf Lilleengen */ 87c0b9797aSUlf Lilleengen if (err) { 88c0b9797aSUlf Lilleengen G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 89c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 90c0b9797aSUlf Lilleengen TAILQ_REMOVE(&wp->bits, bq, queue); 91c0b9797aSUlf Lilleengen g_free(bq); 92c0b9797aSUlf Lilleengen } 93c0b9797aSUlf Lilleengen if (wp->waiting != NULL) { 94c0b9797aSUlf Lilleengen if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 95c0b9797aSUlf Lilleengen g_free(wp->waiting->bio_data); 96c0b9797aSUlf Lilleengen g_destroy_bio(wp->waiting); 97c0b9797aSUlf Lilleengen } 98c0b9797aSUlf Lilleengen if (wp->parity != NULL) { 99c0b9797aSUlf Lilleengen if (wp->parity->bio_cflags & GV_BIO_MALLOC) 100c0b9797aSUlf Lilleengen g_free(wp->parity->bio_data); 101c0b9797aSUlf Lilleengen g_destroy_bio(wp->parity); 102c0b9797aSUlf Lilleengen } 103c0b9797aSUlf Lilleengen g_free(wp); 104c0b9797aSUlf Lilleengen 105c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 106c0b9797aSUlf Lilleengen if (wp->bio != bp) 107c0b9797aSUlf Lilleengen continue; 108c0b9797aSUlf Lilleengen 109c0b9797aSUlf Lilleengen TAILQ_REMOVE(&p->packets, wp, list); 110c0b9797aSUlf Lilleengen TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 111c0b9797aSUlf Lilleengen TAILQ_REMOVE(&wp->bits, bq, queue); 112c0b9797aSUlf Lilleengen g_free(bq); 113c0b9797aSUlf Lilleengen } 114c0b9797aSUlf Lilleengen g_free(wp); 115c0b9797aSUlf Lilleengen } 116c0b9797aSUlf Lilleengen 117c0b9797aSUlf Lilleengen cbp = bioq_takefirst(p->bqueue); 118c0b9797aSUlf Lilleengen while (cbp != NULL) { 119c0b9797aSUlf Lilleengen if (cbp->bio_cflags & GV_BIO_MALLOC) 120c0b9797aSUlf Lilleengen g_free(cbp->bio_data); 121c0b9797aSUlf Lilleengen g_destroy_bio(cbp); 122c0b9797aSUlf Lilleengen cbp = bioq_takefirst(p->bqueue); 123c0b9797aSUlf Lilleengen } 124c0b9797aSUlf Lilleengen 125c0b9797aSUlf Lilleengen /* If internal, stop and reset state. */ 126d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_INTERNAL) { 127d8d015cdSUlf Lilleengen if (bp->bio_pflags & GV_BIO_MALLOC) 1281d8dfc60SUlf Lilleengen g_free(bp->bio_data); 129c0b9797aSUlf Lilleengen g_destroy_bio(bp); 130c0b9797aSUlf Lilleengen /* Reset flags. */ 131c0b9797aSUlf Lilleengen p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 132c0b9797aSUlf Lilleengen GV_PLEX_GROWING); 133c0b9797aSUlf Lilleengen return (NULL); 134c0b9797aSUlf Lilleengen } 135c0b9797aSUlf Lilleengen g_io_deliver(bp, err); 136c0b9797aSUlf Lilleengen return (NULL); 137c0b9797aSUlf Lilleengen } 138c0b9797aSUlf Lilleengen 139c0b9797aSUlf Lilleengen return (wp); 140c0b9797aSUlf Lilleengen } 141fb4e65d0SLukas Ertl 14273679edcSLukas Ertl /* 14373679edcSLukas Ertl * Check if the stripe that the work packet wants is already being used by 14473679edcSLukas Ertl * some other work packet. 14573679edcSLukas Ertl */ 14673679edcSLukas Ertl int 14767e3ab6eSLukas Ertl gv_stripe_active(struct gv_plex *p, struct bio *bp) 14873679edcSLukas Ertl { 14967e3ab6eSLukas Ertl struct gv_raid5_packet *wp, *owp; 15067e3ab6eSLukas Ertl int overlap; 15173679edcSLukas Ertl 152c0b9797aSUlf Lilleengen wp = bp->bio_caller2; 15367e3ab6eSLukas Ertl if (wp->lockbase == -1) 15473679edcSLukas Ertl return (0); 15573679edcSLukas Ertl 15667e3ab6eSLukas Ertl overlap = 0; 15767e3ab6eSLukas Ertl TAILQ_FOREACH(owp, &p->packets, list) { 15867e3ab6eSLukas Ertl if (owp == wp) 15973679edcSLukas Ertl break; 16067e3ab6eSLukas Ertl if ((wp->lockbase >= owp->lockbase) && 16167e3ab6eSLukas Ertl (wp->lockbase <= owp->lockbase + owp->length)) { 16267e3ab6eSLukas Ertl overlap++; 16373679edcSLukas Ertl break; 16473679edcSLukas Ertl } 16567e3ab6eSLukas Ertl if ((wp->lockbase <= owp->lockbase) && 16667e3ab6eSLukas Ertl (wp->lockbase + wp->length >= owp->lockbase)) { 16767e3ab6eSLukas Ertl overlap++; 16867e3ab6eSLukas Ertl break; 16967e3ab6eSLukas Ertl } 17067e3ab6eSLukas Ertl } 17173679edcSLukas Ertl 17267e3ab6eSLukas Ertl return (overlap); 17373679edcSLukas Ertl } 17473679edcSLukas Ertl 175c0b9797aSUlf Lilleengen static int 176c0b9797aSUlf Lilleengen gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 177fb5885afSLukas Ertl caddr_t addr, off_t boff, off_t bcount) 178fb5885afSLukas Ertl { 179fb5885afSLukas Ertl struct gv_sd *parity, *s; 180fb5885afSLukas Ertl struct gv_bioq *bq; 181c0b9797aSUlf Lilleengen struct bio *cbp; 182fb5885afSLukas Ertl int i, psdno; 183fb5885afSLukas Ertl off_t real_len, real_off; 184fb5885afSLukas Ertl 185fb5885afSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 186fb5885afSLukas Ertl return (ENXIO); 187fb5885afSLukas Ertl 188c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 189fb5885afSLukas Ertl 190fb5885afSLukas Ertl /* Find the right subdisk. */ 191fb5885afSLukas Ertl parity = NULL; 192fb5885afSLukas Ertl i = 0; 193fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 194fb5885afSLukas Ertl if (i == psdno) { 195fb5885afSLukas Ertl parity = s; 196fb5885afSLukas Ertl break; 197fb5885afSLukas Ertl } 198fb5885afSLukas Ertl i++; 199fb5885afSLukas Ertl } 200fb5885afSLukas Ertl 201fb5885afSLukas Ertl /* Parity stripe not found. */ 202fb5885afSLukas Ertl if (parity == NULL) 203fb5885afSLukas Ertl return (ENXIO); 204fb5885afSLukas Ertl 205fb5885afSLukas Ertl if (parity->state != GV_SD_UP) 206fb5885afSLukas Ertl return (ENXIO); 207fb5885afSLukas Ertl 208fb5885afSLukas Ertl wp->length = real_len; 209fb5885afSLukas Ertl wp->data = addr; 210fb5885afSLukas Ertl wp->lockbase = real_off; 211fb5885afSLukas Ertl 212fb5885afSLukas Ertl /* Read all subdisks. */ 213fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 214fb5885afSLukas Ertl /* Skip the parity subdisk. */ 215fb5885afSLukas Ertl if (s == parity) 216fb5885afSLukas Ertl continue; 217c0b9797aSUlf Lilleengen /* Skip growing subdisks. */ 218c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW) 219c0b9797aSUlf Lilleengen continue; 220fb5885afSLukas Ertl 221c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 222fb5885afSLukas Ertl if (cbp == NULL) 223fb5885afSLukas Ertl return (ENOMEM); 224fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ; 225fb5885afSLukas Ertl 226c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 227fb5885afSLukas Ertl 228fb5885afSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 229fb5885afSLukas Ertl bq->bp = cbp; 230fb5885afSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 231fb5885afSLukas Ertl } 232fb5885afSLukas Ertl 233fb5885afSLukas Ertl /* Read the parity data. */ 234c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 235fb5885afSLukas Ertl if (cbp == NULL) 236fb5885afSLukas Ertl return (ENOMEM); 237fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ; 238fb5885afSLukas Ertl wp->waiting = cbp; 239fb5885afSLukas Ertl 240fb5885afSLukas Ertl /* 241fb5885afSLukas Ertl * In case we want to rebuild the parity, create an extra BIO to write 242fb5885afSLukas Ertl * it out. It also acts as buffer for the XOR operations. 243fb5885afSLukas Ertl */ 244c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 245fb5885afSLukas Ertl if (cbp == NULL) 246fb5885afSLukas Ertl return (ENOMEM); 247fb5885afSLukas Ertl wp->parity = cbp; 248fb5885afSLukas Ertl 249fb5885afSLukas Ertl return (0); 250fb5885afSLukas Ertl } 251fb5885afSLukas Ertl 252fb5885afSLukas Ertl /* Rebuild a degraded RAID5 plex. */ 253c0b9797aSUlf Lilleengen static int 254c0b9797aSUlf Lilleengen gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 255c3aadfb9SLukas Ertl caddr_t addr, off_t boff, off_t bcount) 256c3aadfb9SLukas Ertl { 257c3aadfb9SLukas Ertl struct gv_sd *broken, *s; 258c3aadfb9SLukas Ertl struct gv_bioq *bq; 259c0b9797aSUlf Lilleengen struct bio *cbp; 260fb4e65d0SLukas Ertl off_t real_len, real_off; 261c3aadfb9SLukas Ertl 262c3aadfb9SLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 263c3aadfb9SLukas Ertl return (ENXIO); 264c3aadfb9SLukas Ertl 265c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 266c3aadfb9SLukas Ertl 267c3aadfb9SLukas Ertl /* Find the right subdisk. */ 268c3aadfb9SLukas Ertl broken = NULL; 269c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 270c3aadfb9SLukas Ertl if (s->state != GV_SD_UP) 271c3aadfb9SLukas Ertl broken = s; 272c3aadfb9SLukas Ertl } 273c3aadfb9SLukas Ertl 274fb5885afSLukas Ertl /* Broken stripe not found. */ 275c3aadfb9SLukas Ertl if (broken == NULL) 276c3aadfb9SLukas Ertl return (ENXIO); 277c3aadfb9SLukas Ertl 278c3aadfb9SLukas Ertl switch (broken->state) { 279c3aadfb9SLukas Ertl case GV_SD_UP: 280c3aadfb9SLukas Ertl return (EINVAL); 281c3aadfb9SLukas Ertl 282c3aadfb9SLukas Ertl case GV_SD_STALE: 283d8d015cdSUlf Lilleengen if (!(bp->bio_pflags & GV_BIO_REBUILD)) 284c3aadfb9SLukas Ertl return (ENXIO); 285c3aadfb9SLukas Ertl 28686b3c6f5SUlf Lilleengen G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 287c3aadfb9SLukas Ertl gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 288c0b9797aSUlf Lilleengen /* Set this bit now, but should be set at end. */ 289c0b9797aSUlf Lilleengen broken->flags |= GV_SD_CANGOUP; 290c3aadfb9SLukas Ertl break; 291c3aadfb9SLukas Ertl 292c3aadfb9SLukas Ertl case GV_SD_REVIVING: 293c3aadfb9SLukas Ertl break; 294c3aadfb9SLukas Ertl 295c3aadfb9SLukas Ertl default: 296c3aadfb9SLukas Ertl /* All other subdisk states mean it's not accessible. */ 297c3aadfb9SLukas Ertl return (ENXIO); 298c3aadfb9SLukas Ertl } 299c3aadfb9SLukas Ertl 300c3aadfb9SLukas Ertl wp->length = real_len; 301c3aadfb9SLukas Ertl wp->data = addr; 302c3aadfb9SLukas Ertl wp->lockbase = real_off; 303c3aadfb9SLukas Ertl 304fb4e65d0SLukas Ertl KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 305c3aadfb9SLukas Ertl 306c3aadfb9SLukas Ertl /* Read all subdisks. */ 307c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 308c3aadfb9SLukas Ertl /* Skip the broken subdisk. */ 309c3aadfb9SLukas Ertl if (s == broken) 310c3aadfb9SLukas Ertl continue; 311c3aadfb9SLukas Ertl 312c0b9797aSUlf Lilleengen /* Skip growing subdisks. */ 313c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW) 314c0b9797aSUlf Lilleengen continue; 315c0b9797aSUlf Lilleengen 316c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 317c3aadfb9SLukas Ertl if (cbp == NULL) 318c3aadfb9SLukas Ertl return (ENOMEM); 319c3aadfb9SLukas Ertl cbp->bio_cmd = BIO_READ; 320c3aadfb9SLukas Ertl 321c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 322c3aadfb9SLukas Ertl 323c3aadfb9SLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 324c3aadfb9SLukas Ertl bq->bp = cbp; 325c3aadfb9SLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 326c3aadfb9SLukas Ertl } 327c3aadfb9SLukas Ertl 328c3aadfb9SLukas Ertl /* Write the parity data. */ 329c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 330c3aadfb9SLukas Ertl if (cbp == NULL) 331c3aadfb9SLukas Ertl return (ENOMEM); 332c3aadfb9SLukas Ertl wp->parity = cbp; 333c3aadfb9SLukas Ertl 334c3aadfb9SLukas Ertl p->synced = boff; 335c3aadfb9SLukas Ertl 336c0b9797aSUlf Lilleengen /* Post notification that we're finished. */ 337c3aadfb9SLukas Ertl return (0); 338c3aadfb9SLukas Ertl } 339c3aadfb9SLukas Ertl 34073679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */ 341c0b9797aSUlf Lilleengen static int 342c0b9797aSUlf Lilleengen gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 343c0b9797aSUlf Lilleengen struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 34473679edcSLukas Ertl { 34573679edcSLukas Ertl struct g_geom *gp; 34673679edcSLukas Ertl struct gv_sd *broken, *original, *parity, *s; 34767e3ab6eSLukas Ertl struct gv_bioq *bq; 348c0b9797aSUlf Lilleengen struct bio *cbp; 349c0b9797aSUlf Lilleengen int i, psdno, sdno, type, grow; 350fb4e65d0SLukas Ertl off_t real_len, real_off; 35173679edcSLukas Ertl 35273679edcSLukas Ertl gp = bp->bio_to->geom; 35373679edcSLukas Ertl 35473679edcSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 35573679edcSLukas Ertl return (ENXIO); 35673679edcSLukas Ertl 35773679edcSLukas Ertl /* We are optimistic and assume that this request will be OK. */ 35867e3ab6eSLukas Ertl #define REQ_TYPE_NORMAL 0 35967e3ab6eSLukas Ertl #define REQ_TYPE_DEGRADED 1 36067e3ab6eSLukas Ertl #define REQ_TYPE_NOPARITY 2 36167e3ab6eSLukas Ertl 36267e3ab6eSLukas Ertl type = REQ_TYPE_NORMAL; 36373679edcSLukas Ertl original = parity = broken = NULL; 36473679edcSLukas Ertl 365c0b9797aSUlf Lilleengen /* XXX: The resize won't crash with rebuild or sync, but we should still 366c0b9797aSUlf Lilleengen * be aware of it. Also this should perhaps be done on rebuild/check as 367c0b9797aSUlf Lilleengen * well? 368c0b9797aSUlf Lilleengen */ 369c0b9797aSUlf Lilleengen /* If we're over, we must use the old. */ 370c0b9797aSUlf Lilleengen if (boff >= p->synced) { 371c0b9797aSUlf Lilleengen grow = 1; 372c0b9797aSUlf Lilleengen /* Or if over the resized offset, we use all drives. */ 373c0b9797aSUlf Lilleengen } else if (boff + bcount <= p->synced) { 374c0b9797aSUlf Lilleengen grow = 0; 375c0b9797aSUlf Lilleengen /* Else, we're in the middle, and must wait a bit. */ 376c0b9797aSUlf Lilleengen } else { 377c0b9797aSUlf Lilleengen bioq_disksort(p->rqueue, bp); 378c0b9797aSUlf Lilleengen *delay = 1; 379c0b9797aSUlf Lilleengen return (0); 380c0b9797aSUlf Lilleengen } 381c0b9797aSUlf Lilleengen gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 382c0b9797aSUlf Lilleengen &sdno, &psdno, grow); 38373679edcSLukas Ertl 38473679edcSLukas Ertl /* Find the right subdisks. */ 38573679edcSLukas Ertl i = 0; 38673679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 38773679edcSLukas Ertl if (i == sdno) 38873679edcSLukas Ertl original = s; 38973679edcSLukas Ertl if (i == psdno) 39073679edcSLukas Ertl parity = s; 39173679edcSLukas Ertl if (s->state != GV_SD_UP) 39273679edcSLukas Ertl broken = s; 39373679edcSLukas Ertl i++; 39473679edcSLukas Ertl } 39573679edcSLukas Ertl 39673679edcSLukas Ertl if ((original == NULL) || (parity == NULL)) 39773679edcSLukas Ertl return (ENXIO); 39873679edcSLukas Ertl 39973679edcSLukas Ertl /* Our data stripe is missing. */ 40073679edcSLukas Ertl if (original->state != GV_SD_UP) 40167e3ab6eSLukas Ertl type = REQ_TYPE_DEGRADED; 402c0b9797aSUlf Lilleengen 403c0b9797aSUlf Lilleengen /* If synchronizing request, just write it if disks are stale. */ 404c0b9797aSUlf Lilleengen if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 405d8d015cdSUlf Lilleengen bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 406c0b9797aSUlf Lilleengen type = REQ_TYPE_NORMAL; 40773679edcSLukas Ertl /* Our parity stripe is missing. */ 408c0b9797aSUlf Lilleengen } else if (parity->state != GV_SD_UP) { 40973679edcSLukas Ertl /* We cannot take another failure if we're already degraded. */ 41067e3ab6eSLukas Ertl if (type != REQ_TYPE_NORMAL) 41173679edcSLukas Ertl return (ENXIO); 41273679edcSLukas Ertl else 41367e3ab6eSLukas Ertl type = REQ_TYPE_NOPARITY; 41473679edcSLukas Ertl } 41573679edcSLukas Ertl 41667e3ab6eSLukas Ertl wp->length = real_len; 41773679edcSLukas Ertl wp->data = addr; 41867e3ab6eSLukas Ertl wp->lockbase = real_off; 41973679edcSLukas Ertl 42073679edcSLukas Ertl KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 42173679edcSLukas Ertl 422c0b9797aSUlf Lilleengen if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 423c3aadfb9SLukas Ertl type = REQ_TYPE_NORMAL; 424c3aadfb9SLukas Ertl 425c0b9797aSUlf Lilleengen if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 426c0b9797aSUlf Lilleengen bioq_disksort(p->rqueue, bp); 427c0b9797aSUlf Lilleengen *delay = 1; 428c0b9797aSUlf Lilleengen return (0); 429c0b9797aSUlf Lilleengen } 430c0b9797aSUlf Lilleengen 43173679edcSLukas Ertl switch (bp->bio_cmd) { 43273679edcSLukas Ertl case BIO_READ: 43373679edcSLukas Ertl /* 43473679edcSLukas Ertl * For a degraded read we need to read in all stripes except 43573679edcSLukas Ertl * the broken one plus the parity stripe and then recalculate 43673679edcSLukas Ertl * the desired data. 43773679edcSLukas Ertl */ 43867e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) { 43967e3ab6eSLukas Ertl bzero(wp->data, wp->length); 44073679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 44173679edcSLukas Ertl /* Skip the broken subdisk. */ 44273679edcSLukas Ertl if (s == broken) 44373679edcSLukas Ertl continue; 444c0b9797aSUlf Lilleengen /* Skip growing if within offset. */ 445c0b9797aSUlf Lilleengen if (grow && s->flags & GV_SD_GROW) 446c0b9797aSUlf Lilleengen continue; 447c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 44867e3ab6eSLukas Ertl if (cbp == NULL) 44973679edcSLukas Ertl return (ENOMEM); 45067e3ab6eSLukas Ertl 451c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 45267e3ab6eSLukas Ertl 45367e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 45467e3ab6eSLukas Ertl bq->bp = cbp; 45567e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 45673679edcSLukas Ertl } 45773679edcSLukas Ertl 45873679edcSLukas Ertl /* A normal read can be fulfilled with the original subdisk. */ 45973679edcSLukas Ertl } else { 460c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 46167e3ab6eSLukas Ertl if (cbp == NULL) 46273679edcSLukas Ertl return (ENOMEM); 46367e3ab6eSLukas Ertl 464c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 46573679edcSLukas Ertl } 46673679edcSLukas Ertl wp->lockbase = -1; 46767e3ab6eSLukas Ertl 46873679edcSLukas Ertl break; 46973679edcSLukas Ertl 47073679edcSLukas Ertl case BIO_WRITE: 47173679edcSLukas Ertl /* 47273679edcSLukas Ertl * A degraded write means we cannot write to the original data 47373679edcSLukas Ertl * subdisk. Thus we need to read in all valid stripes, 47473679edcSLukas Ertl * recalculate the parity from the original data, and then 47573679edcSLukas Ertl * write the parity stripe back out. 47673679edcSLukas Ertl */ 47767e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) { 47867e3ab6eSLukas Ertl /* Read all subdisks. */ 47973679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 48073679edcSLukas Ertl /* Skip the broken and the parity subdisk. */ 48167e3ab6eSLukas Ertl if ((s == broken) || (s == parity)) 48273679edcSLukas Ertl continue; 483c0b9797aSUlf Lilleengen /* Skip growing if within offset. */ 484c0b9797aSUlf Lilleengen if (grow && s->flags & GV_SD_GROW) 485c0b9797aSUlf Lilleengen continue; 48673679edcSLukas Ertl 487c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 48867e3ab6eSLukas Ertl if (cbp == NULL) 48973679edcSLukas Ertl return (ENOMEM); 49067e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 49167e3ab6eSLukas Ertl 492c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 49367e3ab6eSLukas Ertl 49467e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 49567e3ab6eSLukas Ertl bq->bp = cbp; 49667e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 49773679edcSLukas Ertl } 49873679edcSLukas Ertl 49967e3ab6eSLukas Ertl /* Write the parity data. */ 500c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 50167e3ab6eSLukas Ertl if (cbp == NULL) 50273679edcSLukas Ertl return (ENOMEM); 503c0b9797aSUlf Lilleengen bcopy(addr, cbp->bio_data, wp->length); 50467e3ab6eSLukas Ertl wp->parity = cbp; 50573679edcSLukas Ertl 50673679edcSLukas Ertl /* 50767e3ab6eSLukas Ertl * When the parity stripe is missing we just write out the data. 50873679edcSLukas Ertl */ 50967e3ab6eSLukas Ertl } else if (type == REQ_TYPE_NOPARITY) { 510c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 51167e3ab6eSLukas Ertl if (cbp == NULL) 512291cb0acSLukas Ertl return (ENOMEM); 51373679edcSLukas Ertl 514c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 51573679edcSLukas Ertl 51667e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 51767e3ab6eSLukas Ertl bq->bp = cbp; 51867e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 51973679edcSLukas Ertl 52073679edcSLukas Ertl /* 52173679edcSLukas Ertl * A normal write request goes to the original subdisk, then we 52273679edcSLukas Ertl * read in all other stripes, recalculate the parity and write 52373679edcSLukas Ertl * out the parity again. 52473679edcSLukas Ertl */ 52573679edcSLukas Ertl } else { 52667e3ab6eSLukas Ertl /* Read old parity. */ 527c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 52867e3ab6eSLukas Ertl if (cbp == NULL) 529291cb0acSLukas Ertl return (ENOMEM); 53067e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 53173679edcSLukas Ertl 532c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 53367e3ab6eSLukas Ertl 53467e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 53567e3ab6eSLukas Ertl bq->bp = cbp; 53667e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 53767e3ab6eSLukas Ertl 53867e3ab6eSLukas Ertl /* Read old data. */ 539c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 54067e3ab6eSLukas Ertl if (cbp == NULL) 54173679edcSLukas Ertl return (ENOMEM); 54267e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 54367e3ab6eSLukas Ertl 544c0b9797aSUlf Lilleengen bioq_insert_tail(p->bqueue, cbp); 54567e3ab6eSLukas Ertl 54667e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 54767e3ab6eSLukas Ertl bq->bp = cbp; 54867e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 54967e3ab6eSLukas Ertl 55067e3ab6eSLukas Ertl /* Write new data. */ 551c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 55267e3ab6eSLukas Ertl if (cbp == NULL) 55367e3ab6eSLukas Ertl return (ENOMEM); 55467e3ab6eSLukas Ertl 55573679edcSLukas Ertl /* 55667e3ab6eSLukas Ertl * We must not write the new data until the old data 55767e3ab6eSLukas Ertl * was read, so hold this BIO back until we're ready 55867e3ab6eSLukas Ertl * for it. 55973679edcSLukas Ertl */ 56067e3ab6eSLukas Ertl wp->waiting = cbp; 56167e3ab6eSLukas Ertl 56267e3ab6eSLukas Ertl /* The final bio for the parity. */ 563c0b9797aSUlf Lilleengen cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 56467e3ab6eSLukas Ertl if (cbp == NULL) 565291cb0acSLukas Ertl return (ENOMEM); 56667e3ab6eSLukas Ertl 56767e3ab6eSLukas Ertl /* Remember that this is the BIO for the parity data. */ 56867e3ab6eSLukas Ertl wp->parity = cbp; 56973679edcSLukas Ertl } 57073679edcSLukas Ertl break; 57167e3ab6eSLukas Ertl 57273679edcSLukas Ertl default: 57373679edcSLukas Ertl return (EINVAL); 57473679edcSLukas Ertl } 57573679edcSLukas Ertl 57673679edcSLukas Ertl return (0); 57773679edcSLukas Ertl } 578fb4e65d0SLukas Ertl 579c0b9797aSUlf Lilleengen /* 580c0b9797aSUlf Lilleengen * Calculate the offsets in the various subdisks for a RAID5 request. Also take 581c0b9797aSUlf Lilleengen * care of new subdisks in an expanded RAID5 array. 582c0b9797aSUlf Lilleengen * XXX: This assumes that the new subdisks are inserted after the others (which 583c0b9797aSUlf Lilleengen * is okay as long as plex_offset is larger). If subdisks are inserted into the 584c0b9797aSUlf Lilleengen * plexlist before, we get problems. 585c0b9797aSUlf Lilleengen */ 586c0b9797aSUlf Lilleengen static int 587fb4e65d0SLukas Ertl gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 588c0b9797aSUlf Lilleengen off_t *real_len, int *sdno, int *psdno, int growing) 589fb4e65d0SLukas Ertl { 590c0b9797aSUlf Lilleengen struct gv_sd *s; 591c0b9797aSUlf Lilleengen int sd, psd, sdcount; 592fb4e65d0SLukas Ertl off_t len_left, stripeend, stripeoff, stripestart; 593fb4e65d0SLukas Ertl 594c0b9797aSUlf Lilleengen sdcount = p->sdcount; 595c0b9797aSUlf Lilleengen if (growing) { 596c0b9797aSUlf Lilleengen LIST_FOREACH(s, &p->subdisks, in_plex) { 597c0b9797aSUlf Lilleengen if (s->flags & GV_SD_GROW) 598c0b9797aSUlf Lilleengen sdcount--; 599c0b9797aSUlf Lilleengen } 600c0b9797aSUlf Lilleengen } 601c0b9797aSUlf Lilleengen 602fb4e65d0SLukas Ertl /* The number of the subdisk containing the parity stripe. */ 603c0b9797aSUlf Lilleengen psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 604c0b9797aSUlf Lilleengen sdcount; 605fb4e65d0SLukas Ertl KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 606fb4e65d0SLukas Ertl 607fb4e65d0SLukas Ertl /* Offset of the start address from the start of the stripe. */ 608c0b9797aSUlf Lilleengen stripeoff = boff % (p->stripesize * (sdcount - 1)); 609fb4e65d0SLukas Ertl KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 610fb4e65d0SLukas Ertl 611fb4e65d0SLukas Ertl /* The number of the subdisk where the stripe resides. */ 612fb4e65d0SLukas Ertl sd = stripeoff / p->stripesize; 613fb4e65d0SLukas Ertl KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 614fb4e65d0SLukas Ertl 615fb4e65d0SLukas Ertl /* At or past parity subdisk. */ 616fb4e65d0SLukas Ertl if (sd >= psd) 617fb4e65d0SLukas Ertl sd++; 618fb4e65d0SLukas Ertl 619fb4e65d0SLukas Ertl /* The offset of the stripe on this subdisk. */ 620c0b9797aSUlf Lilleengen stripestart = (boff - stripeoff) / (sdcount - 1); 621fb4e65d0SLukas Ertl KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 622fb4e65d0SLukas Ertl 623fb4e65d0SLukas Ertl stripeoff %= p->stripesize; 624fb4e65d0SLukas Ertl 625fb4e65d0SLukas Ertl /* The offset of the request on this subdisk. */ 626fb4e65d0SLukas Ertl *real_off = stripestart + stripeoff; 627fb4e65d0SLukas Ertl 628fb4e65d0SLukas Ertl stripeend = stripestart + p->stripesize; 629fb4e65d0SLukas Ertl len_left = stripeend - *real_off; 630fb4e65d0SLukas Ertl KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 631fb4e65d0SLukas Ertl 632fb4e65d0SLukas Ertl *real_len = (bcount <= len_left) ? bcount : len_left; 633fb4e65d0SLukas Ertl 634fb4e65d0SLukas Ertl if (sdno != NULL) 635fb4e65d0SLukas Ertl *sdno = sd; 636fb4e65d0SLukas Ertl if (psdno != NULL) 637fb4e65d0SLukas Ertl *psdno = psd; 638fb4e65d0SLukas Ertl 639fb4e65d0SLukas Ertl return (0); 640fb4e65d0SLukas Ertl } 641c0b9797aSUlf Lilleengen 642c0b9797aSUlf Lilleengen static struct bio * 643c0b9797aSUlf Lilleengen gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 644c0b9797aSUlf Lilleengen caddr_t addr, int use_wp) 645c0b9797aSUlf Lilleengen { 646c0b9797aSUlf Lilleengen struct bio *cbp; 647c0b9797aSUlf Lilleengen 648c0b9797aSUlf Lilleengen cbp = g_clone_bio(bp); 649c0b9797aSUlf Lilleengen if (cbp == NULL) 650c0b9797aSUlf Lilleengen return (NULL); 651c0b9797aSUlf Lilleengen if (addr == NULL) { 652c0b9797aSUlf Lilleengen cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 653c0b9797aSUlf Lilleengen cbp->bio_cflags |= GV_BIO_MALLOC; 654c0b9797aSUlf Lilleengen } else 655c0b9797aSUlf Lilleengen cbp->bio_data = addr; 656c0b9797aSUlf Lilleengen cbp->bio_offset = wp->lockbase + s->drive_offset; 657c0b9797aSUlf Lilleengen cbp->bio_length = wp->length; 658c0b9797aSUlf Lilleengen cbp->bio_done = gv_done; 659c0b9797aSUlf Lilleengen cbp->bio_caller1 = s; 660c0b9797aSUlf Lilleengen if (use_wp) 661c0b9797aSUlf Lilleengen cbp->bio_caller2 = wp; 662c0b9797aSUlf Lilleengen 663c0b9797aSUlf Lilleengen return (cbp); 664c0b9797aSUlf Lilleengen } 665