173679edcSLukas Ertl /*- 273679edcSLukas Ertl * Copyright (c) 2004 Lukas Ertl 373679edcSLukas Ertl * All rights reserved. 473679edcSLukas Ertl * 573679edcSLukas Ertl * Redistribution and use in source and binary forms, with or without 673679edcSLukas Ertl * modification, are permitted provided that the following conditions 773679edcSLukas Ertl * are met: 873679edcSLukas Ertl * 1. Redistributions of source code must retain the above copyright 973679edcSLukas Ertl * notice, this list of conditions and the following disclaimer. 1073679edcSLukas Ertl * 2. Redistributions in binary form must reproduce the above copyright 1173679edcSLukas Ertl * notice, this list of conditions and the following disclaimer in the 1273679edcSLukas Ertl * documentation and/or other materials provided with the distribution. 1373679edcSLukas Ertl * 1473679edcSLukas Ertl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1573679edcSLukas Ertl * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1673679edcSLukas Ertl * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1773679edcSLukas Ertl * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1873679edcSLukas Ertl * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1973679edcSLukas Ertl * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2073679edcSLukas Ertl * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2173679edcSLukas Ertl * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2273679edcSLukas Ertl * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2373679edcSLukas Ertl * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2473679edcSLukas Ertl * SUCH DAMAGE. 
2573679edcSLukas Ertl */ 2673679edcSLukas Ertl 2773679edcSLukas Ertl #include <sys/cdefs.h> 2873679edcSLukas Ertl __FBSDID("$FreeBSD$"); 2973679edcSLukas Ertl 3073679edcSLukas Ertl #include <sys/param.h> 3173679edcSLukas Ertl #include <sys/bio.h> 3273679edcSLukas Ertl #include <sys/conf.h> 3373679edcSLukas Ertl #include <sys/errno.h> 3473679edcSLukas Ertl #include <sys/kernel.h> 3573679edcSLukas Ertl #include <sys/kthread.h> 3673679edcSLukas Ertl #include <sys/libkern.h> 3773679edcSLukas Ertl #include <sys/lock.h> 3873679edcSLukas Ertl #include <sys/malloc.h> 3973679edcSLukas Ertl #include <sys/mutex.h> 4073679edcSLukas Ertl #include <sys/systm.h> 4173679edcSLukas Ertl 4273679edcSLukas Ertl #include <geom/geom.h> 4373679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h> 4473679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h> 4573679edcSLukas Ertl #include <geom/vinum/geom_vinum.h> 4673679edcSLukas Ertl 47fb4e65d0SLukas Ertl int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, 48fb4e65d0SLukas Ertl int *, int *); 49fb4e65d0SLukas Ertl 5073679edcSLukas Ertl /* 5173679edcSLukas Ertl * Check if the stripe that the work packet wants is already being used by 5273679edcSLukas Ertl * some other work packet. 
5373679edcSLukas Ertl */ 5473679edcSLukas Ertl int 5567e3ab6eSLukas Ertl gv_stripe_active(struct gv_plex *p, struct bio *bp) 5673679edcSLukas Ertl { 5767e3ab6eSLukas Ertl struct gv_raid5_packet *wp, *owp; 5867e3ab6eSLukas Ertl int overlap; 5973679edcSLukas Ertl 6067e3ab6eSLukas Ertl wp = bp->bio_driver1; 6167e3ab6eSLukas Ertl if (wp->lockbase == -1) 6273679edcSLukas Ertl return (0); 6373679edcSLukas Ertl 6467e3ab6eSLukas Ertl overlap = 0; 6567e3ab6eSLukas Ertl TAILQ_FOREACH(owp, &p->packets, list) { 6667e3ab6eSLukas Ertl if (owp == wp) 6773679edcSLukas Ertl break; 6867e3ab6eSLukas Ertl if ((wp->lockbase >= owp->lockbase) && 6967e3ab6eSLukas Ertl (wp->lockbase <= owp->lockbase + owp->length)) { 7067e3ab6eSLukas Ertl overlap++; 7173679edcSLukas Ertl break; 7273679edcSLukas Ertl } 7367e3ab6eSLukas Ertl if ((wp->lockbase <= owp->lockbase) && 7467e3ab6eSLukas Ertl (wp->lockbase + wp->length >= owp->lockbase)) { 7567e3ab6eSLukas Ertl overlap++; 7667e3ab6eSLukas Ertl break; 7767e3ab6eSLukas Ertl } 7867e3ab6eSLukas Ertl } 7973679edcSLukas Ertl 8067e3ab6eSLukas Ertl return (overlap); 8173679edcSLukas Ertl } 8273679edcSLukas Ertl 83c3aadfb9SLukas Ertl int 84fb5885afSLukas Ertl gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 85fb5885afSLukas Ertl caddr_t addr, off_t boff, off_t bcount) 86fb5885afSLukas Ertl { 87fb5885afSLukas Ertl struct gv_sd *parity, *s; 88fb5885afSLukas Ertl struct gv_bioq *bq; 89fb5885afSLukas Ertl struct bio *cbp, *pbp; 90fb5885afSLukas Ertl int i, psdno; 91fb5885afSLukas Ertl off_t real_len, real_off; 92fb5885afSLukas Ertl 93fb5885afSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 94fb5885afSLukas Ertl return (ENXIO); 95fb5885afSLukas Ertl 96fb5885afSLukas Ertl gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno); 97fb5885afSLukas Ertl 98fb5885afSLukas Ertl /* Find the right subdisk. 
*/ 99fb5885afSLukas Ertl parity = NULL; 100fb5885afSLukas Ertl i = 0; 101fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 102fb5885afSLukas Ertl if (i == psdno) { 103fb5885afSLukas Ertl parity = s; 104fb5885afSLukas Ertl break; 105fb5885afSLukas Ertl } 106fb5885afSLukas Ertl i++; 107fb5885afSLukas Ertl } 108fb5885afSLukas Ertl 109fb5885afSLukas Ertl /* Parity stripe not found. */ 110fb5885afSLukas Ertl if (parity == NULL) 111fb5885afSLukas Ertl return (ENXIO); 112fb5885afSLukas Ertl 113fb5885afSLukas Ertl if (parity->state != GV_SD_UP) 114fb5885afSLukas Ertl return (ENXIO); 115fb5885afSLukas Ertl 116fb5885afSLukas Ertl wp->length = real_len; 117fb5885afSLukas Ertl wp->data = addr; 118fb5885afSLukas Ertl wp->lockbase = real_off; 119fb5885afSLukas Ertl 120fb5885afSLukas Ertl /* Read all subdisks. */ 121fb5885afSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 122fb5885afSLukas Ertl /* Skip the parity subdisk. */ 123fb5885afSLukas Ertl if (s == parity) 124fb5885afSLukas Ertl continue; 125fb5885afSLukas Ertl 126fb5885afSLukas Ertl cbp = g_clone_bio(bp); 127fb5885afSLukas Ertl if (cbp == NULL) 128fb5885afSLukas Ertl return (ENOMEM); 129fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ; 130fb5885afSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 131fb5885afSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 132fb5885afSLukas Ertl cbp->bio_offset = real_off; 133fb5885afSLukas Ertl cbp->bio_length = real_len; 134fb5885afSLukas Ertl cbp->bio_done = gv_plex_done; 135fb5885afSLukas Ertl cbp->bio_caller2 = s->consumer; 136fb5885afSLukas Ertl cbp->bio_driver1 = wp; 137fb5885afSLukas Ertl 138fb5885afSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 139fb5885afSLukas Ertl 140fb5885afSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 141fb5885afSLukas Ertl bq->bp = cbp; 142fb5885afSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 143fb5885afSLukas Ertl } 144fb5885afSLukas Ertl 145fb5885afSLukas Ertl /* Read the parity data. 
*/ 146fb5885afSLukas Ertl cbp = g_clone_bio(bp); 147fb5885afSLukas Ertl if (cbp == NULL) 148fb5885afSLukas Ertl return (ENOMEM); 149fb5885afSLukas Ertl cbp->bio_cmd = BIO_READ; 150fb5885afSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 151fb5885afSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 152fb5885afSLukas Ertl cbp->bio_offset = real_off; 153fb5885afSLukas Ertl cbp->bio_length = real_len; 154fb5885afSLukas Ertl cbp->bio_done = gv_plex_done; 155fb5885afSLukas Ertl cbp->bio_caller2 = parity->consumer; 156fb5885afSLukas Ertl cbp->bio_driver1 = wp; 157fb5885afSLukas Ertl wp->waiting = cbp; 158fb5885afSLukas Ertl 159fb5885afSLukas Ertl /* 160fb5885afSLukas Ertl * In case we want to rebuild the parity, create an extra BIO to write 161fb5885afSLukas Ertl * it out. It also acts as buffer for the XOR operations. 162fb5885afSLukas Ertl */ 163fb5885afSLukas Ertl cbp = g_clone_bio(bp); 164fb5885afSLukas Ertl if (cbp == NULL) 165fb5885afSLukas Ertl return (ENOMEM); 166fb5885afSLukas Ertl cbp->bio_data = addr; 167fb5885afSLukas Ertl cbp->bio_offset = real_off; 168fb5885afSLukas Ertl cbp->bio_length = real_len; 169fb5885afSLukas Ertl cbp->bio_done = gv_plex_done; 170fb5885afSLukas Ertl cbp->bio_caller2 = parity->consumer; 171fb5885afSLukas Ertl cbp->bio_driver1 = wp; 172fb5885afSLukas Ertl wp->parity = cbp; 173fb5885afSLukas Ertl 174fb5885afSLukas Ertl return (0); 175fb5885afSLukas Ertl } 176fb5885afSLukas Ertl 177fb5885afSLukas Ertl /* Rebuild a degraded RAID5 plex. 
*/ 178fb5885afSLukas Ertl int 179c3aadfb9SLukas Ertl gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 180c3aadfb9SLukas Ertl caddr_t addr, off_t boff, off_t bcount) 181c3aadfb9SLukas Ertl { 182c3aadfb9SLukas Ertl struct gv_sd *broken, *s; 183c3aadfb9SLukas Ertl struct gv_bioq *bq; 184c3aadfb9SLukas Ertl struct bio *cbp, *pbp; 185fb4e65d0SLukas Ertl off_t real_len, real_off; 186c3aadfb9SLukas Ertl 187c3aadfb9SLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 188c3aadfb9SLukas Ertl return (ENXIO); 189c3aadfb9SLukas Ertl 190fb4e65d0SLukas Ertl gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL); 191c3aadfb9SLukas Ertl 192c3aadfb9SLukas Ertl /* Find the right subdisk. */ 193c3aadfb9SLukas Ertl broken = NULL; 194c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 195c3aadfb9SLukas Ertl if (s->state != GV_SD_UP) 196c3aadfb9SLukas Ertl broken = s; 197c3aadfb9SLukas Ertl } 198c3aadfb9SLukas Ertl 199fb5885afSLukas Ertl /* Broken stripe not found. */ 200c3aadfb9SLukas Ertl if (broken == NULL) 201c3aadfb9SLukas Ertl return (ENXIO); 202c3aadfb9SLukas Ertl 203c3aadfb9SLukas Ertl switch (broken->state) { 204c3aadfb9SLukas Ertl case GV_SD_UP: 205c3aadfb9SLukas Ertl return (EINVAL); 206c3aadfb9SLukas Ertl 207c3aadfb9SLukas Ertl case GV_SD_STALE: 208c3aadfb9SLukas Ertl if (!(bp->bio_cflags & GV_BIO_REBUILD)) 209c3aadfb9SLukas Ertl return (ENXIO); 210c3aadfb9SLukas Ertl 21186b3c6f5SUlf Lilleengen G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 212c3aadfb9SLukas Ertl gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 213c3aadfb9SLukas Ertl break; 214c3aadfb9SLukas Ertl 215c3aadfb9SLukas Ertl case GV_SD_REVIVING: 216c3aadfb9SLukas Ertl break; 217c3aadfb9SLukas Ertl 218c3aadfb9SLukas Ertl default: 219c3aadfb9SLukas Ertl /* All other subdisk states mean it's not accessible. 
*/ 220c3aadfb9SLukas Ertl return (ENXIO); 221c3aadfb9SLukas Ertl } 222c3aadfb9SLukas Ertl 223c3aadfb9SLukas Ertl wp->length = real_len; 224c3aadfb9SLukas Ertl wp->data = addr; 225c3aadfb9SLukas Ertl wp->lockbase = real_off; 226c3aadfb9SLukas Ertl 227fb4e65d0SLukas Ertl KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 228c3aadfb9SLukas Ertl 229c3aadfb9SLukas Ertl /* Read all subdisks. */ 230c3aadfb9SLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 231c3aadfb9SLukas Ertl /* Skip the broken subdisk. */ 232c3aadfb9SLukas Ertl if (s == broken) 233c3aadfb9SLukas Ertl continue; 234c3aadfb9SLukas Ertl 235c3aadfb9SLukas Ertl cbp = g_clone_bio(bp); 236c3aadfb9SLukas Ertl if (cbp == NULL) 237c3aadfb9SLukas Ertl return (ENOMEM); 238c3aadfb9SLukas Ertl cbp->bio_cmd = BIO_READ; 239c3aadfb9SLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 240c3aadfb9SLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 241c3aadfb9SLukas Ertl cbp->bio_offset = real_off; 242c3aadfb9SLukas Ertl cbp->bio_length = real_len; 243c3aadfb9SLukas Ertl cbp->bio_done = gv_plex_done; 244c3aadfb9SLukas Ertl cbp->bio_caller2 = s->consumer; 245c3aadfb9SLukas Ertl cbp->bio_driver1 = wp; 246c3aadfb9SLukas Ertl 247c3aadfb9SLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 248c3aadfb9SLukas Ertl 249c3aadfb9SLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 250c3aadfb9SLukas Ertl bq->bp = cbp; 251c3aadfb9SLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 252c3aadfb9SLukas Ertl } 253c3aadfb9SLukas Ertl 254c3aadfb9SLukas Ertl /* Write the parity data. 
*/ 255c3aadfb9SLukas Ertl cbp = g_clone_bio(bp); 256c3aadfb9SLukas Ertl if (cbp == NULL) 257c3aadfb9SLukas Ertl return (ENOMEM); 258c3aadfb9SLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 259c3aadfb9SLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 260c3aadfb9SLukas Ertl cbp->bio_offset = real_off; 261c3aadfb9SLukas Ertl cbp->bio_length = real_len; 262c3aadfb9SLukas Ertl cbp->bio_done = gv_plex_done; 263c3aadfb9SLukas Ertl cbp->bio_caller2 = broken->consumer; 264c3aadfb9SLukas Ertl cbp->bio_driver1 = wp; 265c3aadfb9SLukas Ertl cbp->bio_cflags |= GV_BIO_REBUILD; 266c3aadfb9SLukas Ertl wp->parity = cbp; 267c3aadfb9SLukas Ertl 268c3aadfb9SLukas Ertl p->synced = boff; 269c3aadfb9SLukas Ertl 270c3aadfb9SLukas Ertl return (0); 271c3aadfb9SLukas Ertl } 272c3aadfb9SLukas Ertl 27373679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */ 27473679edcSLukas Ertl int 27567e3ab6eSLukas Ertl gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, 27667e3ab6eSLukas Ertl struct bio *bp, caddr_t addr, off_t boff, off_t bcount) 27773679edcSLukas Ertl { 27873679edcSLukas Ertl struct g_geom *gp; 27973679edcSLukas Ertl struct gv_sd *broken, *original, *parity, *s; 28067e3ab6eSLukas Ertl struct gv_bioq *bq; 28167e3ab6eSLukas Ertl struct bio *cbp, *pbp; 28267e3ab6eSLukas Ertl int i, psdno, sdno, type; 283fb4e65d0SLukas Ertl off_t real_len, real_off; 28473679edcSLukas Ertl 28573679edcSLukas Ertl gp = bp->bio_to->geom; 28673679edcSLukas Ertl 28773679edcSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 28873679edcSLukas Ertl return (ENXIO); 28973679edcSLukas Ertl 29073679edcSLukas Ertl /* We are optimistic and assume that this request will be OK. 
*/ 29167e3ab6eSLukas Ertl #define REQ_TYPE_NORMAL 0 29267e3ab6eSLukas Ertl #define REQ_TYPE_DEGRADED 1 29367e3ab6eSLukas Ertl #define REQ_TYPE_NOPARITY 2 29467e3ab6eSLukas Ertl 29567e3ab6eSLukas Ertl type = REQ_TYPE_NORMAL; 29673679edcSLukas Ertl original = parity = broken = NULL; 29773679edcSLukas Ertl 298fb4e65d0SLukas Ertl gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); 29973679edcSLukas Ertl 30073679edcSLukas Ertl /* Find the right subdisks. */ 30173679edcSLukas Ertl i = 0; 30273679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 30373679edcSLukas Ertl if (i == sdno) 30473679edcSLukas Ertl original = s; 30573679edcSLukas Ertl if (i == psdno) 30673679edcSLukas Ertl parity = s; 30773679edcSLukas Ertl if (s->state != GV_SD_UP) 30873679edcSLukas Ertl broken = s; 30973679edcSLukas Ertl i++; 31073679edcSLukas Ertl } 31173679edcSLukas Ertl 31273679edcSLukas Ertl if ((original == NULL) || (parity == NULL)) 31373679edcSLukas Ertl return (ENXIO); 31473679edcSLukas Ertl 31573679edcSLukas Ertl /* Our data stripe is missing. */ 31673679edcSLukas Ertl if (original->state != GV_SD_UP) 31767e3ab6eSLukas Ertl type = REQ_TYPE_DEGRADED; 31873679edcSLukas Ertl /* Our parity stripe is missing. */ 31973679edcSLukas Ertl if (parity->state != GV_SD_UP) { 32073679edcSLukas Ertl /* We cannot take another failure if we're already degraded. 
*/ 32167e3ab6eSLukas Ertl if (type != REQ_TYPE_NORMAL) 32273679edcSLukas Ertl return (ENXIO); 32373679edcSLukas Ertl else 32467e3ab6eSLukas Ertl type = REQ_TYPE_NOPARITY; 32573679edcSLukas Ertl } 32673679edcSLukas Ertl 32767e3ab6eSLukas Ertl wp->length = real_len; 32873679edcSLukas Ertl wp->data = addr; 32967e3ab6eSLukas Ertl wp->lockbase = real_off; 33073679edcSLukas Ertl 33173679edcSLukas Ertl KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 33273679edcSLukas Ertl 333c3aadfb9SLukas Ertl if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced)) 334c3aadfb9SLukas Ertl type = REQ_TYPE_NORMAL; 335c3aadfb9SLukas Ertl 33673679edcSLukas Ertl switch (bp->bio_cmd) { 33773679edcSLukas Ertl case BIO_READ: 33873679edcSLukas Ertl /* 33973679edcSLukas Ertl * For a degraded read we need to read in all stripes except 34073679edcSLukas Ertl * the broken one plus the parity stripe and then recalculate 34173679edcSLukas Ertl * the desired data. 34273679edcSLukas Ertl */ 34367e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) { 34467e3ab6eSLukas Ertl bzero(wp->data, wp->length); 34573679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 34673679edcSLukas Ertl /* Skip the broken subdisk. 
*/ 34773679edcSLukas Ertl if (s == broken) 34873679edcSLukas Ertl continue; 34967e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 35067e3ab6eSLukas Ertl if (cbp == NULL) 35173679edcSLukas Ertl return (ENOMEM); 35267e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 35367e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 35467e3ab6eSLukas Ertl cbp->bio_offset = real_off; 35567e3ab6eSLukas Ertl cbp->bio_length = real_len; 35667e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 35767e3ab6eSLukas Ertl cbp->bio_caller2 = s->consumer; 35867e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 35967e3ab6eSLukas Ertl 36067e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 36167e3ab6eSLukas Ertl 36267e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 36367e3ab6eSLukas Ertl bq->bp = cbp; 36467e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 36573679edcSLukas Ertl } 36673679edcSLukas Ertl 36773679edcSLukas Ertl /* A normal read can be fulfilled with the original subdisk. */ 36873679edcSLukas Ertl } else { 36967e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 37067e3ab6eSLukas Ertl if (cbp == NULL) 37173679edcSLukas Ertl return (ENOMEM); 37267e3ab6eSLukas Ertl cbp->bio_offset = real_off; 37367e3ab6eSLukas Ertl cbp->bio_length = real_len; 37467e3ab6eSLukas Ertl cbp->bio_data = addr; 37567e3ab6eSLukas Ertl cbp->bio_done = g_std_done; 37667e3ab6eSLukas Ertl cbp->bio_caller2 = original->consumer; 37767e3ab6eSLukas Ertl 37867e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 37973679edcSLukas Ertl } 38073679edcSLukas Ertl wp->lockbase = -1; 38167e3ab6eSLukas Ertl 38273679edcSLukas Ertl break; 38373679edcSLukas Ertl 38473679edcSLukas Ertl case BIO_WRITE: 38573679edcSLukas Ertl /* 38673679edcSLukas Ertl * A degraded write means we cannot write to the original data 38773679edcSLukas Ertl * subdisk. Thus we need to read in all valid stripes, 38873679edcSLukas Ertl * recalculate the parity from the original data, and then 38973679edcSLukas Ertl * write the parity stripe back out. 
39073679edcSLukas Ertl */ 39167e3ab6eSLukas Ertl if (type == REQ_TYPE_DEGRADED) { 39267e3ab6eSLukas Ertl /* Read all subdisks. */ 39373679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 39473679edcSLukas Ertl /* Skip the broken and the parity subdisk. */ 39567e3ab6eSLukas Ertl if ((s == broken) || (s == parity)) 39673679edcSLukas Ertl continue; 39773679edcSLukas Ertl 39867e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 39967e3ab6eSLukas Ertl if (cbp == NULL) 40073679edcSLukas Ertl return (ENOMEM); 40167e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 40267e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 40367e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 40467e3ab6eSLukas Ertl cbp->bio_offset = real_off; 40567e3ab6eSLukas Ertl cbp->bio_length = real_len; 40667e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 40767e3ab6eSLukas Ertl cbp->bio_caller2 = s->consumer; 40867e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 40967e3ab6eSLukas Ertl 41067e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 41167e3ab6eSLukas Ertl 41267e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 41367e3ab6eSLukas Ertl bq->bp = cbp; 41467e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 41573679edcSLukas Ertl } 41673679edcSLukas Ertl 41767e3ab6eSLukas Ertl /* Write the parity data. 
*/ 41867e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 41967e3ab6eSLukas Ertl if (cbp == NULL) 42073679edcSLukas Ertl return (ENOMEM); 42167e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 42267e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 42367e3ab6eSLukas Ertl bcopy(addr, cbp->bio_data, real_len); 42467e3ab6eSLukas Ertl cbp->bio_offset = real_off; 42567e3ab6eSLukas Ertl cbp->bio_length = real_len; 42667e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 42767e3ab6eSLukas Ertl cbp->bio_caller2 = parity->consumer; 42867e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 42967e3ab6eSLukas Ertl wp->parity = cbp; 43073679edcSLukas Ertl 43173679edcSLukas Ertl /* 43267e3ab6eSLukas Ertl * When the parity stripe is missing we just write out the data. 43373679edcSLukas Ertl */ 43467e3ab6eSLukas Ertl } else if (type == REQ_TYPE_NOPARITY) { 43567e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 43667e3ab6eSLukas Ertl if (cbp == NULL) 437291cb0acSLukas Ertl return (ENOMEM); 43867e3ab6eSLukas Ertl cbp->bio_offset = real_off; 43967e3ab6eSLukas Ertl cbp->bio_length = real_len; 44067e3ab6eSLukas Ertl cbp->bio_data = addr; 44167e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 44267e3ab6eSLukas Ertl cbp->bio_caller2 = original->consumer; 44367e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 44473679edcSLukas Ertl 44567e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 44673679edcSLukas Ertl 44767e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 44867e3ab6eSLukas Ertl bq->bp = cbp; 44967e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 45073679edcSLukas Ertl 45173679edcSLukas Ertl /* 45273679edcSLukas Ertl * A normal write request goes to the original subdisk, then we 45373679edcSLukas Ertl * read in all other stripes, recalculate the parity and write 45473679edcSLukas Ertl * out the parity again. 45573679edcSLukas Ertl */ 45673679edcSLukas Ertl } else { 45767e3ab6eSLukas Ertl /* Read old parity. 
*/ 45867e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 45967e3ab6eSLukas Ertl if (cbp == NULL) 460291cb0acSLukas Ertl return (ENOMEM); 46167e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 46267e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 46367e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 46467e3ab6eSLukas Ertl cbp->bio_offset = real_off; 46567e3ab6eSLukas Ertl cbp->bio_length = real_len; 46667e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 46767e3ab6eSLukas Ertl cbp->bio_caller2 = parity->consumer; 46867e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 46973679edcSLukas Ertl 47067e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 47167e3ab6eSLukas Ertl 47267e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 47367e3ab6eSLukas Ertl bq->bp = cbp; 47467e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 47567e3ab6eSLukas Ertl 47667e3ab6eSLukas Ertl /* Read old data. */ 47767e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 47867e3ab6eSLukas Ertl if (cbp == NULL) 47973679edcSLukas Ertl return (ENOMEM); 48067e3ab6eSLukas Ertl cbp->bio_cmd = BIO_READ; 48167e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK); 48267e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 48367e3ab6eSLukas Ertl cbp->bio_offset = real_off; 48467e3ab6eSLukas Ertl cbp->bio_length = real_len; 48567e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 48667e3ab6eSLukas Ertl cbp->bio_caller2 = original->consumer; 48767e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 48867e3ab6eSLukas Ertl 48967e3ab6eSLukas Ertl GV_ENQUEUE(bp, cbp, pbp); 49067e3ab6eSLukas Ertl 49167e3ab6eSLukas Ertl bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 49267e3ab6eSLukas Ertl bq->bp = cbp; 49367e3ab6eSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 49467e3ab6eSLukas Ertl 49567e3ab6eSLukas Ertl /* Write new data. 
*/ 49667e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 49767e3ab6eSLukas Ertl if (cbp == NULL) 49867e3ab6eSLukas Ertl return (ENOMEM); 49967e3ab6eSLukas Ertl cbp->bio_data = addr; 50067e3ab6eSLukas Ertl cbp->bio_offset = real_off; 50167e3ab6eSLukas Ertl cbp->bio_length = real_len; 50267e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 50367e3ab6eSLukas Ertl cbp->bio_caller2 = original->consumer; 50467e3ab6eSLukas Ertl 50567e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 50667e3ab6eSLukas Ertl 50773679edcSLukas Ertl /* 50867e3ab6eSLukas Ertl * We must not write the new data until the old data 50967e3ab6eSLukas Ertl * was read, so hold this BIO back until we're ready 51067e3ab6eSLukas Ertl * for it. 51173679edcSLukas Ertl */ 51267e3ab6eSLukas Ertl wp->waiting = cbp; 51367e3ab6eSLukas Ertl 51467e3ab6eSLukas Ertl /* The final bio for the parity. */ 51567e3ab6eSLukas Ertl cbp = g_clone_bio(bp); 51667e3ab6eSLukas Ertl if (cbp == NULL) 517291cb0acSLukas Ertl return (ENOMEM); 51867e3ab6eSLukas Ertl cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 51967e3ab6eSLukas Ertl cbp->bio_cflags |= GV_BIO_MALLOC; 52067e3ab6eSLukas Ertl cbp->bio_offset = real_off; 52167e3ab6eSLukas Ertl cbp->bio_length = real_len; 52267e3ab6eSLukas Ertl cbp->bio_done = gv_plex_done; 52367e3ab6eSLukas Ertl cbp->bio_caller2 = parity->consumer; 52467e3ab6eSLukas Ertl cbp->bio_driver1 = wp; 52567e3ab6eSLukas Ertl 52667e3ab6eSLukas Ertl /* Remember that this is the BIO for the parity data. */ 52767e3ab6eSLukas Ertl wp->parity = cbp; 52873679edcSLukas Ertl } 52973679edcSLukas Ertl break; 53067e3ab6eSLukas Ertl 53173679edcSLukas Ertl default: 53273679edcSLukas Ertl return (EINVAL); 53373679edcSLukas Ertl } 53473679edcSLukas Ertl 53573679edcSLukas Ertl return (0); 53673679edcSLukas Ertl } 537fb4e65d0SLukas Ertl 538fb4e65d0SLukas Ertl /* Calculate the offsets in the various subdisks for a RAID5 request. 
*/
/*
 * Translate a plex-relative request (boff, bcount) into the matching
 * subdisk-relative values.
 *
 *   real_off  - out: offset of the request on the subdisk (must not be NULL)
 *   real_len  - out: bcount, clipped so that the request does not run past
 *               the end of the current stripe (must not be NULL)
 *   sdno      - out, optional: index of the subdisk holding the data
 *   psdno     - out, optional: index of the subdisk holding the parity
 *
 * p->sdcount must be at least 2 and p->stripesize non-zero, otherwise the
 * calculations below divide by zero.  Always returns 0.
 */
int
gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
    off_t *real_len, int *sdno, int *psdno)
{
	int sd, psd;
	off_t len_left, stripeend, stripeoff, stripestart;

	/* The number of the subdisk containing the parity stripe. */
	psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	/* Check the computed index, not the (possibly NULL) output pointer. */
	KASSERT(psd >= 0, ("gv_raid5_offset: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sd = stripeoff / p->stripesize;
	/* Check the computed index, not the (possibly NULL) output pointer. */
	KASSERT(sd >= 0, ("gv_raid5_offset: sdno < 0"));

	/* At or past parity subdisk. */
	if (sd >= psd)
		sd++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));

	/* From here on stripeoff is the offset within the single stripe. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	*real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - *real_off;
	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));

	/* Never let a request cross the end of the stripe. */
	*real_len = (bcount <= len_left) ? bcount : len_left;

	if (sdno != NULL)
		*sdno = sd;
	if (psdno != NULL)
		*psdno = psd;

	return (0);
}