173679edcSLukas Ertl /*- 273679edcSLukas Ertl * Copyright (c) 2004 Lukas Ertl 373679edcSLukas Ertl * All rights reserved. 473679edcSLukas Ertl * 573679edcSLukas Ertl * Redistribution and use in source and binary forms, with or without 673679edcSLukas Ertl * modification, are permitted provided that the following conditions 773679edcSLukas Ertl * are met: 873679edcSLukas Ertl * 1. Redistributions of source code must retain the above copyright 973679edcSLukas Ertl * notice, this list of conditions and the following disclaimer. 1073679edcSLukas Ertl * 2. Redistributions in binary form must reproduce the above copyright 1173679edcSLukas Ertl * notice, this list of conditions and the following disclaimer in the 1273679edcSLukas Ertl * documentation and/or other materials provided with the distribution. 1373679edcSLukas Ertl * 1473679edcSLukas Ertl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1573679edcSLukas Ertl * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1673679edcSLukas Ertl * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1773679edcSLukas Ertl * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1873679edcSLukas Ertl * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1973679edcSLukas Ertl * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2073679edcSLukas Ertl * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2173679edcSLukas Ertl * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2273679edcSLukas Ertl * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2373679edcSLukas Ertl * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2473679edcSLukas Ertl * SUCH DAMAGE. 2573679edcSLukas Ertl */ 2673679edcSLukas Ertl 2773679edcSLukas Ertl #include <sys/cdefs.h> 2873679edcSLukas Ertl __FBSDID("$FreeBSD$"); 2973679edcSLukas Ertl 3073679edcSLukas Ertl #include <sys/param.h> 3173679edcSLukas Ertl #include <sys/bio.h> 3273679edcSLukas Ertl #include <sys/conf.h> 3373679edcSLukas Ertl #include <sys/errno.h> 3473679edcSLukas Ertl #include <sys/kernel.h> 3573679edcSLukas Ertl #include <sys/kthread.h> 3673679edcSLukas Ertl #include <sys/libkern.h> 3773679edcSLukas Ertl #include <sys/lock.h> 3873679edcSLukas Ertl #include <sys/malloc.h> 3973679edcSLukas Ertl #include <sys/mutex.h> 4073679edcSLukas Ertl #include <sys/systm.h> 4173679edcSLukas Ertl 4273679edcSLukas Ertl #include <geom/geom.h> 4373679edcSLukas Ertl #include <geom/vinum/geom_vinum_var.h> 4473679edcSLukas Ertl #include <geom/vinum/geom_vinum_raid5.h> 4573679edcSLukas Ertl #include <geom/vinum/geom_vinum.h> 4673679edcSLukas Ertl 4773679edcSLukas Ertl int gv_raid5_parity(struct gv_raid5_packet *); 4873679edcSLukas Ertl int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); 4973679edcSLukas Ertl 5073679edcSLukas Ertl struct gv_raid5_bit * 5173679edcSLukas Ertl gv_new_raid5_bit(void) 5273679edcSLukas Ertl { 5373679edcSLukas Ertl struct gv_raid5_bit *r; 5473679edcSLukas Ertl r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); 5573679edcSLukas Ertl KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); 5673679edcSLukas Ertl return (r); 5773679edcSLukas Ertl } 5873679edcSLukas Ertl 5973679edcSLukas Ertl struct gv_raid5_packet * 6073679edcSLukas Ertl gv_new_raid5_packet(void) 6173679edcSLukas Ertl { 6273679edcSLukas Ertl struct gv_raid5_packet *wp; 6373679edcSLukas Ertl 6473679edcSLukas Ertl wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); 6573679edcSLukas Ertl KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); 6673679edcSLukas Ertl wp->state = SETUP; 6773679edcSLukas Ertl wp->type = JUNK; 6873679edcSLukas Ertl TAILQ_INIT(&wp->bits); 6973679edcSLukas Ertl 7073679edcSLukas Ertl return (wp); 7173679edcSLukas Ertl } 7273679edcSLukas Ertl 7373679edcSLukas Ertl /* 7473679edcSLukas Ertl * Check if the stripe that the work packet wants is already being used by 7573679edcSLukas Ertl * some other work packet. 7673679edcSLukas Ertl */ 7773679edcSLukas Ertl int 7873679edcSLukas Ertl gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) 7973679edcSLukas Ertl { 8073679edcSLukas Ertl struct gv_raid5_packet *wpa; 8173679edcSLukas Ertl 8273679edcSLukas Ertl TAILQ_FOREACH(wpa, &sc->worklist, list) { 8373679edcSLukas Ertl if (wpa->lockbase == wp->lockbase) { 8473679edcSLukas Ertl if (wpa->bio == wp->bio) 8573679edcSLukas Ertl return (0); 8673679edcSLukas Ertl return (1); 8773679edcSLukas Ertl } 8873679edcSLukas Ertl } 8973679edcSLukas Ertl return (0); 9073679edcSLukas Ertl } 9173679edcSLukas Ertl 9273679edcSLukas Ertl /* 9373679edcSLukas Ertl * The "worker" thread that runs through the worklist and fires off the 9473679edcSLukas Ertl * "subrequests" needed to fulfill a RAID5 read or write request. 9573679edcSLukas Ertl */ 9673679edcSLukas Ertl void 9773679edcSLukas Ertl gv_raid5_worker(void *arg) 9873679edcSLukas Ertl { 9973679edcSLukas Ertl struct bio *bp; 10073679edcSLukas Ertl struct g_geom *gp; 10173679edcSLukas Ertl struct gv_plex *p; 10273679edcSLukas Ertl struct gv_raid5_packet *wp, *wpt; 10373679edcSLukas Ertl struct gv_raid5_bit *rbp, *rbpt; 10473679edcSLukas Ertl int error, restart; 10573679edcSLukas Ertl 10673679edcSLukas Ertl gp = arg; 10773679edcSLukas Ertl p = gp->softc; 10873679edcSLukas Ertl 10973679edcSLukas Ertl mtx_lock(&p->worklist_mtx); 11073679edcSLukas Ertl for (;;) { 11173679edcSLukas Ertl restart = 0; 11273679edcSLukas Ertl g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan"); 11373679edcSLukas Ertl TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { 11473679edcSLukas Ertl /* This request packet is already being processed. */ 11573679edcSLukas Ertl if (wp->state == IO) 11673679edcSLukas Ertl continue; 11773679edcSLukas Ertl /* This request packet is ready for processing. */ 11873679edcSLukas Ertl if (wp->state == VALID) { 11973679edcSLukas Ertl /* Couldn't get the lock, try again. */ 12073679edcSLukas Ertl if ((wp->lockbase != -1) && 12173679edcSLukas Ertl gv_stripe_active(wp, p)) 12273679edcSLukas Ertl continue; 12373679edcSLukas Ertl 12473679edcSLukas Ertl wp->state = IO; 12573679edcSLukas Ertl mtx_unlock(&p->worklist_mtx); 12673679edcSLukas Ertl TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) 12773679edcSLukas Ertl g_io_request(rbp->bio, rbp->consumer); 12873679edcSLukas Ertl mtx_lock(&p->worklist_mtx); 12973679edcSLukas Ertl continue; 13073679edcSLukas Ertl } 13173679edcSLukas Ertl if (wp->state == FINISH) { 13273679edcSLukas Ertl bp = wp->bio; 13373679edcSLukas Ertl bp->bio_completed += wp->length; 13473679edcSLukas Ertl /* 13573679edcSLukas Ertl * Deliver the original request if we have 13673679edcSLukas Ertl * finished. 13773679edcSLukas Ertl */ 13873679edcSLukas Ertl if (bp->bio_completed == bp->bio_length) { 13973679edcSLukas Ertl mtx_unlock(&p->worklist_mtx); 14073679edcSLukas Ertl g_io_deliver(bp, 0); 14173679edcSLukas Ertl mtx_lock(&p->worklist_mtx); 14273679edcSLukas Ertl } 14373679edcSLukas Ertl TAILQ_REMOVE(&p->worklist, wp, list); 14473679edcSLukas Ertl if (wp->bufmalloc == 1) 14573679edcSLukas Ertl g_free(wp->buf); 14673679edcSLukas Ertl g_free(wp); 14773679edcSLukas Ertl restart++; 14873679edcSLukas Ertl /*break;*/ 14973679edcSLukas Ertl } 15073679edcSLukas Ertl } 15173679edcSLukas Ertl if (!restart) { 15273679edcSLukas Ertl /* Self-destruct. */ 15373679edcSLukas Ertl if (p->flags & GV_PLEX_THREAD_DIE) 15473679edcSLukas Ertl break; 15573679edcSLukas Ertl g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep"); 15673679edcSLukas Ertl error = msleep(p, &p->worklist_mtx, PRIBIO, "-", 15773679edcSLukas Ertl hz/100); 15873679edcSLukas Ertl } 15973679edcSLukas Ertl } 16073679edcSLukas Ertl mtx_unlock(&p->worklist_mtx); 16173679edcSLukas Ertl 16273679edcSLukas Ertl g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); 16373679edcSLukas Ertl 16473679edcSLukas Ertl /* Signal our plex that we are dead. */ 16573679edcSLukas Ertl p->flags |= GV_PLEX_THREAD_DEAD; 16673679edcSLukas Ertl wakeup(p); 16773679edcSLukas Ertl kthread_exit(0); 16873679edcSLukas Ertl } 16973679edcSLukas Ertl 17073679edcSLukas Ertl /* Final bio transaction to write out the parity data. */ 17173679edcSLukas Ertl int 17273679edcSLukas Ertl gv_raid5_parity(struct gv_raid5_packet *wp) 17373679edcSLukas Ertl { 17473679edcSLukas Ertl struct bio *bp; 17573679edcSLukas Ertl 17673679edcSLukas Ertl bp = g_new_bio(); 17773679edcSLukas Ertl if (bp == NULL) 17873679edcSLukas Ertl return (ENOMEM); 17973679edcSLukas Ertl 18073679edcSLukas Ertl wp->type = ISPARITY; 18173679edcSLukas Ertl bp->bio_cmd = BIO_WRITE; 18273679edcSLukas Ertl bp->bio_data = wp->buf; 18373679edcSLukas Ertl bp->bio_offset = wp->offset; 18473679edcSLukas Ertl bp->bio_length = wp->length; 18573679edcSLukas Ertl bp->bio_done = gv_raid5_done; 18673679edcSLukas Ertl bp->bio_caller1 = wp; 18773679edcSLukas Ertl bp->bio_caller2 = NULL; 18873679edcSLukas Ertl g_io_request(bp, wp->parity); 18973679edcSLukas Ertl 19073679edcSLukas Ertl return (0); 19173679edcSLukas Ertl } 19273679edcSLukas Ertl 19373679edcSLukas Ertl /* We end up here after each subrequest. */ 19473679edcSLukas Ertl void 19573679edcSLukas Ertl gv_raid5_done(struct bio *bp) 19673679edcSLukas Ertl { 19773679edcSLukas Ertl struct bio *obp; 19873679edcSLukas Ertl struct g_geom *gp; 19973679edcSLukas Ertl struct gv_plex *p; 20073679edcSLukas Ertl struct gv_raid5_packet *wp; 20173679edcSLukas Ertl struct gv_raid5_bit *rbp; 20273679edcSLukas Ertl off_t i; 20373679edcSLukas Ertl int error; 20473679edcSLukas Ertl 20573679edcSLukas Ertl wp = bp->bio_caller1; 20673679edcSLukas Ertl rbp = bp->bio_caller2; 20773679edcSLukas Ertl obp = wp->bio; 20873679edcSLukas Ertl gp = bp->bio_from->geom; 20973679edcSLukas Ertl p = gp->softc; 21073679edcSLukas Ertl 21173679edcSLukas Ertl /* One less active subrequest. */ 21273679edcSLukas Ertl wp->active--; 21373679edcSLukas Ertl 21473679edcSLukas Ertl switch (obp->bio_cmd) { 21573679edcSLukas Ertl case BIO_READ: 21673679edcSLukas Ertl /* Degraded reads need to handle parity data. */ 21773679edcSLukas Ertl if (wp->type == DEGRADED) { 21873679edcSLukas Ertl for (i = 0; i < wp->length; i++) 21973679edcSLukas Ertl wp->buf[i] ^= bp->bio_data[i]; 22073679edcSLukas Ertl 22173679edcSLukas Ertl /* When we're finished copy back the data we want. */ 22273679edcSLukas Ertl if (wp->active == 0) 22373679edcSLukas Ertl bcopy(wp->buf, wp->data, wp->length); 22473679edcSLukas Ertl } 22573679edcSLukas Ertl 22673679edcSLukas Ertl break; 22773679edcSLukas Ertl 22873679edcSLukas Ertl case BIO_WRITE: 22973679edcSLukas Ertl /* Handle the parity data, if needed. */ 23073679edcSLukas Ertl if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 23173679edcSLukas Ertl for (i = 0; i < wp->length; i++) 23273679edcSLukas Ertl wp->buf[i] ^= bp->bio_data[i]; 23373679edcSLukas Ertl 23473679edcSLukas Ertl /* Write out the parity data we calculated. */ 23573679edcSLukas Ertl if (wp->active == 0) { 23673679edcSLukas Ertl wp->active++; 23773679edcSLukas Ertl error = gv_raid5_parity(wp); 23873679edcSLukas Ertl } 23973679edcSLukas Ertl } 24073679edcSLukas Ertl break; 24173679edcSLukas Ertl } 24273679edcSLukas Ertl 24373679edcSLukas Ertl g_destroy_bio(bp); 24473679edcSLukas Ertl 24573679edcSLukas Ertl if (rbp != NULL) { 24673679edcSLukas Ertl if (rbp->malloc == 1) 24773679edcSLukas Ertl g_free(rbp->buf); 24873679edcSLukas Ertl TAILQ_REMOVE(&wp->bits, rbp, list); 24973679edcSLukas Ertl g_free(rbp); 25073679edcSLukas Ertl } 25173679edcSLukas Ertl 25273679edcSLukas Ertl /* This request group is done. */ 25373679edcSLukas Ertl if (wp->active == 0) 25473679edcSLukas Ertl wp->state = FINISH; 25573679edcSLukas Ertl } 25673679edcSLukas Ertl 25773679edcSLukas Ertl /* Build a request group to perform (part of) a RAID5 request. */ 25873679edcSLukas Ertl int 25973679edcSLukas Ertl gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 26073679edcSLukas Ertl long bcount, off_t boff) 26173679edcSLukas Ertl { 26273679edcSLukas Ertl struct g_geom *gp; 26373679edcSLukas Ertl struct gv_plex *p; 26473679edcSLukas Ertl struct gv_raid5_bit *rbp; 26573679edcSLukas Ertl struct gv_sd *broken, *original, *parity, *s; 26673679edcSLukas Ertl int i, psdno, sdno; 26773679edcSLukas Ertl off_t len_left, real_off, stripeend, stripeoff, stripestart; 26873679edcSLukas Ertl 26973679edcSLukas Ertl gp = bp->bio_to->geom; 27073679edcSLukas Ertl p = gp->softc; 27173679edcSLukas Ertl 27273679edcSLukas Ertl if (p == NULL || LIST_EMPTY(&p->subdisks)) 27373679edcSLukas Ertl return (ENXIO); 27473679edcSLukas Ertl 27573679edcSLukas Ertl /* We are optimistic and assume that this request will be OK. */ 27673679edcSLukas Ertl wp->type = NORMAL; 27773679edcSLukas Ertl original = parity = broken = NULL; 27873679edcSLukas Ertl 27973679edcSLukas Ertl /* The number of the subdisk containing the parity stripe. */ 28073679edcSLukas Ertl psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 28173679edcSLukas Ertl p->sdcount; 28273679edcSLukas Ertl KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 28373679edcSLukas Ertl 28473679edcSLukas Ertl /* Offset of the start address from the start of the stripe. */ 28573679edcSLukas Ertl stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 28673679edcSLukas Ertl KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 28773679edcSLukas Ertl 28873679edcSLukas Ertl /* The number of the subdisk where the stripe resides. */ 28973679edcSLukas Ertl sdno = stripeoff / p->stripesize; 29073679edcSLukas Ertl KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 29173679edcSLukas Ertl 29273679edcSLukas Ertl /* At or past parity subdisk. */ 29373679edcSLukas Ertl if (sdno >= psdno) 29473679edcSLukas Ertl sdno++; 29573679edcSLukas Ertl 29673679edcSLukas Ertl /* The offset of the stripe on this subdisk. */ 29773679edcSLukas Ertl stripestart = (boff - stripeoff) / (p->sdcount - 1); 29873679edcSLukas Ertl KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 29973679edcSLukas Ertl 30073679edcSLukas Ertl if (stripeoff >= p->stripesize) 30173679edcSLukas Ertl stripeoff -= p->stripesize; 30273679edcSLukas Ertl 30373679edcSLukas Ertl /* The offset of the request on this subdisk. */ 30473679edcSLukas Ertl real_off = stripestart + stripeoff; 30573679edcSLukas Ertl 30673679edcSLukas Ertl stripeend = stripestart + p->stripesize; 30773679edcSLukas Ertl len_left = stripeend - real_off; 30873679edcSLukas Ertl KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 30973679edcSLukas Ertl 31073679edcSLukas Ertl /* Find the right subdisks. */ 31173679edcSLukas Ertl i = 0; 31273679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 31373679edcSLukas Ertl if (i == sdno) 31473679edcSLukas Ertl original = s; 31573679edcSLukas Ertl if (i == psdno) 31673679edcSLukas Ertl parity = s; 31773679edcSLukas Ertl if (s->state != GV_SD_UP) 31873679edcSLukas Ertl broken = s; 31973679edcSLukas Ertl i++; 32073679edcSLukas Ertl } 32173679edcSLukas Ertl 32273679edcSLukas Ertl if ((original == NULL) || (parity == NULL)) 32373679edcSLukas Ertl return (ENXIO); 32473679edcSLukas Ertl 32573679edcSLukas Ertl /* Our data stripe is missing. */ 32673679edcSLukas Ertl if (original->state != GV_SD_UP) 32773679edcSLukas Ertl wp->type = DEGRADED; 32873679edcSLukas Ertl /* Our parity stripe is missing. */ 32973679edcSLukas Ertl if (parity->state != GV_SD_UP) { 33073679edcSLukas Ertl /* We cannot take another failure if we're already degraded. */ 33173679edcSLukas Ertl if (wp->type != NORMAL) 33273679edcSLukas Ertl return (ENXIO); 33373679edcSLukas Ertl else 33473679edcSLukas Ertl wp->type = NOPARITY; 33573679edcSLukas Ertl } 33673679edcSLukas Ertl 33773679edcSLukas Ertl /* 33873679edcSLukas Ertl * A combined write is necessary when the original data subdisk and the 33973679edcSLukas Ertl * parity subdisk are both up, but one of the other subdisks isn't. 34073679edcSLukas Ertl */ 34173679edcSLukas Ertl if ((broken != NULL) && (broken != parity) && (broken != original)) 34273679edcSLukas Ertl wp->type = COMBINED; 34373679edcSLukas Ertl 34473679edcSLukas Ertl wp->offset = real_off; 34573679edcSLukas Ertl wp->length = (bcount <= len_left) ? bcount : len_left; 34673679edcSLukas Ertl wp->data = addr; 34773679edcSLukas Ertl wp->original = original->consumer; 34873679edcSLukas Ertl wp->parity = parity->consumer; 34973679edcSLukas Ertl wp->lockbase = stripestart; 35073679edcSLukas Ertl 35173679edcSLukas Ertl KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 35273679edcSLukas Ertl 35373679edcSLukas Ertl switch (bp->bio_cmd) { 35473679edcSLukas Ertl case BIO_READ: 35573679edcSLukas Ertl /* 35673679edcSLukas Ertl * For a degraded read we need to read in all stripes except 35773679edcSLukas Ertl * the broken one plus the parity stripe and then recalculate 35873679edcSLukas Ertl * the desired data. 35973679edcSLukas Ertl */ 36073679edcSLukas Ertl if (wp->type == DEGRADED) { 36173679edcSLukas Ertl wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 36273679edcSLukas Ertl wp->bufmalloc = 1; 36373679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 36473679edcSLukas Ertl /* Skip the broken subdisk. */ 36573679edcSLukas Ertl if (s == broken) 36673679edcSLukas Ertl continue; 36773679edcSLukas Ertl rbp = gv_new_raid5_bit(); 36873679edcSLukas Ertl rbp->consumer = s->consumer; 36973679edcSLukas Ertl rbp->bio = g_new_bio(); 37073679edcSLukas Ertl if (rbp->bio == NULL) 37173679edcSLukas Ertl return (ENOMEM); 37273679edcSLukas Ertl rbp->buf = g_malloc(wp->length, 37373679edcSLukas Ertl M_WAITOK | M_ZERO); 37473679edcSLukas Ertl rbp->malloc = 1; 37573679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 37673679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 37773679edcSLukas Ertl rbp->bio->bio_length = wp->length; 37873679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 37973679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 38073679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 38173679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 38273679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 38373679edcSLukas Ertl wp->active++; 38473679edcSLukas Ertl wp->rqcount++; 38573679edcSLukas Ertl } 38673679edcSLukas Ertl 38773679edcSLukas Ertl /* A normal read can be fulfilled with the original subdisk. */ 38873679edcSLukas Ertl } else { 38973679edcSLukas Ertl rbp = gv_new_raid5_bit(); 39073679edcSLukas Ertl rbp->consumer = wp->original; 39173679edcSLukas Ertl rbp->bio = g_new_bio(); 39273679edcSLukas Ertl if (rbp->bio == NULL) 39373679edcSLukas Ertl return (ENOMEM); 39473679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 39573679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 39673679edcSLukas Ertl rbp->bio->bio_length = wp->length; 39773679edcSLukas Ertl rbp->buf = addr; 39873679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 39973679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 40073679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 40173679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 40273679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 40373679edcSLukas Ertl wp->active++; 40473679edcSLukas Ertl wp->rqcount++; 40573679edcSLukas Ertl } 40673679edcSLukas Ertl if (wp->type != COMBINED) 40773679edcSLukas Ertl wp->lockbase = -1; 40873679edcSLukas Ertl break; 40973679edcSLukas Ertl 41073679edcSLukas Ertl case BIO_WRITE: 41173679edcSLukas Ertl /* 41273679edcSLukas Ertl * A degraded write means we cannot write to the original data 41373679edcSLukas Ertl * subdisk. Thus we need to read in all valid stripes, 41473679edcSLukas Ertl * recalculate the parity from the original data, and then 41573679edcSLukas Ertl * write the parity stripe back out. 41673679edcSLukas Ertl */ 41773679edcSLukas Ertl if (wp->type == DEGRADED) { 41873679edcSLukas Ertl wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 41973679edcSLukas Ertl wp->bufmalloc = 1; 42073679edcSLukas Ertl 42173679edcSLukas Ertl /* Copy the original data. */ 42273679edcSLukas Ertl bcopy(wp->data, wp->buf, wp->length); 42373679edcSLukas Ertl 42473679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 42573679edcSLukas Ertl /* Skip the broken and the parity subdisk. */ 42673679edcSLukas Ertl if ((s == broken) || 42773679edcSLukas Ertl (s->consumer == wp->parity)) 42873679edcSLukas Ertl continue; 42973679edcSLukas Ertl 43073679edcSLukas Ertl rbp = gv_new_raid5_bit(); 43173679edcSLukas Ertl rbp->consumer = s->consumer; 43273679edcSLukas Ertl rbp->bio = g_new_bio(); 43373679edcSLukas Ertl if (rbp->bio == NULL) 43473679edcSLukas Ertl return (ENOMEM); 43573679edcSLukas Ertl rbp->buf = g_malloc(wp->length, 43673679edcSLukas Ertl M_WAITOK | M_ZERO); 43773679edcSLukas Ertl rbp->malloc = 1; 43873679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 43973679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 44073679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 44173679edcSLukas Ertl rbp->bio->bio_length = wp->length; 44273679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 44373679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 44473679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 44573679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 44673679edcSLukas Ertl wp->active++; 44773679edcSLukas Ertl wp->rqcount++; 44873679edcSLukas Ertl } 44973679edcSLukas Ertl 45073679edcSLukas Ertl /* 45173679edcSLukas Ertl * When we don't have the parity stripe we just write out the 45273679edcSLukas Ertl * data. 45373679edcSLukas Ertl */ 45473679edcSLukas Ertl } else if (wp->type == NOPARITY) { 45573679edcSLukas Ertl rbp = gv_new_raid5_bit(); 45673679edcSLukas Ertl rbp->consumer = wp->original; 45773679edcSLukas Ertl rbp->bio = g_new_bio(); 45873679edcSLukas Ertl if (rbp->bio == NULL) 45973679edcSLukas Ertl return (ENOMEM); 46073679edcSLukas Ertl rbp->bio->bio_cmd = BIO_WRITE; 46173679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 46273679edcSLukas Ertl rbp->bio->bio_length = wp->length; 46373679edcSLukas Ertl rbp->bio->bio_data = addr; 46473679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 46573679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 46673679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 46773679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 46873679edcSLukas Ertl wp->active++; 46973679edcSLukas Ertl wp->rqcount++; 47073679edcSLukas Ertl 47173679edcSLukas Ertl /* 47273679edcSLukas Ertl * A combined write means that our data subdisk and the parity 47373679edcSLukas Ertl * subdisks are both up, but another subdisk isn't. We need to 47473679edcSLukas Ertl * read all valid stripes including the parity to recalculate 47573679edcSLukas Ertl * the data of the stripe that is missing. Then we write our 47673679edcSLukas Ertl * original data, and together with the other data stripes 47773679edcSLukas Ertl * recalculate the parity again. 47873679edcSLukas Ertl */ 47973679edcSLukas Ertl } else if (wp->type == COMBINED) { 48073679edcSLukas Ertl wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 48173679edcSLukas Ertl wp->bufmalloc = 1; 48273679edcSLukas Ertl 48373679edcSLukas Ertl /* Get the data from all subdisks. */ 48473679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 48573679edcSLukas Ertl /* Skip the broken subdisk. */ 48673679edcSLukas Ertl if (s == broken) 48773679edcSLukas Ertl continue; 48873679edcSLukas Ertl 48973679edcSLukas Ertl rbp = gv_new_raid5_bit(); 49073679edcSLukas Ertl rbp->consumer = s->consumer; 49173679edcSLukas Ertl rbp->bio = g_new_bio(); 49273679edcSLukas Ertl if (rbp->bio == NULL) 49373679edcSLukas Ertl return (ENOMEM); 49473679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 49573679edcSLukas Ertl rbp->buf = g_malloc(wp->length, 49673679edcSLukas Ertl M_WAITOK | M_ZERO); 49773679edcSLukas Ertl rbp->malloc = 1; 49873679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 49973679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 50073679edcSLukas Ertl rbp->bio->bio_length = wp->length; 50173679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 50273679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 50373679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 50473679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 50573679edcSLukas Ertl wp->active++; 50673679edcSLukas Ertl wp->rqcount++; 50773679edcSLukas Ertl } 50873679edcSLukas Ertl 50973679edcSLukas Ertl /* Write the original data. */ 51073679edcSLukas Ertl rbp = gv_new_raid5_bit(); 51173679edcSLukas Ertl rbp->consumer = wp->original; 51273679edcSLukas Ertl rbp->buf = addr; 51373679edcSLukas Ertl rbp->bio = g_new_bio(); 51473679edcSLukas Ertl if (rbp->bio == NULL) 51573679edcSLukas Ertl return (ENOMEM); 51673679edcSLukas Ertl rbp->bio->bio_cmd = BIO_WRITE; 51773679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 51873679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 51973679edcSLukas Ertl rbp->bio->bio_length = wp->length; 52073679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 52173679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 52273679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 52373679edcSLukas Ertl /* 52473679edcSLukas Ertl * Insert at the tail, because we want to read the old 52573679edcSLukas Ertl * data first. 52673679edcSLukas Ertl */ 52773679edcSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 52873679edcSLukas Ertl wp->active++; 52973679edcSLukas Ertl wp->rqcount++; 53073679edcSLukas Ertl 53173679edcSLukas Ertl /* Get the rest of the data again. */ 53273679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 53373679edcSLukas Ertl /* 53473679edcSLukas Ertl * Skip the broken subdisk, the parity, and the 53573679edcSLukas Ertl * one we just wrote. 53673679edcSLukas Ertl */ 53773679edcSLukas Ertl if ((s == broken) || 53873679edcSLukas Ertl (s->consumer == wp->parity) || 53973679edcSLukas Ertl (s->consumer == wp->original)) 54073679edcSLukas Ertl continue; 54173679edcSLukas Ertl rbp = gv_new_raid5_bit(); 54273679edcSLukas Ertl rbp->consumer = s->consumer; 54373679edcSLukas Ertl rbp->bio = g_new_bio(); 54473679edcSLukas Ertl if (rbp->bio == NULL) 54573679edcSLukas Ertl return (ENOMEM); 54673679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 54773679edcSLukas Ertl rbp->buf = g_malloc(wp->length, 54873679edcSLukas Ertl M_WAITOK | M_ZERO); 54973679edcSLukas Ertl rbp->malloc = 1; 55073679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 55173679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 55273679edcSLukas Ertl rbp->bio->bio_length = wp->length; 55373679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 55473679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 55573679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 55673679edcSLukas Ertl /* 55773679edcSLukas Ertl * Again, insert at the tail to keep correct 55873679edcSLukas Ertl * order. 55973679edcSLukas Ertl */ 56073679edcSLukas Ertl TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 56173679edcSLukas Ertl wp->active++; 56273679edcSLukas Ertl wp->rqcount++; 56373679edcSLukas Ertl } 56473679edcSLukas Ertl 56573679edcSLukas Ertl 56673679edcSLukas Ertl /* 56773679edcSLukas Ertl * A normal write request goes to the original subdisk, then we 56873679edcSLukas Ertl * read in all other stripes, recalculate the parity and write 56973679edcSLukas Ertl * out the parity again. 57073679edcSLukas Ertl */ 57173679edcSLukas Ertl } else { 57273679edcSLukas Ertl wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 57373679edcSLukas Ertl wp->bufmalloc = 1; 57473679edcSLukas Ertl LIST_FOREACH(s, &p->subdisks, in_plex) { 57573679edcSLukas Ertl /* Skip the parity stripe. */ 57673679edcSLukas Ertl if (s->consumer == wp->parity) 57773679edcSLukas Ertl continue; 57873679edcSLukas Ertl 57973679edcSLukas Ertl rbp = gv_new_raid5_bit(); 58073679edcSLukas Ertl rbp->consumer = s->consumer; 58173679edcSLukas Ertl rbp->bio = g_new_bio(); 58273679edcSLukas Ertl if (rbp->bio == NULL) 58373679edcSLukas Ertl return (ENOMEM); 58473679edcSLukas Ertl /* 58573679edcSLukas Ertl * The data for the original stripe is written, 58673679edcSLukas Ertl * the others need to be read in for the parity 58773679edcSLukas Ertl * calculation. 58873679edcSLukas Ertl */ 58973679edcSLukas Ertl if (s->consumer == wp->original) { 59073679edcSLukas Ertl rbp->bio->bio_cmd = BIO_WRITE; 59173679edcSLukas Ertl rbp->buf = addr; 59273679edcSLukas Ertl } else { 59373679edcSLukas Ertl rbp->bio->bio_cmd = BIO_READ; 59473679edcSLukas Ertl rbp->buf = g_malloc(wp->length, 59573679edcSLukas Ertl M_WAITOK | M_ZERO); 59673679edcSLukas Ertl rbp->malloc = 1; 59773679edcSLukas Ertl } 59873679edcSLukas Ertl rbp->bio->bio_data = rbp->buf; 59973679edcSLukas Ertl rbp->bio->bio_offset = wp->offset; 60073679edcSLukas Ertl rbp->bio->bio_length = wp->length; 60173679edcSLukas Ertl rbp->bio->bio_done = gv_raid5_done; 60273679edcSLukas Ertl rbp->bio->bio_caller1 = wp; 60373679edcSLukas Ertl rbp->bio->bio_caller2 = rbp; 60473679edcSLukas Ertl TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 60573679edcSLukas Ertl wp->active++; 60673679edcSLukas Ertl wp->rqcount++; 60773679edcSLukas Ertl } 60873679edcSLukas Ertl } 60973679edcSLukas Ertl break; 61073679edcSLukas Ertl default: 61173679edcSLukas Ertl return (EINVAL); 61273679edcSLukas Ertl } 61373679edcSLukas Ertl 61473679edcSLukas Ertl wp->state = VALID; 61573679edcSLukas Ertl return (0); 61673679edcSLukas Ertl } 617