xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision fb4e65d0)
1 /*-
2  * Copyright (c) 2004 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46 
47 int	gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48 	    int *, int *);
49 
50 /*
51  * Check if the stripe that the work packet wants is already being used by
52  * some other work packet.
53  */
54 int
55 gv_stripe_active(struct gv_plex *p, struct bio *bp)
56 {
57 	struct gv_raid5_packet *wp, *owp;
58 	int overlap;
59 
60 	wp = bp->bio_driver1;
61 	if (wp->lockbase == -1)
62 		return (0);
63 
64 	overlap = 0;
65 	TAILQ_FOREACH(owp, &p->packets, list) {
66 		if (owp == wp)
67 			break;
68 		if ((wp->lockbase >= owp->lockbase) &&
69 		    (wp->lockbase <= owp->lockbase + owp->length)) {
70 			overlap++;
71 			break;
72 		}
73 		if ((wp->lockbase <= owp->lockbase) &&
74 		    (wp->lockbase + wp->length >= owp->lockbase)) {
75 			overlap++;
76 			break;
77 		}
78 	}
79 
80 	return (overlap);
81 }
82 
/*
 * Set up the BIOs needed to rebuild one chunk of a broken subdisk in the
 * RAID5 plex p.  The range [boff, boff + bcount) of the original request
 * bp is read from every healthy subdisk; the rebuilt data is issued later
 * through the write BIO stored in wp->parity.
 *
 * Returns 0 on success, ENXIO if the plex is unusable or no rebuildable
 * subdisk exists, EINVAL on an impossible subdisk state, or ENOMEM if a
 * BIO clone could not be allocated.
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Translate the plex-relative range into a subdisk-relative one. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);

	/*
	 * Find the subdisk to rebuild; if several are not up, the last one
	 * in list order wins.
	 */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* No broken subdisk found; nothing to rebuild. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Cannot happen: the scan above only selects non-up disks. */
		return (EINVAL);

	case GV_SD_STALE:
		/* A stale subdisk is only revived by an explicit rebuild. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		cbp->bio_cflags |= GV_BIO_MALLOC;	/* Freed on completion. */
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/*
	 * Prepare the write of the rebuilt data to the broken subdisk.  The
	 * buffer starts zeroed; NOTE(review): it appears to be filled in by
	 * the completion path (gv_plex_done) from the reads queued above
	 * before wp->parity is issued -- confirm against the plex worker.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	/* Record how far the rebuild has progressed on this plex. */
	p->synced = boff;

	return (0);
}
177 
178 /* Build a request group to perform (part of) a RAID5 request. */
179 int
180 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
181     struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
182 {
183 	struct g_geom *gp;
184 	struct gv_sd *broken, *original, *parity, *s;
185 	struct gv_bioq *bq;
186 	struct bio *cbp, *pbp;
187 	int i, psdno, sdno, type;
188 	off_t real_len, real_off;
189 
190 	gp = bp->bio_to->geom;
191 
192 	if (p == NULL || LIST_EMPTY(&p->subdisks))
193 		return (ENXIO);
194 
195 	/* We are optimistic and assume that this request will be OK. */
196 #define	REQ_TYPE_NORMAL		0
197 #define	REQ_TYPE_DEGRADED	1
198 #define	REQ_TYPE_NOPARITY	2
199 
200 	type = REQ_TYPE_NORMAL;
201 	original = parity = broken = NULL;
202 
203 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
204 
205 	/* Find the right subdisks. */
206 	i = 0;
207 	LIST_FOREACH(s, &p->subdisks, in_plex) {
208 		if (i == sdno)
209 			original = s;
210 		if (i == psdno)
211 			parity = s;
212 		if (s->state != GV_SD_UP)
213 			broken = s;
214 		i++;
215 	}
216 
217 	if ((original == NULL) || (parity == NULL))
218 		return (ENXIO);
219 
220 	/* Our data stripe is missing. */
221 	if (original->state != GV_SD_UP)
222 		type = REQ_TYPE_DEGRADED;
223 	/* Our parity stripe is missing. */
224 	if (parity->state != GV_SD_UP) {
225 		/* We cannot take another failure if we're already degraded. */
226 		if (type != REQ_TYPE_NORMAL)
227 			return (ENXIO);
228 		else
229 			type = REQ_TYPE_NOPARITY;
230 	}
231 
232 	wp->length = real_len;
233 	wp->data = addr;
234 	wp->lockbase = real_off;
235 
236 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
237 
238 	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
239 		type = REQ_TYPE_NORMAL;
240 
241 	switch (bp->bio_cmd) {
242 	case BIO_READ:
243 		/*
244 		 * For a degraded read we need to read in all stripes except
245 		 * the broken one plus the parity stripe and then recalculate
246 		 * the desired data.
247 		 */
248 		if (type == REQ_TYPE_DEGRADED) {
249 			bzero(wp->data, wp->length);
250 			LIST_FOREACH(s, &p->subdisks, in_plex) {
251 				/* Skip the broken subdisk. */
252 				if (s == broken)
253 					continue;
254 				cbp = g_clone_bio(bp);
255 				if (cbp == NULL)
256 					return (ENOMEM);
257 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
258 				cbp->bio_cflags |= GV_BIO_MALLOC;
259 				cbp->bio_offset = real_off;
260 				cbp->bio_length = real_len;
261 				cbp->bio_done = gv_plex_done;
262 				cbp->bio_caller2 = s->consumer;
263 				cbp->bio_driver1 = wp;
264 
265 				GV_ENQUEUE(bp, cbp, pbp);
266 
267 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
268 				bq->bp = cbp;
269 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
270 			}
271 
272 		/* A normal read can be fulfilled with the original subdisk. */
273 		} else {
274 			cbp = g_clone_bio(bp);
275 			if (cbp == NULL)
276 				return (ENOMEM);
277 			cbp->bio_offset = real_off;
278 			cbp->bio_length = real_len;
279 			cbp->bio_data = addr;
280 			cbp->bio_done = g_std_done;
281 			cbp->bio_caller2 = original->consumer;
282 
283 			GV_ENQUEUE(bp, cbp, pbp);
284 		}
285 		wp->lockbase = -1;
286 
287 		break;
288 
289 	case BIO_WRITE:
290 		/*
291 		 * A degraded write means we cannot write to the original data
292 		 * subdisk.  Thus we need to read in all valid stripes,
293 		 * recalculate the parity from the original data, and then
294 		 * write the parity stripe back out.
295 		 */
296 		if (type == REQ_TYPE_DEGRADED) {
297 			/* Read all subdisks. */
298 			LIST_FOREACH(s, &p->subdisks, in_plex) {
299 				/* Skip the broken and the parity subdisk. */
300 				if ((s == broken) || (s == parity))
301 					continue;
302 
303 				cbp = g_clone_bio(bp);
304 				if (cbp == NULL)
305 					return (ENOMEM);
306 				cbp->bio_cmd = BIO_READ;
307 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
308 				cbp->bio_cflags |= GV_BIO_MALLOC;
309 				cbp->bio_offset = real_off;
310 				cbp->bio_length = real_len;
311 				cbp->bio_done = gv_plex_done;
312 				cbp->bio_caller2 = s->consumer;
313 				cbp->bio_driver1 = wp;
314 
315 				GV_ENQUEUE(bp, cbp, pbp);
316 
317 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
318 				bq->bp = cbp;
319 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
320 			}
321 
322 			/* Write the parity data. */
323 			cbp = g_clone_bio(bp);
324 			if (cbp == NULL)
325 				return (ENOMEM);
326 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
327 			cbp->bio_cflags |= GV_BIO_MALLOC;
328 			bcopy(addr, cbp->bio_data, real_len);
329 			cbp->bio_offset = real_off;
330 			cbp->bio_length = real_len;
331 			cbp->bio_done = gv_plex_done;
332 			cbp->bio_caller2 = parity->consumer;
333 			cbp->bio_driver1 = wp;
334 			wp->parity = cbp;
335 
336 		/*
337 		 * When the parity stripe is missing we just write out the data.
338 		 */
339 		} else if (type == REQ_TYPE_NOPARITY) {
340 			cbp = g_clone_bio(bp);
341 			if (cbp == NULL)
342 				return (ENOMEM);
343 			cbp->bio_offset = real_off;
344 			cbp->bio_length = real_len;
345 			cbp->bio_data = addr;
346 			cbp->bio_done = gv_plex_done;
347 			cbp->bio_caller2 = original->consumer;
348 			cbp->bio_driver1 = wp;
349 
350 			GV_ENQUEUE(bp, cbp, pbp);
351 
352 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
353 			bq->bp = cbp;
354 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
355 
356 		/*
357 		 * A normal write request goes to the original subdisk, then we
358 		 * read in all other stripes, recalculate the parity and write
359 		 * out the parity again.
360 		 */
361 		} else {
362 			/* Read old parity. */
363 			cbp = g_clone_bio(bp);
364 			if (cbp == NULL)
365 				return (ENOMEM);
366 			cbp->bio_cmd = BIO_READ;
367 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
368 			cbp->bio_cflags |= GV_BIO_MALLOC;
369 			cbp->bio_offset = real_off;
370 			cbp->bio_length = real_len;
371 			cbp->bio_done = gv_plex_done;
372 			cbp->bio_caller2 = parity->consumer;
373 			cbp->bio_driver1 = wp;
374 
375 			GV_ENQUEUE(bp, cbp, pbp);
376 
377 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
378 			bq->bp = cbp;
379 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
380 
381 			/* Read old data. */
382 			cbp = g_clone_bio(bp);
383 			if (cbp == NULL)
384 				return (ENOMEM);
385 			cbp->bio_cmd = BIO_READ;
386 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
387 			cbp->bio_cflags |= GV_BIO_MALLOC;
388 			cbp->bio_offset = real_off;
389 			cbp->bio_length = real_len;
390 			cbp->bio_done = gv_plex_done;
391 			cbp->bio_caller2 = original->consumer;
392 			cbp->bio_driver1 = wp;
393 
394 			GV_ENQUEUE(bp, cbp, pbp);
395 
396 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
397 			bq->bp = cbp;
398 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
399 
400 			/* Write new data. */
401 			cbp = g_clone_bio(bp);
402 			if (cbp == NULL)
403 				return (ENOMEM);
404 			cbp->bio_data = addr;
405 			cbp->bio_offset = real_off;
406 			cbp->bio_length = real_len;
407 			cbp->bio_done = gv_plex_done;
408 			cbp->bio_caller2 = original->consumer;
409 
410 			cbp->bio_driver1 = wp;
411 
412 			/*
413 			 * We must not write the new data until the old data
414 			 * was read, so hold this BIO back until we're ready
415 			 * for it.
416 			 */
417 			wp->waiting = cbp;
418 
419 			/* The final bio for the parity. */
420 			cbp = g_clone_bio(bp);
421 			if (cbp == NULL)
422 				return (ENOMEM);
423 			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
424 			cbp->bio_cflags |= GV_BIO_MALLOC;
425 			cbp->bio_offset = real_off;
426 			cbp->bio_length = real_len;
427 			cbp->bio_done = gv_plex_done;
428 			cbp->bio_caller2 = parity->consumer;
429 			cbp->bio_driver1 = wp;
430 
431 			/* Remember that this is the BIO for the parity data. */
432 			wp->parity = cbp;
433 		}
434 		break;
435 
436 	default:
437 		return (EINVAL);
438 	}
439 
440 	return (0);
441 }
442 
443 /* Calculate the offsets in the various subdisks for a RAID5 request. */
444 int
445 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
446     off_t *real_len, int *sdno, int *psdno)
447 {
448 	int sd, psd;
449 	off_t len_left, stripeend, stripeoff, stripestart;
450 
451 	/* The number of the subdisk containing the parity stripe. */
452 	psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
453 	    p->sdcount;
454 	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
455 
456 	/* Offset of the start address from the start of the stripe. */
457 	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
458 	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
459 
460 	/* The number of the subdisk where the stripe resides. */
461 	sd = stripeoff / p->stripesize;
462 	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
463 
464 	/* At or past parity subdisk. */
465 	if (sd >= psd)
466 		sd++;
467 
468 	/* The offset of the stripe on this subdisk. */
469 	stripestart = (boff - stripeoff) / (p->sdcount - 1);
470 	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
471 
472 	stripeoff %= p->stripesize;
473 
474 	/* The offset of the request on this subdisk. */
475 	*real_off = stripestart + stripeoff;
476 
477 	stripeend = stripestart + p->stripesize;
478 	len_left = stripeend - *real_off;
479 	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
480 
481 	*real_len = (bcount <= len_left) ? bcount : len_left;
482 
483 	if (sdno != NULL)
484 		*sdno = sd;
485 	if (psdno != NULL)
486 		*psdno = psd;
487 
488 	return (0);
489 }
490