xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision c3aadfb9)
1 /*-
2  * Copyright (c) 2004 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46 
47 /*
48  * Check if the stripe that the work packet wants is already being used by
49  * some other work packet.
50  */
51 int
52 gv_stripe_active(struct gv_plex *p, struct bio *bp)
53 {
54 	struct gv_raid5_packet *wp, *owp;
55 	int overlap;
56 
57 	wp = bp->bio_driver1;
58 	if (wp->lockbase == -1)
59 		return (0);
60 
61 	overlap = 0;
62 	TAILQ_FOREACH(owp, &p->packets, list) {
63 		if (owp == wp)
64 			break;
65 		if ((wp->lockbase >= owp->lockbase) &&
66 		    (wp->lockbase <= owp->lockbase + owp->length)) {
67 			overlap++;
68 			break;
69 		}
70 		if ((wp->lockbase <= owp->lockbase) &&
71 		    (wp->lockbase + wp->length >= owp->lockbase)) {
72 			overlap++;
73 			break;
74 		}
75 	}
76 
77 	return (overlap);
78 }
79 
/*
 * Build the request group used to rebuild (revive) the single broken
 * subdisk of a degraded RAID5 plex for the stripe containing 'boff':
 * clone a read bio for every healthy subdisk and prepare one write bio
 * carrying the recalculated parity/data for the broken subdisk.
 *
 * p      - the RAID5 plex being rebuilt
 * wp     - the work packet describing this request group
 * bp     - the original bio this work derives from
 * addr   - data buffer of the original request
 * boff   - byte offset of this request within the plex
 * bcount - remaining byte count of the request
 *
 * Returns 0 on success, ENXIO when the plex is unusable or no subdisk
 * needs rebuilding, EINVAL on an inconsistent subdisk state, or ENOMEM
 * when a bio clone fails.  NOTE(review): on ENOMEM, clones already
 * enqueued stay attached to bp for the caller/completion path to tear
 * down — confirm against the plex error handling.
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/*
	 * Offset of the start address from the start of the stripe.
	 * A full stripe spans (sdcount - 1) data subdisks.
	 */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within one subdisk's stripe slice. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	/* Bytes left until the end of this stripe slice. */
	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/* Find the (last) subdisk that is not up; that one gets rebuilt. */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* No broken subdisk found, so there is nothing to rebuild. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Cannot happen: the loop above only records non-UP disks. */
		return (EINVAL);

	case GV_SD_STALE:
		/* Only a rebuild request may kick off a revive. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		/* Revive already in progress; just keep going. */
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clamp the request to the end of the current stripe slice. */
	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/*
	 * Read all healthy subdisks; their XOR (computed on completion)
	 * reconstructs the broken subdisk's contents.
	 */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		/* Private buffer; GV_BIO_MALLOC tells completion to free it. */
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		/* Track the clone in the work packet's bit list. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/*
	 * Prepare the write of the reconstructed data.  This bio is not
	 * enqueued here; it is parked in wp->parity (zero-filled so the
	 * reads can be XORed into it) and issued once all reads complete.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	/* Remember how far the rebuild has progressed. */
	p->synced = boff;

	return (0);
}
190 
/*
 * Build a request group to perform (part of) a RAID5 request.
 *
 * Maps the plex-relative request (boff, bcount, addr) onto the stripe
 * layout, classifies the request as NORMAL (all subdisks up), DEGRADED
 * (data subdisk down), or NOPARITY (parity subdisk down), and clones
 * the child bios needed for bp->bio_cmd accordingly.  Child reads that
 * feed a parity calculation are tracked on wp->bits; the deferred
 * parity write is parked in wp->parity and, for a normal write, the
 * data write is held back in wp->waiting until the old data has been
 * read.
 *
 * Returns 0 on success, ENXIO when the plex or required subdisks are
 * unusable, EINVAL for an unsupported bio command, or ENOMEM when a
 * clone fails.
 */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
    struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
	struct g_geom *gp;
	struct gv_sd *broken, *original, *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	int i, psdno, sdno, type;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
#define	REQ_TYPE_NORMAL		0
#define	REQ_TYPE_DEGRADED	1
#define	REQ_TYPE_NOPARITY	2

	type = REQ_TYPE_NORMAL;
	original = parity = broken = NULL;

	/*
	 * The number of the subdisk containing the parity stripe.  The
	 * parity position rotates with the stripe number across the
	 * subdisks (classic rotating-parity RAID5 layout).
	 */
	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/*
	 * Offset of the start address from the start of the stripe.
	 * A full stripe spans (sdcount - 1) data subdisks.
	 */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the data stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk: data slots skip the parity slot. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within one subdisk's stripe slice. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	/* Bytes left until the end of this stripe slice. */
	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/*
	 * Find the right subdisks: the data subdisk, the parity subdisk,
	 * and (if any) the broken one.  Only the last non-UP subdisk is
	 * remembered — RAID5 tolerates a single failure anyway.
	 */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		type = REQ_TYPE_DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (type != REQ_TYPE_NORMAL)
			return (ENXIO);
		else
			type = REQ_TYPE_NOPARITY;
	}

	/* Clamp the request to the end of the current stripe slice. */
	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/*
	 * While syncing, regions below p->synced have already been
	 * revived, so they can be treated as fully redundant again.
	 */
	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
		type = REQ_TYPE_NORMAL;

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* XOR accumulator: start from zeroes. */
			bzero(wp->data, wp->length);
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				/* Private buffer, freed via GV_BIO_MALLOC. */
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			/* Read straight into the caller's buffer. */
			cbp->bio_data = addr;
			cbp->bio_done = g_std_done;
			cbp->bio_caller2 = original->consumer;

			GV_ENQUEUE(bp, cbp, pbp);
		}
		/* Reads need no stripe lock. */
		wp->lockbase = -1;

		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk.  Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Read all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) || (s == parity))
					continue;

				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_cmd = BIO_READ;
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

			/*
			 * Write the parity data.  Seeded with the new data;
			 * the completed reads are XORed in before it is
			 * issued (it is only parked in wp->parity here).
			 */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			bcopy(addr, cbp->bio_data, real_len);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;
			wp->parity = cbp;

		/*
		 * When the parity stripe is missing we just write out the data.
		 */
		} else if (type == REQ_TYPE_NOPARITY) {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.  (Read-modify-write: old data and old
		 * parity are read, then new parity = old parity ^ old data ^
		 * new data.)
		 */
		} else {
			/* Read old parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Read old data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Write new data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = addr;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;

			cbp->bio_driver1 = wp;

			/*
			 * We must not write the new data until the old data
			 * was read, so hold this BIO back until we're ready
			 * for it.
			 */
			wp->waiting = cbp;

			/* The final bio for the parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			/* Zeroed XOR accumulator for the new parity. */
			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			/* Remember that this is the BIO for the parity data. */
			wp->parity = cbp;
		}
		break;

	default:
		/* Only reads and writes are supported on a RAID5 plex. */
		return (EINVAL);
	}

	return (0);
}
484