xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision c3dba6d0)
1 /*-
2  * Copyright (c) 2004 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46 
47 int	gv_raid5_parity(struct gv_raid5_packet *);
48 int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
49 
50 struct gv_raid5_bit *
51 gv_new_raid5_bit(void)
52 {
53 	struct gv_raid5_bit *r;
54 	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
55 	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
56 	return (r);
57 }
58 
59 struct gv_raid5_packet *
60 gv_new_raid5_packet(void)
61 {
62 	struct gv_raid5_packet *wp;
63 
64 	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
65 	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
66 	wp->state = SETUP;
67 	wp->type = JUNK;
68 	TAILQ_INIT(&wp->bits);
69 
70 	return (wp);
71 }
72 
73 /*
74  * Check if the stripe that the work packet wants is already being used by
75  * some other work packet.
76  */
77 int
78 gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
79 {
80 	struct gv_raid5_packet *wpa;
81 
82 	TAILQ_FOREACH(wpa, &sc->worklist, list) {
83 		if (wpa->lockbase == wp->lockbase) {
84 			if (wpa->bio == wp->bio)
85 				return (0);
86 			return (1);
87 		}
88 	}
89 	return (0);
90 }
91 
/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.  One
 * such thread runs per RAID5 plex; 'arg' is the plex's geom.  The loop
 * exits when GV_PLEX_THREAD_DIE is set on the plex, after which the
 * thread marks itself GV_PLEX_THREAD_DEAD and wakes up the plex.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;
	p = gp->softc;

	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				wp->state = IO;
				/*
				 * Drop the worklist mutex while firing off
				 * the subrequests -- presumably to avoid
				 * holding it across the I/O path (confirm
				 * against GEOM locking rules).
				 */
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				/* Rescan the worklist before sleeping. */
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			/*
			 * Sleep until new work arrives; the hz/100 timeout
			 * makes us poll in case a wakeup is missed.
			 * NOTE(review): the msleep() return value is stored
			 * but never examined.
			 */
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}
167 
168 /* Final bio transaction to write out the parity data. */
169 int
170 gv_raid5_parity(struct gv_raid5_packet *wp)
171 {
172 	struct bio *bp;
173 
174 	bp = g_new_bio();
175 	if (bp == NULL)
176 		return (ENOMEM);
177 
178 	wp->type = ISPARITY;
179 	bp->bio_cmd = BIO_WRITE;
180 	bp->bio_data = wp->buf;
181 	bp->bio_offset = wp->offset;
182 	bp->bio_length = wp->length;
183 	bp->bio_done = gv_raid5_done;
184 	bp->bio_caller1 = wp;
185 	bp->bio_caller2 = NULL;
186 	g_io_request(bp, wp->parity);
187 
188 	return (0);
189 }
190 
191 /* We end up here after each subrequest. */
192 void
193 gv_raid5_done(struct bio *bp)
194 {
195 	struct bio *obp;
196 	struct g_geom *gp;
197 	struct gv_plex *p;
198 	struct gv_raid5_packet *wp;
199 	struct gv_raid5_bit *rbp;
200 	off_t i;
201 	int error;
202 
203 	wp = bp->bio_caller1;
204 	rbp = bp->bio_caller2;
205 	obp = wp->bio;
206 	gp = bp->bio_from->geom;
207 	p = gp->softc;
208 
209 	/* One less active subrequest. */
210 	wp->active--;
211 
212 	switch (obp->bio_cmd) {
213 	case BIO_READ:
214 		/* Degraded reads need to handle parity data. */
215 		if (wp->type == DEGRADED) {
216 			for (i = 0; i < wp->length; i++)
217 				wp->buf[i] ^= bp->bio_data[i];
218 
219 			/* When we're finished copy back the data we want. */
220 			if (wp->active == 0)
221 				bcopy(wp->buf, wp->data, wp->length);
222 		}
223 
224 		break;
225 
226 	case BIO_WRITE:
227 		/* Handle the parity data, if needed. */
228 		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
229 			for (i = 0; i < wp->length; i++)
230 				wp->buf[i] ^= bp->bio_data[i];
231 
232 			/* Write out the parity data we calculated. */
233 			if (wp->active == 0) {
234 				wp->active++;
235 				error = gv_raid5_parity(wp);
236 			}
237 		}
238 		break;
239 	}
240 
241 	g_destroy_bio(bp);
242 
243 	if (rbp != NULL) {
244 		if (rbp->malloc == 1)
245 			g_free(rbp->buf);
246 		TAILQ_REMOVE(&wp->bits, rbp, list);
247 		g_free(rbp);
248 	}
249 
250 	/* This request group is done. */
251 	if (wp->active == 0)
252 		wp->state = FINISH;
253 }
254 
/*
 * Build a request group to perform (part of) a RAID5 request.
 *
 * wp     - pre-allocated work packet that receives the subrequests
 * bp     - the original bio directed at the plex
 * addr   - kernel buffer address for this part of the transfer
 * bcount - remaining byte count of the original request
 * boff   - byte offset of this part relative to the start of the plex
 *
 * Returns 0 on success (and sets wp->state to VALID), ENXIO when needed
 * subdisks are missing or a double failure is detected, ENOMEM on
 * allocation failure, and EINVAL for unknown bio commands.
 *
 * NOTE(review): on the ENOMEM paths below the just-allocated rbp, and
 * any subrequests already queued on wp->bits, are not freed here --
 * presumably the caller tears the packet down on error; verify against
 * the caller.
 */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
    long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_bit *rbp;
	struct gv_sd *broken, *original, *parity, *s;
	int i, psdno, sdno;
	off_t len_left, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
	wp->type = NORMAL;
	original = parity = broken = NULL;

	/* The number of the subdisk containing the parity stripe. */
	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within the stripe on this subdisk. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	/* Bytes left until the end of this stripe. */
	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		wp->type = DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (wp->type != NORMAL)
			return (ENXIO);
		else
			wp->type = NOPARITY;
	}

	/*
	 * A combined write is necessary when the original data subdisk and the
	 * parity subdisk are both up, but one of the other subdisks isn't.
	 */
	if ((broken != NULL) && (broken != parity) && (broken != original))
		wp->type = COMBINED;

	wp->offset = real_off;
	/* Clamp this transfer to the remainder of the stripe. */
	wp->length = (bcount <= len_left) ? bcount : len_left;
	wp->data = addr;
	wp->original = original->consumer;
	wp->parity = parity->consumer;
	wp->lockbase = stripestart;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
			if (wp->buf == NULL)
				return (ENOMEM);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
					M_NOWAIT | M_ZERO);
				if (rbp->buf == NULL)
					return (ENOMEM);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_READ;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			/* Read straight into the caller's buffer. */
			rbp->buf = addr;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;
		}
		/*
		 * Reads don't modify the stripe; -1 disables stripe locking
		 * for this packet (see gv_raid5_worker()).
		 */
		if (wp->type != COMBINED)
			wp->lockbase = -1;
		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk.  Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
			if (wp->buf == NULL)
				return (ENOMEM);
			wp->bufmalloc = 1;

			/* Copy the original data. */
			bcopy(wp->data, wp->buf, wp->length);

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) ||
				    (s->consumer == wp->parity))
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_NOWAIT | M_ZERO);
				if (rbp->buf == NULL)
					return (ENOMEM);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/*
		 * When we don't have the parity stripe we just write out the
		 * data.
		 */
		} else if (wp->type == NOPARITY) {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_data = addr;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

		/*
		 * A combined write means that our data subdisk and the parity
		 * subdisks are both up, but another subdisk isn't.  We need to
		 * read all valid stripes including the parity to recalculate
		 * the data of the stripe that is missing.  Then we write our
		 * original data, and together with the other data stripes
		 * recalculate the parity again.
		 */
		} else if (wp->type == COMBINED) {
			wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
			if (wp->buf == NULL)
				return (ENOMEM);
			wp->bufmalloc = 1;

			/* Get the data from all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_NOWAIT | M_ZERO);
				if (rbp->buf == NULL)
					return (ENOMEM);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

			/* Write the original data. */
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->buf = addr;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			/*
			 * Insert at the tail, because we want to read the old
			 * data first.
			 */
			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

			/* Get the rest of the data again. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/*
				 * Skip the broken subdisk, the parity, and the
				 * one we just wrote.
				 */
				if ((s == broken) ||
				    (s->consumer == wp->parity) ||
				    (s->consumer == wp->original))
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_NOWAIT | M_ZERO);
				if (rbp->buf == NULL)
					return (ENOMEM);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				/*
				 * Again, insert at the tail to keep correct
				 * order.
				 */
				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}


		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
			if (wp->buf == NULL)
				return (ENOMEM);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the parity stripe. */
				if (s->consumer == wp->parity)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				/*
				 * The data for the original stripe is written,
				 * the others need to be read in for the parity
				 * calculation.
				 */
				if (s->consumer == wp->original) {
					rbp->bio->bio_cmd = BIO_WRITE;
					rbp->buf = addr;
				} else {
					rbp->bio->bio_cmd = BIO_READ;
					rbp->buf = g_malloc(wp->length,
					    M_NOWAIT | M_ZERO);
					if (rbp->buf == NULL)
						return (ENOMEM);
					rbp->malloc = 1;
				}
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}
		}
		break;
	default:
		return (EINVAL);
	}

	/* Ready for the worker thread to pick up. */
	wp->state = VALID;
	return (0);
}
632