/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

static void gv_plex_completed_request(struct gv_plex *, struct bio *);
static void gv_plex_normal_request(struct gv_plex *, struct bio *);
static void gv_plex_worker(void *);
static int gv_check_parity(struct gv_plex *, struct bio *,
    struct gv_raid5_packet *);
static int gv_normal_parity(struct gv_plex *, struct bio *,
    struct gv_raid5_packet *);

/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_plex *p;
	int error;

	g_topology_assert();
	gp = cp->geom;
	g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);

	if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	error = cp->provider->error;
	if (error == 0)
		error = ENXIO;
	g_detach(cp);
	g_destroy_consumer(cp);
	if (!LIST_EMPTY(&gp->consumer))
		return;

	p = gp->softc;
	if (p != NULL) {
		gv_kill_plex_thread(p);
		p->geom = NULL;
		p->provider = NULL;
		p->consumer = NULL;
	}
	gp->softc = NULL;
	g_wither_geom(gp, error);
}

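/*
 * Completion callback for sub-requests that need post-processing by the plex
 * worker (e.g. synchronization requests): mark the bio as done and put it on
 * the plex's bio queue so that gv_plex_worker() picks it up.
 */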
void
gv_plex_done(struct bio *bp)
{
	struct gv_plex *p;
	struct gv_bioq *bq;

	p = bp->bio_from->geom->softc;
	bp->bio_cflags |= GV_BIO_DONE;
	bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
	bq->bp = bp;
	mtx_lock(&p->bqueue_mtx);
	TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
	wakeup(p);
	mtx_unlock(&p->bqueue_mtx);
}

/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
	struct g_geom *gp;
	struct gv_sd *s;
	struct bio *cbp, *pbp;
	int i, sdno;
	off_t len_left, real_len, real_off;
	off_t stripeend, stripeno, stripestart;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	s = NULL;
	gp = bp->bio_to->geom;

	/*
	 * We only handle concatenated and striped plexes here.  RAID5 plexes
	 * are handled in gv_build_raid5_req().
	 */
	switch (p->org) {
	case GV_PLEX_CONCAT:
		/*
		 * Find the subdisk where this request starts.  The subdisks in
		 * this list must be ordered by plex_offset.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->plex_offset <= boff &&
			    s->plex_offset + s->size > boff)
				break;
		}
		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* Calculate corresponding offsets on disk. */
		real_off = boff - s->plex_offset;
		len_left = s->size - real_off;
		real_len = (bcount > len_left) ? len_left : bcount;
		break;

	case GV_PLEX_STRIPED:
		/* The number of the stripe where the request starts. */
		stripeno = boff / p->stripesize;

		/* The number of the subdisk where the stripe resides. */
		sdno = stripeno % p->sdcount;

		/* Find the right subdisk. */
		i = 0;
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (i == sdno)
				break;
			i++;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* The offset of the stripe from the start of the subdisk. */
		stripestart = (stripeno / p->sdcount) *
		    p->stripesize;

		/* The offset at the end of the stripe. */
		stripeend = stripestart + p->stripesize;

		/* The offset of the request on this subdisk. */
		real_off = boff - (stripeno * p->stripesize) +
		    stripestart;

		/* The length left in this stripe. */
		len_left = stripeend - real_off;

		real_len = (bcount <= len_left) ? bcount : len_left;
		break;

	default:
		return (EINVAL);
	}
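
	/*
	 * Worked example for the striped case (hypothetical values):
	 * stripesize = 65536, sdcount = 3, boff = 300000, bcount = 100000.
	 * Then stripeno = 300000 / 65536 = 4, so the request starts on
	 * subdisk 4 % 3 = 1 (the second one).  That stripe begins at
	 * stripestart = (4 / 3) * 65536 = 65536 on the subdisk, the offset
	 * within the stripe is 300000 - 4 * 65536 = 37856, hence
	 * real_off = 103392 and len_left = 131072 - 103392 = 27680.  Only
	 * 27680 of the 100000 bytes fit into this stripe; the rest is picked
	 * up by the splitting loop in gv_plex_normal_request().
	 */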

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;

	case GV_SD_STALE:
		if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is initializing\n", s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_INITIALIZING:
		if (bp->bio_cmd == BIO_READ)
			return (ENXIO);
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	cbp->bio_done = g_std_done;
	cbp->bio_caller2 = s->consumer;
	if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
		cbp->bio_cflags |= GV_BIO_SYNCREQ;
		cbp->bio_done = gv_plex_done;
	}

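	/*
	 * The cloned sub-requests that belong to one original bio are kept on
	 * a simple singly linked list: bp->bio_driver1 points to the first
	 * clone and each clone's bio_caller1 points to the next one.
	 * gv_plex_normal_request() walks this list when it fires off the
	 * sub-requests.
	 */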
	if (bp->bio_driver1 == NULL) {
		bp->bio_driver1 = cbp;
	} else {
		pbp = bp->bio_driver1;
		while (pbp->bio_caller1 != NULL)
			pbp = pbp->bio_caller1;
		pbp->bio_caller1 = cbp;
	}

	return (0);
}

static void
gv_plex_start(struct bio *bp)
{
	struct gv_plex *p;
	struct gv_bioq *bq;

	switch(bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	/*
	 * We cannot handle this request if too many of our subdisks are
	 * inaccessible.
	 */
	p = bp->bio_to->geom->softc;
	if ((p->state < GV_PLEX_DEGRADED) &&
	    !(bp->bio_cflags & GV_BIO_SYNCREQ)) {
		g_io_deliver(bp, ENXIO);
		return;
	}

	bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
	if (bq == NULL) {
		/* The queue entry could not be allocated; fail the bio. */
		g_io_deliver(bp, ENOMEM);
		return;
	}
	bq->bp = bp;
	mtx_lock(&p->bqueue_mtx);
	TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
	wakeup(p);
	mtx_unlock(&p->bqueue_mtx);
}

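/*
 * Per-plex worker thread.  It sleeps on the plex's bio queue and sorts each
 * queued bio by its cflags: GV_BIO_DONE marks a completed sub-request, which
 * is finished via g_std_done() or gv_plex_completed_request() (completed
 * synchronization and rebuild requests also advance the subdisk's
 * initialized counter); GV_BIO_ONHOLD marks a sub-request that was deferred
 * because its stripe was still busy; everything else is a fresh request that
 * is handed to gv_plex_normal_request().
 */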
static void
gv_plex_worker(void *arg)
{
	struct bio *bp;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_bioq *bq;

	p = arg;
	KASSERT(p != NULL, ("NULL p"));

	mtx_lock(&p->bqueue_mtx);
	for (;;) {
		/* We were signaled to exit. */
		if (p->flags & GV_PLEX_THREAD_DIE)
			break;

		/* Take the first BIO from our queue. */
		bq = TAILQ_FIRST(&p->bqueue);
		if (bq == NULL) {
			msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
			continue;
		}
		TAILQ_REMOVE(&p->bqueue, bq, queue);
		mtx_unlock(&p->bqueue_mtx);

		bp = bq->bp;

		/* A completed request. */
		if (bp->bio_cflags & GV_BIO_DONE) {
			g_free(bq);

			if (bp->bio_cflags & GV_BIO_SYNCREQ ||
			    bp->bio_cflags & GV_BIO_REBUILD) {
				s = bp->bio_to->private;
				if (bp->bio_error == 0)
					s->initialized += bp->bio_length;
				if (s->initialized >= s->size) {
					g_topology_lock();
					gv_set_sd_state(s, GV_SD_UP,
					    GV_SETSTATE_CONFIG);
					g_topology_unlock();
					s->initialized = 0;
				}
			}

			if (bp->bio_cflags & GV_BIO_SYNCREQ)
				g_std_done(bp);
			else
				gv_plex_completed_request(p, bp);
		/*
		 * A sub-request that was held back because it interfered with
		 * another sub-request.
		 */
		} else if (bp->bio_cflags & GV_BIO_ONHOLD) {
			/* Is it still locked out? */
			if (gv_stripe_active(p, bp)) {
				/* Park the bio on the waiting queue. */
				mtx_lock(&p->bqueue_mtx);
				TAILQ_INSERT_TAIL(&p->wqueue, bq, queue);
				mtx_unlock(&p->bqueue_mtx);
			} else {
				g_free(bq);
				bp->bio_cflags &= ~GV_BIO_ONHOLD;
				g_io_request(bp, bp->bio_caller2);
			}

		/* A normal request to this plex. */
		} else {
			g_free(bq);
			gv_plex_normal_request(p, bp);
		}

		mtx_lock(&p->bqueue_mtx);
	}
	mtx_unlock(&p->bqueue_mtx);
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);

	kthread_exit(ENXIO);
}

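/*
 * Parity handling for the normal RAID5 write path.  When the last data
 * sub-request of a packet has come in, the deferred data write (wp->waiting)
 * is XORed into the parity buffer and issued; once that one has completed,
 * the parity write itself (wp->parity) goes out.  Returns 1 when nothing is
 * left to do for this packet, 0 while sub-requests are still in flight.
 */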
static int
gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *cbp, *pbp;
	int finished, i;

	finished = 1;

	if (wp->waiting != NULL) {
		pbp = wp->waiting;
		wp->waiting = NULL;
		cbp = wp->parity;
		for (i = 0; i < wp->length; i++)
			cbp->bio_data[i] ^= pbp->bio_data[i];
		g_io_request(pbp, pbp->bio_caller2);
		finished = 0;

	} else if (wp->parity != NULL) {
		cbp = wp->parity;
		wp->parity = NULL;
		g_io_request(cbp, cbp->bio_caller2);
		finished = 0;
	}

	return (finished);
}

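/*
 * Parity handling for parity check requests (gvinum checkparity/
 * rebuildparity).  The parity computed from the data blocks is compared byte
 * by byte with the existing parity; on a mismatch the original request is
 * flagged with EAGAIN, and the corrected parity is only written out when the
 * caller asked for a rebuild (GV_BIO_PARITY).  Returns 1 when the packet is
 * finished, 0 while sub-requests are still in flight.
 */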
static int
gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *cbp, *pbp;
	int err, finished, i;

	err = 0;
	finished = 1;

	if (wp->waiting != NULL) {
		pbp = wp->waiting;
		wp->waiting = NULL;
		g_io_request(pbp, pbp->bio_caller2);
		finished = 0;

	} else if (wp->parity != NULL) {
		cbp = wp->parity;
		wp->parity = NULL;

		/* Check if the parity is correct. */
		for (i = 0; i < wp->length; i++) {
			if (bp->bio_data[i] != cbp->bio_data[i]) {
				err = 1;
				break;
			}
		}

		/* The parity is not correct... */
		if (err) {
			bp->bio_parent->bio_error = EAGAIN;

			/* ... but we rebuild it. */
			if (bp->bio_parent->bio_cflags & GV_BIO_PARITY) {
				g_io_request(cbp, cbp->bio_caller2);
				finished = 0;
			}
		}

		/*
		 * Clean up the BIO we would have used for rebuilding the
		 * parity.
		 */
		if (finished) {
			bp->bio_parent->bio_inbed++;
			g_destroy_bio(cbp);
		}

	}

	return (finished);
}

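/*
 * Post-process a completed RAID5 sub-request: take it off its packet, fold
 * its data into the pending parity or reconstruction buffer and, once the
 * whole packet has finished, account the completed bytes to the parent bio,
 * release the stripe and re-queue any requests that were held back.  The
 * parent bio is delivered as soon as all of its children are in.
 */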
void
gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
{
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp;
	int i;

	wp = bp->bio_driver1;

	switch (bp->bio_parent->bio_cmd) {
	case BIO_READ:
		if (wp == NULL)
			break;

		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp == bp) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
				for (i = 0; i < wp->length; i++)
					wp->data[i] ^= bp->bio_data[i];
				break;
			}
		}
		if (TAILQ_EMPTY(&wp->bits)) {
			bp->bio_parent->bio_completed += wp->length;
			if (wp->lockbase != -1) {
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				mtx_lock(&p->bqueue_mtx);
				TAILQ_CONCAT(&p->bqueue, &p->wqueue, queue);
				mtx_unlock(&p->bqueue_mtx);
			}
			g_free(wp);
		}

		break;

	case BIO_WRITE:
		if (wp == NULL)
			break;

		/* Check if we need to handle parity data. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp == bp) {
				TAILQ_REMOVE(&wp->bits, bq, queue);
				g_free(bq);
				cbp = wp->parity;
				if (cbp != NULL) {
					for (i = 0; i < wp->length; i++)
						cbp->bio_data[i] ^=
						    bp->bio_data[i];
				}
				break;
			}
		}

		/* Handle parity data. */
		if (TAILQ_EMPTY(&wp->bits)) {
			if (bp->bio_parent->bio_cflags & GV_BIO_CHECK)
				i = gv_check_parity(p, bp, wp);
			else
				i = gv_normal_parity(p, bp, wp);

			/* All of our sub-requests have finished. */
			if (i) {
				bp->bio_parent->bio_completed += wp->length;
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				mtx_lock(&p->bqueue_mtx);
				TAILQ_CONCAT(&p->bqueue, &p->wqueue, queue);
				mtx_unlock(&p->bqueue_mtx);
				g_free(wp);
			}
		}

		break;
	}

	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;

	/* When the original request is finished, we deliver it. */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children)
		g_io_deliver(pbp, pbp->bio_error);

	/* Clean up what we allocated. */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}

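/*
 * Start a request on this plex: split it along subdisk or stripe boundaries,
 * build the cloned sub-requests (and, for RAID5, the packets that go with
 * them) and fire them off, holding back RAID5 sub-requests whose stripe is
 * still in use by another packet.
 */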
void
gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
{
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp, *wp2;
	caddr_t addr;
	off_t bcount, boff;
	int err;

	bcount = bp->bio_length;
	addr = bp->bio_data;
	boff = bp->bio_offset;

	/* Walk over the whole length of the request; we might split it up. */
	while (bcount > 0) {
		wp = NULL;

		/*
		 * RAID5 plexes need special treatment, as a single write
		 * request involves several read/write sub-requests.
		 */
		if (p->org == GV_PLEX_RAID5) {
			wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
			wp->bio = bp;
			TAILQ_INIT(&wp->bits);

			if (bp->bio_cflags & GV_BIO_REBUILD)
				err = gv_rebuild_raid5(p, wp, bp, addr,
				    boff, bcount);
			else if (bp->bio_cflags & GV_BIO_CHECK)
				err = gv_check_raid5(p, wp, bp, addr,
				    boff, bcount);
			else
				err = gv_build_raid5_req(p, wp, bp, addr,
				    boff, bcount);

			/*
			 * Building the sub-request failed; we probably need to
			 * clean up a lot.
			 */
			if (err) {
				printf("GEOM_VINUM: plex request failed for ");
				g_print_bio(bp);
				printf("\n");
				TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
					TAILQ_REMOVE(&wp->bits, bq, queue);
					g_free(bq);
				}
				if (wp->waiting != NULL) {
					if (wp->waiting->bio_cflags &
					    GV_BIO_MALLOC)
						g_free(wp->waiting->bio_data);
					g_destroy_bio(wp->waiting);
				}
				if (wp->parity != NULL) {
					if (wp->parity->bio_cflags &
					    GV_BIO_MALLOC)
						g_free(wp->parity->bio_data);
					g_destroy_bio(wp->parity);
				}
				g_free(wp);

				TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
					if (wp->bio == bp) {
						TAILQ_REMOVE(&p->packets, wp,
						    list);
						TAILQ_FOREACH_SAFE(bq,
						    &wp->bits, queue, bq2) {
							TAILQ_REMOVE(&wp->bits,
							    bq, queue);
							g_free(bq);
						}
						g_free(wp);
					}
				}

				cbp = bp->bio_driver1;
				while (cbp != NULL) {
					pbp = cbp->bio_caller1;
					if (cbp->bio_cflags & GV_BIO_MALLOC)
						g_free(cbp->bio_data);
					g_destroy_bio(cbp);
					cbp = pbp;
				}

				g_io_deliver(bp, err);
				return;
			}

			if (TAILQ_EMPTY(&wp->bits))
				g_free(wp);
			else if (wp->lockbase != -1)
				TAILQ_INSERT_TAIL(&p->packets, wp, list);

		/*
		 * Requests to concatenated and striped plexes go straight
		 * through.
		 */
		} else {
			err = gv_plexbuffer(p, bp, addr, boff, bcount);

			/* Building the sub-request failed. */
			if (err) {
				printf("GEOM_VINUM: plex request failed for ");
				g_print_bio(bp);
				printf("\n");
				cbp = bp->bio_driver1;
				while (cbp != NULL) {
					pbp = cbp->bio_caller1;
					g_destroy_bio(cbp);
					cbp = pbp;
				}
				g_io_deliver(bp, err);
				return;
			}
		}

		/* Abuse bio_caller1 as a linked list. */
		pbp = bp->bio_driver1;
		while (pbp->bio_caller1 != NULL)
			pbp = pbp->bio_caller1;
		bcount -= pbp->bio_length;
		addr += pbp->bio_length;
		boff += pbp->bio_length;
	}

	/* Fire off all sub-requests. */
	pbp = bp->bio_driver1;
	while (pbp != NULL) {
		/*
		 * RAID5 sub-requests need to come in the correct order;
		 * otherwise we trip over the parity, as it might be
		 * overwritten by another sub-request.
		 */
		if (pbp->bio_driver1 != NULL &&
		    gv_stripe_active(p, pbp)) {
			/* Park the bio on the waiting queue. */
			pbp->bio_cflags |= GV_BIO_ONHOLD;
			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = pbp;
			mtx_lock(&p->bqueue_mtx);
			TAILQ_INSERT_TAIL(&p->wqueue, bq, queue);
			mtx_unlock(&p->bqueue_mtx);
		} else
			g_io_request(pbp, pbp->bio_caller2);
		pbp = pbp->bio_caller1;
	}
}

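/*
 * Pass an access request down to all attached subdisk consumers.  If one of
 * them fails, roll back the counts that were already granted and return the
 * error.
 */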
static int
gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	int error;

	gp = pp->geom;

	error = ENXIO;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		error = g_access(cp, dr, dw, de);
		if (error) {
			LIST_FOREACH(cp2, &gp->consumer, consumer) {
				if (cp == cp2)
					break;
				g_access(cp2, -dr, -dw, -de);
			}
			return (error);
		}
	}
	return (error);
}

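/*
 * Taste a provider offered by a VINUMDRIVE subdisk: look up the subdisk and
 * the plex it belongs to, attach a consumer for it and, if the plex does not
 * have a geom yet, create it together with its provider and worker thread.
 */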
static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	struct g_provider *pp2;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_softc *sc;
	int error;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	gp = find_vinum_geom();
	if (gp == NULL)
		return (NULL);
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	s = pp->private;
	KASSERT(s != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the plex this subdisk belongs to. */
	p = gv_find_plex(sc, s->plex);
	KASSERT(p != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(p, s, 0);

	/* Now check if there's already a geom for this plex. */
	gp = p->geom;

	/* Yes, there is already a geom, so we just add the consumer. */
	if (gp != NULL) {
		cp2 = LIST_FIRST(&gp->consumer);
		/* Need to attach a new consumer to this subdisk. */
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error) {
			printf("geom_vinum: couldn't attach consumer to %s\n",
			    pp->name);
			g_destroy_consumer(cp);
			return (NULL);
		}
		/* Adjust the access counts of the new consumer. */
		if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) {
			error = g_access(cp, cp2->acr, cp2->acw, cp2->ace);
			if (error) {
				printf("geom_vinum: couldn't set access counts"
				    " for consumer on %s\n", pp->name);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
		}
		s->consumer = cp;

		/* Adjust the size of the providers this plex has. */
		LIST_FOREACH(pp2, &gp->provider, provider)
			pp2->mediasize = p->size;

		/* Update the size of the volume this plex is attached to. */
		if (p->vol_sc != NULL)
			gv_update_vol_size(p->vol_sc, p->size);

		/*
		 * If necessary, create a bio queue mutex and a worker thread.
		 */
		if (mtx_initialized(&p->bqueue_mtx) == 0)
			mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
		if (!(p->flags & GV_PLEX_THREAD_ACTIVE)) {
			kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
			    p->name);
			p->flags |= GV_PLEX_THREAD_ACTIVE;
		}

		return (NULL);

	/* We need to create a new geom. */
	} else {
		gp = g_new_geomf(mp, "%s", p->name);
		gp->start = gv_plex_start;
		gp->orphan = gv_plex_orphan;
		gp->access = gv_plex_access;
		gp->softc = p;
		p->geom = gp;

		TAILQ_INIT(&p->packets);
		TAILQ_INIT(&p->bqueue);
		TAILQ_INIT(&p->wqueue);
		mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
		kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
		    p->name);
		p->flags |= GV_PLEX_THREAD_ACTIVE;

		/* Attach a consumer to this provider. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Create a provider for the outside world. */
		pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
		pp2->mediasize = p->size;
		pp2->sectorsize = pp->sectorsize;
		p->provider = pp2;
		g_error_provider(pp2, 0);
		return (gp);
	}
}

static int
gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_plex *p;

	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
	g_topology_assert();

	p = gp->softc;

	KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));

	/*
	 * Check whether the plex's worker thread is still active and signal
	 * it to self-destruct.
	 */
	gv_kill_plex_thread(p);
	/* g_free(sc); */
	g_wither_geom(gp, ENXIO);
	return (0);
}

#define	VINUMPLEX_CLASS_NAME "VINUMPLEX"

static struct g_class g_vinum_plex_class = {
	.name = VINUMPLEX_CLASS_NAME,
	.version = G_VERSION,
	.taste = gv_plex_taste,
	.destroy_geom = gv_plex_destroy_geom,
};

DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);