xref: /freebsd/sys/geom/vinum/geom_vinum_drive.c (revision 39beb93c)
1 /*-
2  * Copyright (c) 2004, 2005 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/errno.h>
33 #include <sys/endian.h>
34 #include <sys/conf.h>
35 #include <sys/kernel.h>
36 #include <sys/kthread.h>
37 #include <sys/libkern.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/module.h>
41 #include <sys/mutex.h>
42 #include <sys/sbuf.h>
43 #include <sys/systm.h>
44 #include <sys/time.h>
45 #include <sys/vimage.h>
46 
47 #include <geom/geom.h>
48 #include <geom/vinum/geom_vinum_var.h>
49 #include <geom/vinum/geom_vinum.h>
50 #include <geom/vinum/geom_vinum_share.h>
51 
#define GV_LEGACY_I386	0
#define GV_LEGACY_AMD64 1
#define GV_LEGACY_SPARC64 2
#define GV_LEGACY_POWERPC 3

static void	gv_drive_dead(void *, int);
static void	gv_drive_worker(void *);
static int	gv_legacy_header_type(uint8_t *, int);

/*
 * Here are the "offset (size)" for the various struct gv_hdr fields,
 * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and
 * current (cpu & endian agnostic) versions of the on-disk format of the vinum
 * header structure:
 *
 *       i386    amd64   current   field
 *     -------- -------- --------  -----
 *       0 ( 8)   0 ( 8)   0 ( 8)  magic
 *       8 ( 4)   8 ( 8)   8 ( 8)  config_length
 *      12 (32)  16 (32)  16 (32)  label.sysname
 *      44 (32)  48 (32)  48 (32)  label.name
 *      76 ( 4)  80 ( 8)  80 ( 8)  label.date_of_birth.tv_sec
 *      80 ( 4)  88 ( 8)  88 ( 8)  label.date_of_birth.tv_usec
 *      84 ( 4)  96 ( 8)  96 ( 8)  label.last_update.tv_sec
 *      88 ( 4) 104 ( 8) 104 ( 8)  label.last_update.tv_usec
 *      92 ( 8) 112 ( 8) 112 ( 8)  label.drive_size
 *     ======== ======== ========
 *     100      120      120       total size
 *
 * NOTE: i386 and amd64 formats are stored as little-endian; the current
 * format uses big-endian (network order).
 */

/* Return 1 if any byte in hdr[from .. to-1] is non-zero, 0 otherwise. */
static int
gv_hdr_range_nonzero(const uint8_t *hdr, int from, int to)
{
	int pos;

	for (pos = from; pos < to; pos++) {
		if (hdr[pos] != 0)
			return (1);
	}
	return (0);
}

/*
 * Guess which legacy on-disk layout a raw header uses.
 *
 * 'hdr' must provide at least 120 readable bytes.  'bigendian' selects the
 * candidate pair: non-zero picks between GV_LEGACY_POWERPC (32-bit) and
 * GV_LEGACY_SPARC64 (64-bit), zero picks between GV_LEGACY_I386 (32-bit)
 * and GV_LEGACY_AMD64 (64-bit).  The decision is a heuristic based on which
 * bytes of the header are zero; see the layout table above.
 */
static int
gv_legacy_header_type(uint8_t *hdr, int bigendian)
{
	const int arch_32 = bigendian ? GV_LEGACY_POWERPC : GV_LEGACY_I386;
	const int arch_64 = bigendian ? GV_LEGACY_SPARC64 : GV_LEGACY_AMD64;

	/* A non-empty hostname at 12 overlaps the 64-bit config_length. */
	if (gv_hdr_range_nonzero(hdr, 12, 16))
		return (arch_32);

	/* A non-empty hostname at the 64-bit sysname offset. */
	if (hdr[16] != 0)
		return (arch_64);

	/* Anything stored past the end of the 100-byte 32-bit structure. */
	if (gv_hdr_range_nonzero(hdr, 100, 120))
		return (arch_32);

	/* Offset 84 is where the 32-bit layout keeps last_update.tv_sec. */
	return (gv_hdr_range_nonzero(hdr, 84, 88) ? arch_32 : arch_64);
}
120 
121 /*
122  * Read the header while taking magic number into account, and write it to
123  * destination pointer.
124  */
125 int
126 gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
127 {
128 	struct g_provider *pp;
129 	uint64_t magic_machdep;
130 	uint8_t *d_hdr;
131 	int be, off;
132 
133 #define GV_GET32(endian)					\
134 		endian##32toh(*((uint32_t *)&d_hdr[off]));	\
135 		off += 4
136 #define GV_GET64(endian)					\
137 		endian##64toh(*((uint64_t *)&d_hdr[off]));	\
138 		off += 8
139 
140 	KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr"));
141 	KASSERT(cp != NULL, ("gv_read_header: null cp"));
142 	pp = cp->provider;
143 	KASSERT(pp != NULL, ("gv_read_header: null pp"));
144 
145 	d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL);
146 	if (d_hdr == NULL)
147 		return (-1);
148 	off = 0;
149 	m_hdr->magic = GV_GET64(be);
150 	magic_machdep = *((uint64_t *)&d_hdr[0]);
151 	/*
152 	 * The big endian machines will have a reverse of GV_OLD_MAGIC, so we
153 	 * need to decide if we are running on a big endian machine as well as
154 	 * checking the magic against the reverse of GV_OLD_MAGIC.
155 	 */
156 	be = (m_hdr->magic == magic_machdep);
157 	if (m_hdr->magic == GV_MAGIC) {
158 		m_hdr->config_length = GV_GET64(be);
159 		off = 16;
160 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
161 		off += GV_HOSTNAME_LEN;
162 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
163 		off += GV_MAXDRIVENAME;
164 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
165 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
166 		m_hdr->label.last_update.tv_sec = GV_GET64(be);
167 		m_hdr->label.last_update.tv_usec = GV_GET64(be);
168 		m_hdr->label.drive_size = GV_GET64(be);
169 	} else if (m_hdr->magic != GV_OLD_MAGIC &&
170 	    m_hdr->magic != le64toh(GV_OLD_MAGIC)) {
171 		/* Not a gvinum drive. */
172 		g_free(d_hdr);
173 		return (-1);
174 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) {
175 		G_VINUM_DEBUG(1, "detected legacy sparc64 header");
176 		m_hdr->magic = GV_MAGIC;
177 		/* Legacy sparc64 on-disk header */
178 		m_hdr->config_length = GV_GET64(be);
179 		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
180 		off += GV_HOSTNAME_LEN;
181 		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
182 		off += GV_MAXDRIVENAME;
183 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
184 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
185 		m_hdr->label.last_update.tv_sec = GV_GET64(be);
186 		m_hdr->label.last_update.tv_usec = GV_GET64(be);
187 		m_hdr->label.drive_size = GV_GET64(be);
188 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) {
189 		G_VINUM_DEBUG(1, "detected legacy PowerPC header");
190 		m_hdr->magic = GV_MAGIC;
191 		/* legacy 32-bit big endian on-disk header */
192 		m_hdr->config_length = GV_GET32(be);
193 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
194 		off += GV_HOSTNAME_LEN;
195 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
196 		off += GV_MAXDRIVENAME;
197 		m_hdr->label.date_of_birth.tv_sec = GV_GET32(be);
198 		m_hdr->label.date_of_birth.tv_usec = GV_GET32(be);
199 		m_hdr->label.last_update.tv_sec = GV_GET32(be);
200 		m_hdr->label.last_update.tv_usec = GV_GET32(be);
201 		m_hdr->label.drive_size = GV_GET64(be);
202 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) {
203 		G_VINUM_DEBUG(1, "detected legacy i386 header");
204 		m_hdr->magic = GV_MAGIC;
205 		/* legacy i386 on-disk header */
206 		m_hdr->config_length = GV_GET32(le);
207 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
208 		off += GV_HOSTNAME_LEN;
209 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
210 		off += GV_MAXDRIVENAME;
211 		m_hdr->label.date_of_birth.tv_sec = GV_GET32(le);
212 		m_hdr->label.date_of_birth.tv_usec = GV_GET32(le);
213 		m_hdr->label.last_update.tv_sec = GV_GET32(le);
214 		m_hdr->label.last_update.tv_usec = GV_GET32(le);
215 		m_hdr->label.drive_size = GV_GET64(le);
216 	} else {
217 		G_VINUM_DEBUG(1, "detected legacy amd64 header");
218 		m_hdr->magic = GV_MAGIC;
219 		/* legacy amd64 on-disk header */
220 		m_hdr->config_length = GV_GET64(le);
221 		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
222 		off += GV_HOSTNAME_LEN;
223 		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
224 		off += GV_MAXDRIVENAME;
225 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(le);
226 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(le);
227 		m_hdr->label.last_update.tv_sec = GV_GET64(le);
228 		m_hdr->label.last_update.tv_usec = GV_GET64(le);
229 		m_hdr->label.drive_size = GV_GET64(le);
230 	}
231 
232 	g_free(d_hdr);
233 	return (0);
234 }
235 
236 /* Write out the gvinum header. */
237 int
238 gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
239 {
240 	uint8_t d_hdr[GV_HDR_LEN];
241 	int off, ret;
242 
243 #define GV_SET64BE(field)					\
244 	do {							\
245 		*((uint64_t *)&d_hdr[off]) = htobe64(field);	\
246 		off += 8;					\
247 	} while (0)
248 
249 	KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr"));
250 
251 	off = 0;
252 	memset(d_hdr, 0, GV_HDR_LEN);
253 	GV_SET64BE(m_hdr->magic);
254 	GV_SET64BE(m_hdr->config_length);
255 	off = 16;
256 	bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN);
257 	off += GV_HOSTNAME_LEN;
258 	bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME);
259 	off += GV_MAXDRIVENAME;
260 	GV_SET64BE(m_hdr->label.date_of_birth.tv_sec);
261 	GV_SET64BE(m_hdr->label.date_of_birth.tv_usec);
262 	GV_SET64BE(m_hdr->label.last_update.tv_sec);
263 	GV_SET64BE(m_hdr->label.last_update.tv_usec);
264 	GV_SET64BE(m_hdr->label.drive_size);
265 
266 	ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN);
267 	return (ret);
268 }
269 
270 void
271 gv_config_new_drive(struct gv_drive *d)
272 {
273 	struct gv_hdr *vhdr;
274 	struct gv_freelist *fl;
275 
276 	KASSERT(d != NULL, ("config_new_drive: NULL d"));
277 
278 	vhdr = g_malloc(sizeof(*vhdr), M_WAITOK | M_ZERO);
279 	vhdr->magic = GV_MAGIC;
280 	vhdr->config_length = GV_CFG_LEN;
281 
282 	mtx_lock(&hostname_mtx);
283 	bcopy(G_hostname, vhdr->label.sysname, GV_HOSTNAME_LEN);
284 	mtx_unlock(&hostname_mtx);
285 	strncpy(vhdr->label.name, d->name, GV_MAXDRIVENAME);
286 	microtime(&vhdr->label.date_of_birth);
287 
288 	d->hdr = vhdr;
289 
290 	LIST_INIT(&d->subdisks);
291 	LIST_INIT(&d->freelist);
292 
293 	fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO);
294 	fl->offset = GV_DATA_START;
295 	fl->size = d->avail;
296 	LIST_INSERT_HEAD(&d->freelist, fl, freelist);
297 	d->freelist_entries = 1;
298 
299 	d->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
300 	bioq_init(d->bqueue);
301 	mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF);
302 	kproc_create(gv_drive_worker, d, NULL, 0, 0, "gv_d %s", d->name);
303 	d->flags |= GV_DRIVE_THREAD_ACTIVE;
304 }
305 
306 void
307 gv_save_config_all(struct gv_softc *sc)
308 {
309 	struct gv_drive *d;
310 
311 	g_topology_assert();
312 
313 	LIST_FOREACH(d, &sc->drives, drive) {
314 		if (d->geom == NULL)
315 			continue;
316 		gv_save_config(NULL, d, sc);
317 	}
318 }
319 
320 /* Save the vinum configuration back to disk. */
321 void
322 gv_save_config(struct g_consumer *cp, struct gv_drive *d, struct gv_softc *sc)
323 {
324 	struct g_geom *gp;
325 	struct g_consumer *cp2;
326 	struct gv_hdr *vhdr, *hdr;
327 	struct sbuf *sb;
328 	int error;
329 
330 	g_topology_assert();
331 
332 	KASSERT(d != NULL, ("gv_save_config: null d"));
333 	KASSERT(sc != NULL, ("gv_save_config: null sc"));
334 
335 	/*
336 	 * We can't save the config on a drive that isn't up, but drives that
337 	 * were just created aren't officially up yet, so we check a special
338 	 * flag.
339 	 */
340 	if ((d->state != GV_DRIVE_UP) && !(d->flags && GV_DRIVE_NEWBORN))
341 		return;
342 
343 	if (cp == NULL) {
344 		gp = d->geom;
345 		KASSERT(gp != NULL, ("gv_save_config: null gp"));
346 		cp2 = LIST_FIRST(&gp->consumer);
347 		KASSERT(cp2 != NULL, ("gv_save_config: null cp2"));
348 	} else
349 		cp2 = cp;
350 
351 	vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO);
352 	vhdr->magic = GV_MAGIC;
353 	vhdr->config_length = GV_CFG_LEN;
354 
355 	hdr = d->hdr;
356 	if (hdr == NULL) {
357 		G_VINUM_DEBUG(0, "drive %s has NULL hdr", d->name);
358 		g_free(vhdr);
359 		return;
360 	}
361 	microtime(&hdr->label.last_update);
362 	bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label));
363 
364 	sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN);
365 	gv_format_config(sc, sb, 1, NULL);
366 	sbuf_finish(sb);
367 
368 	error = g_access(cp2, 0, 1, 0);
369 	if (error) {
370 		G_VINUM_DEBUG(0, "g_access failed on drive %s, errno %d",
371 		    d->name, error);
372 		sbuf_delete(sb);
373 		g_free(vhdr);
374 		return;
375 	}
376 	g_topology_unlock();
377 
378 	do {
379 		error = gv_write_header(cp2, vhdr);
380 		if (error) {
381 			G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, "
382 			    "errno %d", d->name, error);
383 			break;
384 		}
385 
386 		error = g_write_data(cp2, GV_CFG_OFFSET, sbuf_data(sb),
387 		    GV_CFG_LEN);
388 		if (error) {
389 			G_VINUM_DEBUG(0, "writing first config copy failed "
390 			    "on drive %s, errno %d", d->name, error);
391 			break;
392 		}
393 
394 		error = g_write_data(cp2, GV_CFG_OFFSET + GV_CFG_LEN,
395 		    sbuf_data(sb), GV_CFG_LEN);
396 		if (error)
397 			G_VINUM_DEBUG(0, "writing second config copy failed "
398 			    "on drive %s, errno %d", d->name, error);
399 	} while (0);
400 
401 	g_topology_lock();
402 	g_access(cp2, 0, -1, 0);
403 	sbuf_delete(sb);
404 	g_free(vhdr);
405 
406 	if (d->geom != NULL)
407 		gv_drive_modify(d);
408 }
409 
410 /* This resembles g_slice_access(). */
411 static int
412 gv_drive_access(struct g_provider *pp, int dr, int dw, int de)
413 {
414 	struct g_geom *gp;
415 	struct g_consumer *cp;
416 	struct g_provider *pp2;
417 	struct gv_drive *d;
418 	struct gv_sd *s, *s2;
419 	int error;
420 
421 	gp = pp->geom;
422 	cp = LIST_FIRST(&gp->consumer);
423 	if (cp == NULL)
424 		return (0);
425 
426 	d = gp->softc;
427 	if (d == NULL)
428 		return (0);
429 
430 	s = pp->private;
431 	KASSERT(s != NULL, ("gv_drive_access: NULL s"));
432 
433 	LIST_FOREACH(s2, &d->subdisks, from_drive) {
434 		if (s == s2)
435 			continue;
436 		if (s->drive_offset + s->size <= s2->drive_offset)
437 			continue;
438 		if (s2->drive_offset + s2->size <= s->drive_offset)
439 			continue;
440 
441 		/* Overlap. */
442 		pp2 = s2->provider;
443 		KASSERT(s2 != NULL, ("gv_drive_access: NULL s2"));
444 		if ((pp->acw + dw) > 0 && pp2->ace > 0)
445 			return (EPERM);
446 		if ((pp->ace + de) > 0 && pp2->acw > 0)
447 			return (EPERM);
448 	}
449 
450 	error = g_access(cp, dr, dw, de);
451 	return (error);
452 }
453 
454 static void
455 gv_drive_done(struct bio *bp)
456 {
457 	struct gv_drive *d;
458 
459 	/* Put the BIO on the worker queue again. */
460 	d = bp->bio_from->geom->softc;
461 	bp->bio_cflags |= GV_BIO_DONE;
462 	mtx_lock(&d->bqueue_mtx);
463 	bioq_insert_tail(d->bqueue, bp);
464 	wakeup(d);
465 	mtx_unlock(&d->bqueue_mtx);
466 }
467 
468 
469 static void
470 gv_drive_start(struct bio *bp)
471 {
472 	struct gv_drive *d;
473 	struct gv_sd *s;
474 
475 	switch (bp->bio_cmd) {
476 	case BIO_READ:
477 	case BIO_WRITE:
478 	case BIO_DELETE:
479 		break;
480 	case BIO_GETATTR:
481 	default:
482 		g_io_deliver(bp, EOPNOTSUPP);
483 		return;
484 	}
485 
486 	s = bp->bio_to->private;
487 	if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) {
488 		g_io_deliver(bp, ENXIO);
489 		return;
490 	}
491 
492 	d = bp->bio_to->geom->softc;
493 
494 	/*
495 	 * Put the BIO on the worker queue, where the worker thread will pick
496 	 * it up.
497 	 */
498 	mtx_lock(&d->bqueue_mtx);
499 	bioq_disksort(d->bqueue, bp);
500 	wakeup(d);
501 	mtx_unlock(&d->bqueue_mtx);
502 
503 }
504 
505 static void
506 gv_drive_worker(void *arg)
507 {
508 	struct bio *bp, *cbp;
509 	struct g_geom *gp;
510 	struct g_provider *pp;
511 	struct gv_drive *d;
512 	struct gv_sd *s;
513 	int error;
514 
515 	d = arg;
516 
517 	mtx_lock(&d->bqueue_mtx);
518 	for (;;) {
519 		/* We were signaled to exit. */
520 		if (d->flags & GV_DRIVE_THREAD_DIE)
521 			break;
522 
523 		/* Take the first BIO from out queue. */
524 		bp = bioq_takefirst(d->bqueue);
525 		if (bp == NULL) {
526 			msleep(d, &d->bqueue_mtx, PRIBIO, "-", hz/10);
527 			continue;
528  		}
529 		mtx_unlock(&d->bqueue_mtx);
530 
531 		pp = bp->bio_to;
532 		gp = pp->geom;
533 
534 		/* Completed request. */
535 		if (bp->bio_cflags & GV_BIO_DONE) {
536 			error = bp->bio_error;
537 
538 			/* Deliver the original request. */
539 			g_std_done(bp);
540 
541 			/* The request had an error, we need to clean up. */
542 			if (error != 0) {
543 				g_topology_lock();
544 				gv_set_drive_state(d, GV_DRIVE_DOWN,
545 				    GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG);
546 				g_topology_unlock();
547 				g_post_event(gv_drive_dead, d, M_WAITOK, d,
548 				    NULL);
549 			}
550 
551 		/* New request, needs to be sent downwards. */
552 		} else {
553 			s = pp->private;
554 
555 			if ((s->state == GV_SD_DOWN) ||
556 			    (s->state == GV_SD_STALE)) {
557 				g_io_deliver(bp, ENXIO);
558 				mtx_lock(&d->bqueue_mtx);
559 				continue;
560 			}
561 			if (bp->bio_offset > s->size) {
562 				g_io_deliver(bp, EINVAL);
563 				mtx_lock(&d->bqueue_mtx);
564 				continue;
565 			}
566 
567 			cbp = g_clone_bio(bp);
568 			if (cbp == NULL) {
569 				g_io_deliver(bp, ENOMEM);
570 				mtx_lock(&d->bqueue_mtx);
571 				continue;
572 			}
573 			if (cbp->bio_offset + cbp->bio_length > s->size)
574 				cbp->bio_length = s->size -
575 				    cbp->bio_offset;
576 			cbp->bio_done = gv_drive_done;
577 			cbp->bio_offset += s->drive_offset;
578 			g_io_request(cbp, LIST_FIRST(&gp->consumer));
579 		}
580 
581 		mtx_lock(&d->bqueue_mtx);
582 	}
583 
584 	while ((bp = bioq_takefirst(d->bqueue)) != NULL) {
585 		mtx_unlock(&d->bqueue_mtx);
586 		if (bp->bio_cflags & GV_BIO_DONE)
587 			g_std_done(bp);
588 		else
589 			g_io_deliver(bp, ENXIO);
590 		mtx_lock(&d->bqueue_mtx);
591 	}
592 	mtx_unlock(&d->bqueue_mtx);
593 	d->flags |= GV_DRIVE_THREAD_DEAD;
594 
595 	kproc_exit(ENXIO);
596 }
597 
598 
599 static void
600 gv_drive_orphan(struct g_consumer *cp)
601 {
602 	struct g_geom *gp;
603 	struct gv_drive *d;
604 
605 	g_topology_assert();
606 	gp = cp->geom;
607 	g_trace(G_T_TOPOLOGY, "gv_drive_orphan(%s)", gp->name);
608 	d = gp->softc;
609 	if (d != NULL) {
610 		gv_set_drive_state(d, GV_DRIVE_DOWN,
611 		    GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG);
612 		g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL);
613 	} else
614 		g_wither_geom(gp, ENXIO);
615 }
616 
617 static struct g_geom *
618 gv_drive_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
619 {
620 	struct g_geom *gp, *gp2;
621 	struct g_consumer *cp;
622 	struct gv_drive *d;
623 	struct gv_sd *s;
624 	struct gv_softc *sc;
625 	struct gv_freelist *fl;
626 	struct gv_hdr *vhdr;
627 	int error;
628 	char *buf, errstr[ERRBUFSIZ];
629 
630 	vhdr = NULL;
631 	d = NULL;
632 
633 	g_trace(G_T_TOPOLOGY, "gv_drive_taste(%s, %s)", mp->name, pp->name);
634 	g_topology_assert();
635 
636 	/* Find the VINUM class and its associated geom. */
637 	gp2 = find_vinum_geom();
638 	if (gp2 == NULL)
639 		return (NULL);
640 	sc = gp2->softc;
641 
642 	gp = g_new_geomf(mp, "%s.vinumdrive", pp->name);
643 	gp->start = gv_drive_start;
644 	gp->orphan = gv_drive_orphan;
645 	gp->access = gv_drive_access;
646 	gp->start = gv_drive_start;
647 
648 	cp = g_new_consumer(gp);
649 	g_attach(cp, pp);
650 	error = g_access(cp, 1, 0, 0);
651 	if (error) {
652 		g_detach(cp);
653 		g_destroy_consumer(cp);
654 		g_destroy_geom(gp);
655 		return (NULL);
656 	}
657 
658 	g_topology_unlock();
659 
660 	/* Now check if the provided slice is a valid vinum drive. */
661 	do {
662 		vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO);
663 		error = gv_read_header(cp, vhdr);
664 		if (error) {
665 			g_free(vhdr);
666 			break;
667 		}
668 
669 		/* A valid vinum drive, let's parse the on-disk information. */
670 		buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL);
671 		if (buf == NULL) {
672 			g_free(vhdr);
673 			break;
674 		}
675 		g_topology_lock();
676 		gv_parse_config(sc, buf, 1);
677 		g_free(buf);
678 
679 		/*
680 		 * Let's see if this drive is already known in the
681 		 * configuration.
682 		 */
683 		d = gv_find_drive(sc, vhdr->label.name);
684 
685 		/* We already know about this drive. */
686 		if (d != NULL) {
687 			/* Check if this drive already has a geom. */
688 			if (d->geom != NULL) {
689 				g_topology_unlock();
690 				g_free(vhdr);
691 				break;
692 			}
693 			bcopy(vhdr, d->hdr, sizeof(*vhdr));
694 			g_free(vhdr);
695 
696 		/* This is a new drive. */
697 		} else {
698 			d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO);
699 
700 			/* Initialize all needed variables. */
701 			d->size = pp->mediasize - GV_DATA_START;
702 			d->avail = d->size;
703 			d->hdr = vhdr;
704 			strncpy(d->name, vhdr->label.name, GV_MAXDRIVENAME);
705 			LIST_INIT(&d->subdisks);
706 			LIST_INIT(&d->freelist);
707 
708 			/* We also need a freelist entry. */
709 			fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO);
710 			fl->offset = GV_DATA_START;
711 			fl->size = d->avail;
712 			LIST_INSERT_HEAD(&d->freelist, fl, freelist);
713 			d->freelist_entries = 1;
714 
715 			/* Save it into the main configuration. */
716 			LIST_INSERT_HEAD(&sc->drives, d, drive);
717 		}
718 
719 		/*
720 		 * Create bio queue, queue mutex and a worker thread, if
721 		 * necessary.
722 		 */
723 		if (d->bqueue == NULL) {
724 			d->bqueue = g_malloc(sizeof(struct bio_queue_head),
725 			    M_WAITOK | M_ZERO);
726 			bioq_init(d->bqueue);
727 		}
728 		if (mtx_initialized(&d->bqueue_mtx) == 0)
729 			mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF);
730 
731 		if (!(d->flags & GV_DRIVE_THREAD_ACTIVE)) {
732 			kproc_create(gv_drive_worker, d, NULL, 0, 0,
733 			    "gv_d %s", d->name);
734 			d->flags |= GV_DRIVE_THREAD_ACTIVE;
735 		}
736 
737 		g_access(cp, -1, 0, 0);
738 
739 		gp->softc = d;
740 		d->geom = gp;
741 		d->vinumconf = sc;
742 		strncpy(d->device, pp->name, GV_MAXDRIVENAME);
743 
744 		/*
745 		 * Find out which subdisks belong to this drive and crosslink
746 		 * them.
747 		 */
748 		LIST_FOREACH(s, &sc->subdisks, sd) {
749 			if (!strncmp(s->drive, d->name, GV_MAXDRIVENAME))
750 				/* XXX: errors ignored */
751 				gv_sd_to_drive(sc, d, s, errstr,
752 				    sizeof(errstr));
753 		}
754 
755 		/* This drive is now up for sure. */
756 		gv_set_drive_state(d, GV_DRIVE_UP, 0);
757 
758 		/*
759 		 * If there are subdisks on this drive, we need to create
760 		 * providers for them.
761 		 */
762 		if (d->sdcount)
763 			gv_drive_modify(d);
764 
765 		return (gp);
766 
767 	} while (0);
768 
769 	g_topology_lock();
770 	g_access(cp, -1, 0, 0);
771 
772 	g_detach(cp);
773 	g_destroy_consumer(cp);
774 	g_destroy_geom(gp);
775 	return (NULL);
776 }
777 
778 /*
779  * Modify the providers for the given drive 'd'.  It is assumed that the
780  * subdisk list of 'd' is already correctly set up.
781  */
782 void
783 gv_drive_modify(struct gv_drive *d)
784 {
785 	struct g_geom *gp;
786 	struct g_consumer *cp;
787 	struct g_provider *pp, *pp2;
788 	struct gv_sd *s;
789 
790 	KASSERT(d != NULL, ("gv_drive_modify: null d"));
791 	gp = d->geom;
792 	KASSERT(gp != NULL, ("gv_drive_modify: null gp"));
793 	cp = LIST_FIRST(&gp->consumer);
794 	KASSERT(cp != NULL, ("gv_drive_modify: null cp"));
795 	pp = cp->provider;
796 	KASSERT(pp != NULL, ("gv_drive_modify: null pp"));
797 
798 	g_topology_assert();
799 
800 	LIST_FOREACH(s, &d->subdisks, from_drive) {
801 		/* This subdisk already has a provider. */
802 		if (s->provider != NULL)
803 			continue;
804 		pp2 = g_new_providerf(gp, "gvinum/sd/%s", s->name);
805 		pp2->mediasize = s->size;
806 		pp2->sectorsize = pp->sectorsize;
807 		g_error_provider(pp2, 0);
808 		s->provider = pp2;
809 		pp2->private = s;
810 	}
811 }
812 
813 static void
814 gv_drive_dead(void *arg, int flag)
815 {
816 	struct g_geom *gp;
817 	struct g_consumer *cp;
818 	struct gv_drive *d;
819 	struct gv_sd *s;
820 
821 	g_topology_assert();
822 	KASSERT(arg != NULL, ("gv_drive_dead: NULL arg"));
823 
824 	if (flag == EV_CANCEL)
825 		return;
826 
827 	d = arg;
828 	if (d->state != GV_DRIVE_DOWN)
829 		return;
830 
831 	g_trace(G_T_TOPOLOGY, "gv_drive_dead(%s)", d->name);
832 
833 	gp = d->geom;
834 	if (gp == NULL)
835 		return;
836 
837 	LIST_FOREACH(cp, &gp->consumer, consumer) {
838 		if (cp->nstart != cp->nend) {
839 			G_VINUM_DEBUG(0, "dead drive '%s' still has "
840 			    "active requests, cannot detach consumer",
841 			    d->name);
842 			g_post_event(gv_drive_dead, d, M_WAITOK, d,
843 			    NULL);
844 			return;
845 		}
846 		if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
847 			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
848 	}
849 
850 	G_VINUM_DEBUG(1, "lost drive '%s'", d->name);
851 	d->geom = NULL;
852 	LIST_FOREACH(s, &d->subdisks, from_drive) {
853 		s->provider = NULL;
854 		s->consumer = NULL;
855 	}
856 	gv_kill_drive_thread(d);
857 	gp->softc = NULL;
858 	g_wither_geom(gp, ENXIO);
859 }
860 
861 static int
862 gv_drive_destroy_geom(struct gctl_req *req, struct g_class *mp,
863     struct g_geom *gp)
864 {
865 	struct gv_drive *d;
866 
867 	g_trace(G_T_TOPOLOGY, "gv_drive_destroy_geom: %s", gp->name);
868 	g_topology_assert();
869 
870 	d = gp->softc;
871 	gv_kill_drive_thread(d);
872 
873 	g_wither_geom(gp, ENXIO);
874 	return (0);
875 }
876 
877 #define	VINUMDRIVE_CLASS_NAME "VINUMDRIVE"
878 
879 static struct g_class g_vinum_drive_class = {
880 	.name = VINUMDRIVE_CLASS_NAME,
881 	.version = G_VERSION,
882 	.taste = gv_drive_taste,
883 	.destroy_geom = gv_drive_destroy_geom
884 };
885 
886 DECLARE_GEOM_CLASS(g_vinum_drive_class, g_vinum_drive);
887