xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision 4a65f651)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
35  */
36 /*
37  * Copyright (c) 1995 Jason R. Thorpe.
38  * All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed for the NetBSD Project
51  *	by Jason R. Thorpe.
52  * 4. The name of the author may not be used to endorse or promote products
53  *    derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  * Copyright (c) 1990, 1993
71  *	The Regents of the University of California.  All rights reserved.
72  *
73  * This code is derived from software contributed to Berkeley by
74  * the Systems Programming Group of the University of Utah Computer
75  * Science Department.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  * 3. All advertising materials mentioning features or use of this software
86  *    must display the following acknowledgement:
87  *	This product includes software developed by the University of
88  *	California, Berkeley and its contributors.
89  * 4. Neither the name of the University nor the names of its contributors
90  *    may be used to endorse or promote products derived from this software
91  *    without specific prior written permission.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
103  * SUCH DAMAGE.
104  *
105  * from: Utah $Hdr: cd.c 1.6 90/11/28$
106  */
107 /*
108  * @(#)cd.c	8.2 (Berkeley) 11/16/93
109  * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
112  */
113 
114 /*
115  * "Concatenated" disk driver.
116  *
117  * Original dynamic configuration support by:
118  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
119  *	Numerical Aerodynamic Simulation Facility
120  *	Mail Stop 258-6
121  *	NASA Ames Research Center
122  *	Moffett Field, CA 94035
123  */
124 
125 #include "use_ccd.h"
126 
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
132 #include <sys/buf.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/buf2.h>
145 #include <sys/ccdvar.h>
146 
147 #include <vm/vm_zone.h>
148 
149 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
150 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
151 
152 #include <sys/thread2.h>
153 
/*
 * Building with CCDDEBUG (and without DEBUG already set) turns DEBUG on
 * for the declarations that follow.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

/*
 * Trace categories, combined as a bitmask in ccddebug.  Every category is
 * enabled by default and the mask is exported read/write as a sysctl
 * under the _debug node.
 *
 * NOTE: DEBUG is undefined again at the end of this section, so unless it
 * is redefined further down, the "#ifdef DEBUG" trace code in the rest of
 * this file is not compiled even though ccddebug itself is.
 */
#ifdef DEBUG
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#undef DEBUG
#endif

/* Unit and partition of a ccd device, delegated to dkunit()/dkpart(). */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
172 
173 /*
174    This is how mirroring works (only writes are special):
175 
176    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
177    linked together by the cb_mirror field.  "cb_pflags &
178    CCDPF_MIRROR_DONE" is set to 0 on both of them.
179 
180    When a component returns to ccdiodone(), it checks if "cb_pflags &
181    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
182    flag and returns.  If it is, it means its partner has already
183    returned, so it will go to the regular cleanup.
184 
185  */
186 
/*
 * Per-component I/O request.  The embedded struct buf is the one issued
 * to a component; the remaining fields tie it back to the originating
 * bio and, for mirrored requests, to its counterpart.  Instances are
 * handed out by getccdbuf() and returned through putccdbuf(), which
 * caches a limited number of them on a free list via cb_freenext.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
200 
/* Device entry points, wired into ccd_ops below. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;

/* Upper bound on the number of ccdbufs putccdbuf() keeps on the free list. */
#define NCCDFREEHIWAT	16

/* Major number used in the ccd_ops header. */
#define CDEV_MAJOR 74

/*
 * Device operations vector passed to disk_create() for every unit.
 * Reads and writes use physread/physwrite; the remaining operations are
 * implemented by this driver.
 */
static struct dev_ops ccd_ops = {
	{ "ccd", CDEV_MAJOR, D_DISK },
	.d_open =	ccdopen,
	.d_close =	ccdclose,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_ioctl =	ccdioctl,
	.d_strategy =	ccdstrategy,
	.d_dump =	ccddump
};
221 
/* called from the module event handler on load and unload */
static	void ccdattach (void);
static	int ccddetach (void);
static	int ccd_modevent (module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone (struct bio *bio);

/* internal helpers for configuration and for splitting I/O by component */
static	void ccdstart (struct ccd_softc *, struct bio *);
static	void ccdinterleave (struct ccd_softc *, int);
static	void ccdintr (struct ccd_softc *, struct bio *);
static	int ccdinit (struct ccddevice *, char **, struct ucred *);
static	int ccdlookup (char *, struct vnode **);
static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, off_t, caddr_t, long);
static	int ccdlock (struct ccd_softc *);
static	void ccdunlock (struct ccd_softc *);

#ifdef DEBUG
static	void printiinfo (struct ccdiinfo *);
#endif

/*
 * Driver-wide state.  ccd_softc and ccddevs are arrays of numccd entries
 * allocated by ccdattach(); ccdfreebufs heads the list of cached ccdbufs
 * and numccdfreebufs counts the entries on it.
 */
/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;
struct	ccddevice *ccddevs;
struct	ccdbuf *ccdfreebufs;
static	int numccdfreebufs;
static	int numccd = 0;
250 
251 /*
252  * getccdbuf() -	Allocate and zero a ccd buffer.
253  *
254  *	This routine is called at splbio().
255  */
256 
257 static __inline
258 struct ccdbuf *
259 getccdbuf(void)
260 {
261 	struct ccdbuf *cbp;
262 
263 	/*
264 	 * Allocate from freelist or malloc as necessary
265 	 */
266 	if ((cbp = ccdfreebufs) != NULL) {
267 		ccdfreebufs = cbp->cb_freenext;
268 		--numccdfreebufs;
269 		reinitbufbio(&cbp->cb_buf);
270 	} else {
271 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
272 		initbufbio(&cbp->cb_buf);
273 	}
274 
275 	/*
276 	 * independant struct buf initialization
277 	 */
278 	buf_dep_init(&cbp->cb_buf);
279 	BUF_LOCKINIT(&cbp->cb_buf);
280 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
281 	BUF_KERNPROC(&cbp->cb_buf);
282 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
283 
284 	return(cbp);
285 }
286 
287 /*
288  * putccdbuf() -	Free a ccd buffer.
289  *
290  *	This routine is called at splbio().
291  */
292 
293 static __inline
294 void
295 putccdbuf(struct ccdbuf *cbp)
296 {
297 	BUF_UNLOCK(&cbp->cb_buf);
298 	BUF_LOCKFREE(&cbp->cb_buf);
299 
300 	if (numccdfreebufs < NCCDFREEHIWAT) {
301 		cbp->cb_freenext = ccdfreebufs;
302 		ccdfreebufs = cbp;
303 		++numccdfreebufs;
304 	} else {
305 		kfree((caddr_t)cbp, M_DEVBUF);
306 	}
307 }
308 
309 /*
310  * Called by main() during pseudo-device attachment.  All we need
311  * to do is allocate enough space for devices to be configured later, and
312  * add devsw entries.
313  */
314 static void
315 ccdattach(void)
316 {
317 	struct disk_info info;
318 	struct ccd_softc *cs;
319 	int i;
320 	int num = NCCD;
321 
322 	if (num > 1)
323 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
324 	else
325 		kprintf("ccd0: Concatenated disk driver\n");
326 
327 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
328 			    M_WAITOK | M_ZERO);
329 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
330 			    M_WAITOK | M_ZERO);
331 	numccd = num;
332 
333 	/*
334 	 * With normal disk devices the open simply fails if the media
335 	 * is not present.  With CCD we have to be able to open the
336 	 * raw disk to use the ioctl's to set it up, so create a dummy
337 	 * disk info structure so dscheck() doesn't blow up.
338 	 */
339 	bzero(&info, sizeof(info));
340 	info.d_media_blksize = DEV_BSIZE;
341 
342 	for (i = 0; i < numccd; ++i) {
343 		cs = &ccd_softc[i];
344 		cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
345 		cs->sc_dev->si_drv1 = cs;
346 		cs->sc_dev->si_iosize_max = 256 * 512;	/* XXX */
347 		disk_setdiskinfo(&cs->sc_disk, &info);
348 	}
349 }
350 
351 static int
352 ccddetach(void)
353 {
354 	struct ccd_softc *cs;
355 	struct dev_ioctl_args ioctl_args;
356 	int i;
357 	int error = 0;
358 	int eval;
359 
360 	bzero(&ioctl_args, sizeof(ioctl_args));
361 
362 	for (i = 0; i < numccd; ++i) {
363 		cs = &ccd_softc[i];
364 		if (cs->sc_dev == NULL)
365 			continue;
366 		ioctl_args.a_head.a_dev = cs->sc_dev;
367 		ioctl_args.a_cmd = CCDIOCCLR;
368 		ioctl_args.a_fflag = FWRITE;
369 		eval = ccdioctl(&ioctl_args);
370 		if (eval && eval != ENXIO) {
371 			kprintf("ccd%d: In use, cannot detach\n", i);
372 			error = EBUSY;
373 		}
374 	}
375 	if (error == 0) {
376 		for (i = 0; i < numccd; ++i) {
377 			cs = &ccd_softc[i];
378 			if (cs->sc_dev == NULL)
379 				continue;
380 			disk_destroy(&cs->sc_disk);
381 			cs->sc_dev = NULL;
382 		}
383 		if (ccd_softc)
384 			kfree(ccd_softc, M_DEVBUF);
385 		if (ccddevs)
386 			kfree(ccddevs, M_DEVBUF);
387 	}
388 	return (error);
389 }
390 
391 static int
392 ccd_modevent(module_t mod, int type, void *data)
393 {
394 	int error = 0;
395 
396 	switch (type) {
397 	case MOD_LOAD:
398 		ccdattach();
399 		break;
400 
401 	case MOD_UNLOAD:
402 		error = ccddetach();
403 		break;
404 
405 	default:	/* MOD_SHUTDOWN etc */
406 		break;
407 	}
408 	return (error);
409 }
410 
411 DEV_MODULE(ccd, ccd_modevent, NULL);
412 
413 static int
414 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
415 {
416 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
417 	struct ccdcinfo *ci = NULL;	/* XXX */
418 	int ix;
419 	struct vnode *vp;
420 	u_int64_t skip;
421 	u_int64_t size;
422 	u_int64_t minsize;
423 	int maxsecsize;
424 	struct partinfo dpart;
425 	struct ccdgeom *ccg = &cs->sc_geom;
426 	char tmppath[MAXPATHLEN];
427 	int error = 0;
428 
429 #ifdef DEBUG
430 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
432 #endif
433 
434 	cs->sc_size = 0;
435 	cs->sc_ileave = ccd->ccd_interleave;
436 	cs->sc_nccdisks = ccd->ccd_ndev;
437 
438 	/* Allocate space for the component info. */
439 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
440 				M_DEVBUF, M_WAITOK);
441 	cs->sc_maxiosize = MAXPHYS;
442 
443 	/*
444 	 * Verify that each component piece exists and record
445 	 * relevant information about it.
446 	 */
447 	maxsecsize = 0;
448 	minsize = 0;
449 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
450 		vp = ccd->ccd_vpp[ix];
451 		ci = &cs->sc_cinfo[ix];
452 		ci->ci_vp = vp;
453 
454 		/*
455 		 * Copy in the pathname of the component.
456 		 */
457 		bzero(tmppath, sizeof(tmppath));	/* sanity */
458 		if ((error = copyinstr(cpaths[ix], tmppath,
459 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
460 #ifdef DEBUG
461 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
462 				kprintf("ccd%d: can't copy path, error = %d\n",
463 				    ccd->ccd_unit, error);
464 #endif
465 			goto fail;
466 		}
467 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
468 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
469 
470 		ci->ci_dev = vn_todev(vp);
471 		if (ci->ci_dev->si_iosize_max &&
472 		    cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
473 			cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
474 		}
475 
476 		/*
477 		 * Get partition information for the component.
478 		 */
479 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD,
480 				  cred, NULL);
481 		if (error) {
482 #ifdef DEBUG
483 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
484 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
485 				     ccd->ccd_unit, ci->ci_path, error);
486 #endif
487 			goto fail;
488 		}
489 		if (dpart.fstype != FS_CCD &&
490 		    !kuuid_is_ccd(&dpart.fstype_uuid)) {
491 			kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
492 				ccd->ccd_unit, ci->ci_path);
493 			error = EFTYPE;
494 			goto fail;
495 		}
496 		if (maxsecsize < dpart.media_blksize)
497 			maxsecsize = dpart.media_blksize;
498 
499 		/*
500 		 * Skip a certain amount of storage at the beginning of
501 		 * the component to make sure we don't infringe on any
502 		 * reserved sectors.  This is handled entirely by
503 		 * dpart.reserved_blocks but we also impose a minimum
504 		 * of 16 sectors for backwards compatibility.
505 		 */
506 		skip = 16;
507 		if (skip < dpart.reserved_blocks)
508 			skip = dpart.reserved_blocks;
509 		size = dpart.media_blocks - skip;
510 
511 		/*
512 		 * Calculate the size, truncating to an interleave
513 		 * boundary if necessary.
514 		 */
515 		if (cs->sc_ileave > 1)
516 			size -= size % cs->sc_ileave;
517 
518 		if ((int64_t)size <= 0) {
519 #ifdef DEBUG
520 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
521 				kprintf("ccd%d: %s: size == 0\n",
522 				    ccd->ccd_unit, ci->ci_path);
523 #endif
524 			error = ENODEV;
525 			goto fail;
526 		}
527 
528 		/*
529 		 * Calculate the smallest uniform component, used
530 		 * elsewhere.
531 		 */
532 		if (minsize == 0 || minsize > size)
533 			minsize = size;
534 		ci->ci_skip = skip;
535 		ci->ci_size = size;
536 		cs->sc_size += size;
537 	}
538 	kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
539 		cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
540 
541 	/*
542 	 * Don't allow the interleave to be smaller than
543 	 * the biggest component sector.
544 	 */
545 	if ((cs->sc_ileave > 0) &&
546 	    (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
547 #ifdef DEBUG
548 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
549 			kprintf("ccd%d: interleave must be at least %d\n",
550 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
551 #endif
552 		error = EINVAL;
553 		goto fail;
554 	}
555 
556 	/*
557 	 * If uniform interleave is desired set all sizes to that of
558 	 * the smallest component.  This will guarentee that a single
559 	 * interleave table is generated.
560 	 *
561 	 * Lost space must be taken into account when calculating the
562 	 * overall size.  Half the space is lost when CCDF_MIRROR is
563 	 * specified.  One disk is lost when CCDF_PARITY is specified.
564 	 */
565 	if (ccd->ccd_flags & CCDF_UNIFORM) {
566 		for (ci = cs->sc_cinfo;
567 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
568 			ci->ci_size = minsize;
569 		}
570 		if (ccd->ccd_flags & CCDF_MIRROR) {
571 			/*
572 			 * Check to see if an even number of components
573 			 * have been specified.  The interleave must also
574 			 * be non-zero in order for us to be able to
575 			 * guarentee the topology.
576 			 */
577 			if (cs->sc_nccdisks % 2) {
578 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
579 				error = EINVAL;
580 				goto fail;
581 			}
582 			if (cs->sc_ileave == 0) {
583 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
584 				error = EINVAL;
585 				goto fail;
586 			}
587 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
588 		} else if (ccd->ccd_flags & CCDF_PARITY) {
589 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
590 		} else {
591 			if (cs->sc_ileave == 0) {
592 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
593 				error = EINVAL;
594 				goto fail;
595 			}
596 			cs->sc_size = cs->sc_nccdisks * minsize;
597 		}
598 	}
599 
600 	/*
601 	 * Construct the interleave table.
602 	 */
603 	ccdinterleave(cs, ccd->ccd_unit);
604 
605 	/*
606 	 * Create pseudo-geometry based on 1MB cylinders.  It's
607 	 * pretty close.
608 	 */
609 	ccg->ccg_secsize = maxsecsize;
610 	ccg->ccg_ntracks = 1;
611 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
612 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
613 
614 	/*
615 	 * Add an devstat entry for this device.
616 	 */
617 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
618 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
619 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
620 			  DEVSTAT_PRIORITY_ARRAY);
621 
622 	cs->sc_flags |= CCDF_INITED;
623 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
624 	cs->sc_unit = ccd->ccd_unit;
625 	return (0);
626 fail:
627 	while (ci > cs->sc_cinfo) {
628 		ci--;
629 		kfree(ci->ci_path, M_DEVBUF);
630 	}
631 	kfree(cs->sc_cinfo, M_DEVBUF);
632 	cs->sc_cinfo = NULL;
633 	return (error);
634 }
635 
636 static void
637 ccdinterleave(struct ccd_softc *cs, int unit)
638 {
639 	struct ccdcinfo *ci, *smallci;
640 	struct ccdiinfo *ii;
641 	u_int64_t bn;
642 	u_int64_t lbn;
643 	u_int64_t size;
644 	int icount;
645 	int ix;
646 
647 #ifdef DEBUG
648 	if (ccddebug & CCDB_INIT)
649 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
650 #endif
651 
652 	/*
653 	 * Allocate an interleave table.  The worst case occurs when each
654 	 * of N disks is of a different size, resulting in N interleave
655 	 * tables.
656 	 *
657 	 * Chances are this is too big, but we don't care.
658 	 */
659 	icount = cs->sc_nccdisks + 1;
660 	cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
661 				M_DEVBUF, M_WAITOK|M_ZERO);
662 
663 	/*
664 	 * Trivial case: no interleave (actually interleave of disk size).
665 	 * Each table entry represents a single component in its entirety.
666 	 *
667 	 * An interleave of 0 may not be used with a mirror or parity setup.
668 	 */
669 	if (cs->sc_ileave == 0) {
670 		bn = 0;
671 		ii = cs->sc_itable;
672 
673 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
674 			/* Allocate space for ii_index. */
675 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
676 			ii->ii_ndisk = 1;
677 			ii->ii_startblk = bn;
678 			ii->ii_startoff = 0;
679 			ii->ii_index[0] = ix;
680 			bn += cs->sc_cinfo[ix].ci_size;
681 			ii++;
682 		}
683 		ii->ii_ndisk = 0;
684 #ifdef DEBUG
685 		if (ccddebug & CCDB_INIT)
686 			printiinfo(cs->sc_itable);
687 #endif
688 		return;
689 	}
690 
691 	/*
692 	 * The following isn't fast or pretty; it doesn't have to be.
693 	 */
694 	size = 0;
695 	bn = lbn = 0;
696 	for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
697 		/*
698 		 * Allocate space for ii_index.  We might allocate more then
699 		 * we use.
700 		 */
701 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
702 					M_DEVBUF, M_WAITOK);
703 
704 		/*
705 		 * Locate the smallest of the remaining components
706 		 */
707 		smallci = NULL;
708 		ci = cs->sc_cinfo;
709 		while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
710 			if (ci->ci_size > size &&
711 			    (smallci == NULL ||
712 			     ci->ci_size < smallci->ci_size)) {
713 				smallci = ci;
714 			}
715 			++ci;
716 		}
717 
718 		/*
719 		 * Nobody left, all done
720 		 */
721 		if (smallci == NULL) {
722 			ii->ii_ndisk = 0;
723 			break;
724 		}
725 
726 		/*
727 		 * Record starting logical block using an sc_ileave blocksize.
728 		 */
729 		ii->ii_startblk = bn / cs->sc_ileave;
730 
731 		/*
732 		 * Record starting component block using an sc_ileave
733 		 * blocksize.  This value is relative to the beginning of
734 		 * a component disk.
735 		 */
736 		ii->ii_startoff = lbn;
737 
738 		/*
739 		 * Determine how many disks take part in this interleave
740 		 * and record their indices.
741 		 */
742 		ix = 0;
743 		for (ci = cs->sc_cinfo;
744 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
745 			if (ci->ci_size >= smallci->ci_size) {
746 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
747 			}
748 		}
749 		ii->ii_ndisk = ix;
750 
751 		/*
752 		 * Adjust for loop
753 		 */
754 		bn += ix * (smallci->ci_size - size);
755 		lbn = smallci->ci_size / cs->sc_ileave;
756 		size = smallci->ci_size;
757 	}
758 	if (ii == &cs->sc_itable[icount])
759 		panic("ccdinterlave software bug!  table exhausted");
760 #ifdef DEBUG
761 	if (ccddebug & CCDB_INIT)
762 		printiinfo(cs->sc_itable);
763 #endif
764 }
765 
766 /* ARGSUSED */
767 static int
768 ccdopen(struct dev_open_args *ap)
769 {
770 	cdev_t dev = ap->a_head.a_dev;
771 	int unit = ccdunit(dev);
772 	struct ccd_softc *cs;
773 	int error = 0;
774 
775 #ifdef DEBUG
776 	if (ccddebug & CCDB_FOLLOW)
777 		kprintf("ccdopen(%x, %x)\n", dev, flags);
778 #endif
779 	if (unit >= numccd)
780 		return (ENXIO);
781 	cs = &ccd_softc[unit];
782 
783 	if ((error = ccdlock(cs)) == 0) {
784 		ccdunlock(cs);
785 	}
786 	return (error);
787 }
788 
789 /* ARGSUSED */
790 static int
791 ccdclose(struct dev_close_args *ap)
792 {
793 	cdev_t dev = ap->a_head.a_dev;
794 	int unit = ccdunit(dev);
795 	struct ccd_softc *cs;
796 	int error = 0;
797 
798 #ifdef DEBUG
799 	if (ccddebug & CCDB_FOLLOW)
800 		kprintf("ccdclose(%x, %x)\n", dev, flags);
801 #endif
802 
803 	if (unit >= numccd)
804 		return (ENXIO);
805 	cs = &ccd_softc[unit];
806 	if ((error = ccdlock(cs)) == 0) {
807 		ccdunlock(cs);
808 	}
809 	return (error);
810 }
811 
812 static int
813 ccdstrategy(struct dev_strategy_args *ap)
814 {
815 	cdev_t dev = ap->a_head.a_dev;
816 	struct bio *bio = ap->a_bio;
817 	int unit = ccdunit(dev);
818 	struct bio *nbio;
819 	struct buf *bp = bio->bio_buf;
820 	struct ccd_softc *cs = &ccd_softc[unit];
821 	u_int64_t pbn;	/* in sc_secsize chunks */
822 	u_int32_t sz;	/* in sc_secsize chunks */
823 
824 #ifdef DEBUG
825 	if (ccddebug & CCDB_FOLLOW)
826 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
827 #endif
828 	if ((cs->sc_flags & CCDF_INITED) == 0) {
829 		bp->b_error = ENXIO;
830 		goto error;
831 	}
832 
833 	/* If it's a nil transfer, wake up the top half now. */
834 	if (bp->b_bcount == 0) {
835 		bp->b_resid = 0;
836 		goto done;
837 	}
838 
839 	/*
840 	 * Do bounds checking and adjust transfer.  If there's an
841 	 * error, the bounds check will flag that for us.
842 	 */
843 
844 	pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
845 	sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
846 
847 	/*
848 	 * If out of bounds return an error.  If the request goes
849 	 * past EOF, clip the request as appropriate.  If exactly
850 	 * at EOF, return success (don't clip), but with 0 bytes
851 	 * of I/O.
852 	 *
853 	 * Mark EOF B_INVAL (just like bad), indicating that the
854 	 * contents of the buffer, if any, is invalid.
855 	 */
856 	if ((int64_t)pbn < 0)
857 		goto bad;
858 	if (pbn + sz > cs->sc_size) {
859 		if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
860 			goto bad;
861 		if (pbn == cs->sc_size) {
862 			bp->b_resid = bp->b_bcount;
863 			bp->b_flags |= B_INVAL;
864 			goto done;
865 		}
866 		sz = (long)(cs->sc_size - pbn);
867 		bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
868 	}
869 	nbio = bio;
870 
871 	bp->b_resid = bp->b_bcount;
872 	nbio->bio_driver_info = dev;
873 
874 	/*
875 	 * "Start" the unit.
876 	 */
877 	crit_enter();
878 	ccdstart(cs, nbio);
879 	crit_exit();
880 	return(0);
881 
882 	/*
883 	 * note: bio, not nbio, is valid at the done label.
884 	 */
885 bad:
886 	bp->b_error = EINVAL;
887 error:
888 	bp->b_resid = bp->b_bcount;
889 	bp->b_flags |= B_ERROR | B_INVAL;
890 done:
891 	biodone(bio);
892 	return(0);
893 }
894 
895 static void
896 ccdstart(struct ccd_softc *cs, struct bio *bio)
897 {
898 	long bcount, rcount;
899 	struct ccdbuf *cbp[4];
900 	struct buf *bp = bio->bio_buf;
901 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
902 	caddr_t addr;
903 	off_t doffset;
904 
905 #ifdef DEBUG
906 	if (ccddebug & CCDB_FOLLOW)
907 		kprintf("ccdstart(%x, %x)\n", cs, bp);
908 #endif
909 
910 	/* Record the transaction start  */
911 	devstat_start_transaction(&cs->device_stats);
912 
913 	/*
914 	 * Allocate component buffers and fire off the requests
915 	 */
916 	doffset = bio->bio_offset;
917 	addr = bp->b_data;
918 
919 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
920 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
921 		rcount = cbp[0]->cb_buf.b_bcount;
922 
923 		if (cs->sc_cflags & CCDF_MIRROR) {
924 			/*
925 			 * Mirroring.  Writes go to both disks, reads are
926 			 * taken from whichever disk seems most appropriate.
927 			 *
928 			 * We attempt to localize reads to the disk whos arm
929 			 * is nearest the read request.  We ignore seeks due
930 			 * to writes when making this determination and we
931 			 * also try to avoid hogging.
932 			 */
933 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
934 				vn_strategy(cbp[0]->cb_vp,
935 					    &cbp[0]->cb_buf.b_bio1);
936 				vn_strategy(cbp[1]->cb_vp,
937 					    &cbp[1]->cb_buf.b_bio1);
938 			} else {
939 				int pick = cs->sc_pick;
940 				daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
941 				if (doffset < cs->sc_blk[pick] - range ||
942 				    doffset > cs->sc_blk[pick] + range
943 				) {
944 					cs->sc_pick = pick = 1 - pick;
945 				}
946 				cs->sc_blk[pick] = doffset + rcount;
947 				vn_strategy(cbp[pick]->cb_vp,
948 					    &cbp[pick]->cb_buf.b_bio1);
949 			}
950 		} else {
951 			/*
952 			 * Not mirroring
953 			 */
954 			vn_strategy(cbp[0]->cb_vp,
955 				     &cbp[0]->cb_buf.b_bio1);
956 		}
957 		doffset += rcount;
958 		addr += rcount;
959 	}
960 }
961 
962 /*
963  * Build a component buffer header.
964  */
965 static void
966 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
967 	  off_t doffset, caddr_t addr, long bcount)
968 {
969 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
970 	struct ccdbuf *cbp;
971 	u_int64_t bn;
972 	u_int64_t cbn;
973 	u_int64_t cboff;
974 	off_t cbc;
975 
976 #ifdef DEBUG
977 	if (ccddebug & CCDB_IO)
978 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
979 		       cs, bp, bn, addr, bcount);
980 #endif
981 	/*
982 	 * Determine which component bn falls in.
983 	 */
984 	bn = doffset / cs->sc_geom.ccg_secsize;
985 	cbn = bn;
986 	cboff = 0;
987 
988 	if (cs->sc_ileave == 0) {
989 		/*
990 		 * Serially concatenated and neither a mirror nor a parity
991 		 * config.  This is a special case.
992 		 */
993 		daddr_t sblk;
994 
995 		sblk = 0;
996 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
997 			sblk += ci->ci_size;
998 		cbn -= sblk;
999 	} else {
1000 		struct ccdiinfo *ii;
1001 		int ccdisk, off;
1002 
1003 		/*
1004 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
1005 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
1006 		 * to cbn.
1007 		 */
1008 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
1009 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
1010 
1011 		/*
1012 		 * Figure out which interleave table to use.
1013 		 */
1014 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1015 			if (ii->ii_startblk > cbn)
1016 				break;
1017 		}
1018 		ii--;
1019 
1020 		/*
1021 		 * off is the logical superblock relative to the beginning
1022 		 * of this interleave block.
1023 		 */
1024 		off = cbn - ii->ii_startblk;
1025 
1026 		/*
1027 		 * We must calculate which disk component to use (ccdisk),
1028 		 * and recalculate cbn to be the superblock relative to
1029 		 * the beginning of the component.  This is typically done by
1030 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1031 		 * must typically be divided by the number of components in
1032 		 * this interleave array to be properly convert it from a
1033 		 * CCD-relative logical superblock number to a
1034 		 * component-relative superblock number.
1035 		 */
1036 		if (ii->ii_ndisk == 1) {
1037 			/*
1038 			 * When we have just one disk, it can't be a mirror
1039 			 * or a parity config.
1040 			 */
1041 			ccdisk = ii->ii_index[0];
1042 			cbn = ii->ii_startoff + off;
1043 		} else {
1044 			if (cs->sc_cflags & CCDF_MIRROR) {
1045 				/*
1046 				 * We have forced a uniform mapping, resulting
1047 				 * in a single interleave array.  We double
1048 				 * up on the first half of the available
1049 				 * components and our mirror is in the second
1050 				 * half.  This only works with a single
1051 				 * interleave array because doubling up
1052 				 * doubles the number of sectors, so there
1053 				 * cannot be another interleave array because
1054 				 * the next interleave array's calculations
1055 				 * would be off.
1056 				 */
1057 				int ndisk2 = ii->ii_ndisk / 2;
1058 				ccdisk = ii->ii_index[off % ndisk2];
1059 				cbn = ii->ii_startoff + off / ndisk2;
1060 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1061 			} else if (cs->sc_cflags & CCDF_PARITY) {
1062 				/*
1063 				 * XXX not implemented yet
1064 				 */
1065 				int ndisk2 = ii->ii_ndisk - 1;
1066 				ccdisk = ii->ii_index[off % ndisk2];
1067 				cbn = ii->ii_startoff + off / ndisk2;
1068 				if (cbn % ii->ii_ndisk <= ccdisk)
1069 					ccdisk++;
1070 			} else {
1071 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1072 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1073 			}
1074 		}
1075 
1076 		ci = &cs->sc_cinfo[ccdisk];
1077 
1078 		/*
1079 		 * Convert cbn from a superblock to a normal block so it
1080 		 * can be used to calculate (along with cboff) the normal
1081 		 * block index into this particular disk.
1082 		 */
1083 		cbn *= cs->sc_ileave;
1084 	}
1085 
1086 	/*
1087 	 * Fill in the component buf structure.
1088 	 *
1089 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1090 	 * will be truncated on device EOF so we use b_bufsize to detect
1091 	 * the case.
1092 	 */
1093 	cbp = getccdbuf();
1094 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1095 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1096 	cbp->cb_buf.b_data = addr;
1097 	cbp->cb_vp = ci->ci_vp;
1098 	if (cs->sc_ileave == 0)
1099 		cbc = dbtob((off_t)(ci->ci_size - cbn));
1100 	else
1101 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1102 	if (cbc > cs->sc_maxiosize)
1103 		cbc = cs->sc_maxiosize;
1104 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1105  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1106 
1107 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1108 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1109 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1110 
1111 	/*
1112 	 * context for ccdiodone
1113 	 */
1114 	cbp->cb_obio = bio;
1115 	cbp->cb_unit = cs - ccd_softc;
1116 	cbp->cb_comp = ci - cs->sc_cinfo;
1117 
1118 #ifdef DEBUG
1119 	if (ccddebug & CCDB_IO)
1120 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1121 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1122 		       cbp->cb_buf.b_bio1.bio_offset,
1123 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1124 #endif
1125 	cb[0] = cbp;
1126 
1127 	/*
1128 	 * Note: both I/O's setup when reading from mirror, but only one
1129 	 * will be executed.
1130 	 */
1131 	if (cs->sc_cflags & CCDF_MIRROR) {
1132 		/* mirror, setup second I/O */
1133 		cbp = getccdbuf();
1134 
1135 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1136 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1137 		cbp->cb_buf.b_data = addr;
1138 		cbp->cb_vp = ci2->ci_vp;
1139 		if (cs->sc_ileave == 0)
1140 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1141 		else
1142 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1143 		if (cbc > cs->sc_maxiosize)
1144 			cbc = cs->sc_maxiosize;
1145 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1146 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1147 
1148 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1149 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1150 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1151 
1152 		/*
1153 		 * context for ccdiodone
1154 		 */
1155 		cbp->cb_obio = bio;
1156 		cbp->cb_unit = cs - ccd_softc;
1157 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1158 		cb[1] = cbp;
1159 		/* link together the ccdbuf's and clear "mirror done" flag */
1160 		cb[0]->cb_mirror = cb[1];
1161 		cb[1]->cb_mirror = cb[0];
1162 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1163 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1164 	}
1165 }
1166 
1167 static void
1168 ccdintr(struct ccd_softc *cs, struct bio *bio)
1169 {
1170 	struct buf *bp = bio->bio_buf;
1171 
1172 #ifdef DEBUG
1173 	if (ccddebug & CCDB_FOLLOW)
1174 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1175 #endif
1176 	/*
1177 	 * Request is done for better or worse, wakeup the top half.
1178 	 */
1179 	if (bp->b_flags & B_ERROR)
1180 		bp->b_resid = bp->b_bcount;
1181 	devstat_end_transaction_buf(&cs->device_stats, bp);
1182 	biodone(bio);
1183 }
1184 
1185 /*
1186  * Called at interrupt time.
1187  * Mark the component as done and if all components are done,
1188  * take a ccd interrupt.
1189  */
1190 static void
1191 ccdiodone(struct bio *bio)
1192 {
1193 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1194 	struct bio *obio = cbp->cb_obio;
1195 	struct buf *obp = obio->bio_buf;
1196 	int unit = cbp->cb_unit;
1197 	int count;
1198 
1199 	/*
1200 	 * Since we do not have exclusive access to underlying devices,
1201 	 * we can't keep cache translations around.
1202 	 */
1203 	clearbiocache(bio->bio_next);
1204 
1205 	crit_enter();
1206 #ifdef DEBUG
1207 	if (ccddebug & CCDB_FOLLOW)
1208 		kprintf("ccdiodone(%x)\n", cbp);
1209 	if (ccddebug & CCDB_IO) {
1210 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1211 		       obp, obp->b_bcount, obp->b_resid);
1212 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1213 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1214 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1215 		       cbp->cb_buf.b_bcount);
1216 	}
1217 #endif
1218 
1219 	/*
1220 	 * If an error occured, report it.  If this is a mirrored
1221 	 * configuration and the first of two possible reads, do not
1222 	 * set the error in the bp yet because the second read may
1223 	 * succeed.
1224 	 */
1225 	if (cbp->cb_buf.b_flags & B_ERROR) {
1226 		const char *msg = "";
1227 
1228 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1229 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1230 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1231 			/*
1232 			 * We will try our read on the other disk down
1233 			 * below, also reverse the default pick so if we
1234 			 * are doing a scan we do not keep hitting the
1235 			 * bad disk first.
1236 			 */
1237 			struct ccd_softc *cs = &ccd_softc[unit];
1238 
1239 			msg = ", trying other disk";
1240 			cs->sc_pick = 1 - cs->sc_pick;
1241 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1242 		} else {
1243 			obp->b_flags |= B_ERROR;
1244 			obp->b_error = cbp->cb_buf.b_error ?
1245 			    cbp->cb_buf.b_error : EIO;
1246 		}
1247 		kprintf("ccd%d: error %d on component %d "
1248 			"offset %jd (ccd offset %jd)%s\n",
1249 		        unit, obp->b_error, cbp->cb_comp,
1250 		        (intmax_t)cbp->cb_buf.b_bio2.bio_offset,
1251 		        (intmax_t)obio->bio_offset,
1252 		        msg);
1253 	}
1254 
1255 	/*
1256 	 * Process mirror.  If we are writing, I/O has been initiated on both
1257 	 * buffers and we fall through only after both are finished.
1258 	 *
1259 	 * If we are reading only one I/O is initiated at a time.  If an
1260 	 * error occurs we initiate the second I/O and return, otherwise
1261 	 * we free the second I/O without initiating it.
1262 	 */
1263 
1264 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1265 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1266 			/*
1267 			 * When writing, handshake with the second buffer
1268 			 * to determine when both are done.  If both are not
1269 			 * done, return here.
1270 			 */
1271 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1272 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1273 				putccdbuf(cbp);
1274 				crit_exit();
1275 				return;
1276 			}
1277 		} else {
1278 			/*
1279 			 * When reading, either dispose of the second buffer
1280 			 * or initiate I/O on the second buffer if an error
1281 			 * occured with this one.
1282 			 */
1283 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1284 				if (cbp->cb_buf.b_flags & B_ERROR) {
1285 					cbp->cb_mirror->cb_pflags |=
1286 					    CCDPF_MIRROR_DONE;
1287 					vn_strategy(
1288 					    cbp->cb_mirror->cb_vp,
1289 					    &cbp->cb_mirror->cb_buf.b_bio1
1290 					);
1291 					putccdbuf(cbp);
1292 					crit_exit();
1293 					return;
1294 				} else {
1295 					putccdbuf(cbp->cb_mirror);
1296 					/* fall through */
1297 				}
1298 			}
1299 		}
1300 	}
1301 
1302 	/*
1303 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1304 	 */
1305 	count = cbp->cb_buf.b_bufsize;
1306 	putccdbuf(cbp);
1307 
1308 	/*
1309 	 * If all done, "interrupt".
1310 	 */
1311 	obp->b_resid -= count;
1312 	if (obp->b_resid < 0)
1313 		panic("ccdiodone: count");
1314 	if (obp->b_resid == 0)
1315 		ccdintr(&ccd_softc[unit], obio);
1316 	crit_exit();
1317 }
1318 
1319 static int
1320 ccdioctl(struct dev_ioctl_args *ap)
1321 {
1322 	cdev_t dev = ap->a_head.a_dev;
1323 	int unit = ccdunit(dev);
1324 	int i, j, lookedup = 0, error = 0;
1325 	struct ccd_softc *cs;
1326 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1327 	struct ccddevice ccd;
1328 	struct disk_info info;
1329 	char **cpp;
1330 	struct vnode **vpp;
1331 
1332 	if (unit >= numccd)
1333 		return (ENXIO);
1334 	cs = &ccd_softc[unit];
1335 
1336 	bzero(&ccd, sizeof(ccd));
1337 
1338 	switch (ap->a_cmd) {
1339 	case CCDIOCSET:
1340 		if (cs->sc_flags & CCDF_INITED)
1341 			return (EBUSY);
1342 
1343 		if ((ap->a_fflag & FWRITE) == 0)
1344 			return (EBADF);
1345 
1346 		if ((error = ccdlock(cs)) != 0)
1347 			return (error);
1348 
1349 		if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1350 			ccdunlock(cs);
1351 			return (EINVAL);
1352 		}
1353 
1354 		/* Fill in some important bits. */
1355 		ccd.ccd_unit = unit;
1356 		ccd.ccd_interleave = ccio->ccio_ileave;
1357 		if (ccd.ccd_interleave == 0 &&
1358 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1359 		     (ccio->ccio_flags & CCDF_PARITY))) {
1360 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1361 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1362 		}
1363 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1364 		    (ccio->ccio_flags & CCDF_PARITY)) {
1365 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1366 			ccio->ccio_flags &= ~CCDF_PARITY;
1367 		}
1368 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1369 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1370 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1371 			       unit);
1372 			ccio->ccio_flags |= CCDF_UNIFORM;
1373 		}
1374 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1375 
1376 		/*
1377 		 * Allocate space for and copy in the array of
1378 		 * componet pathnames and device numbers.
1379 		 */
1380 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1381 		    M_DEVBUF, M_WAITOK);
1382 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1383 		    M_DEVBUF, M_WAITOK);
1384 
1385 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1386 				ccio->ccio_ndisks * sizeof(char **));
1387 		if (error) {
1388 			kfree(vpp, M_DEVBUF);
1389 			kfree(cpp, M_DEVBUF);
1390 			ccdunlock(cs);
1391 			return (error);
1392 		}
1393 
1394 #ifdef DEBUG
1395 		if (ccddebug & CCDB_INIT) {
1396 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1397 				kprintf("ccdioctl: component %d: 0x%x\n",
1398 				    i, cpp[i]);
1399 		}
1400 #endif
1401 
1402 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1403 #ifdef DEBUG
1404 			if (ccddebug & CCDB_INIT)
1405 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1406 #endif
1407 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1408 				for (j = 0; j < lookedup; ++j)
1409 					(void)vn_close(vpp[j], FREAD|FWRITE);
1410 				kfree(vpp, M_DEVBUF);
1411 				kfree(cpp, M_DEVBUF);
1412 				ccdunlock(cs);
1413 				return (error);
1414 			}
1415 			++lookedup;
1416 		}
1417 		ccd.ccd_cpp = cpp;
1418 		ccd.ccd_vpp = vpp;
1419 		ccd.ccd_ndev = ccio->ccio_ndisks;
1420 
1421 		/*
1422 		 * Initialize the ccd.  Fills in the softc for us.
1423 		 */
1424 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1425 			for (j = 0; j < lookedup; ++j)
1426 				(void)vn_close(vpp[j], FREAD|FWRITE);
1427 			kfree(vpp, M_DEVBUF);
1428 			kfree(cpp, M_DEVBUF);
1429 			ccdunlock(cs);
1430 			return (error);
1431 		}
1432 
1433 		/*
1434 		 * The ccd has been successfully initialized, so
1435 		 * we can place it into the array and read the disklabel.
1436 		 */
1437 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1438 		ccio->ccio_unit = unit;
1439 		ccio->ccio_size = cs->sc_size;
1440 
1441 		bzero(&info, sizeof(info));
1442 		info.d_media_blksize = cs->sc_geom.ccg_secsize;
1443 		info.d_media_blocks  = cs->sc_size;
1444 		info.d_nheads	     = cs->sc_geom.ccg_ntracks;
1445 		info.d_secpertrack   = cs->sc_geom.ccg_nsectors;
1446 		info.d_ncylinders    = cs->sc_geom.ccg_ncylinders;
1447 		info.d_secpercyl     = info.d_nheads * info.d_secpertrack;
1448 
1449 		/*
1450 		 * For cases where a label is directly applied to the ccd,
1451 		 * without slices, DSO_COMPATMBR forces one sector be
1452 		 * reserved for backwards compatibility.
1453 		 */
1454 		info.d_dsflags	     = DSO_COMPATMBR;
1455 		disk_setdiskinfo(&cs->sc_disk, &info);
1456 
1457 		ccdunlock(cs);
1458 
1459 		break;
1460 
1461 	case CCDIOCCLR:
1462 		if ((cs->sc_flags & CCDF_INITED) == 0)
1463 			return (ENXIO);
1464 
1465 		if ((ap->a_fflag & FWRITE) == 0)
1466 			return (EBADF);
1467 
1468 		if ((error = ccdlock(cs)) != 0)
1469 			return (error);
1470 
1471 		if (dev_drefs(cs->sc_dev) > 1) {
1472 			ccdunlock(cs);
1473 			return (EBUSY);
1474 		}
1475 
1476 		/*
1477 		 * Free ccd_softc information and clear entry.
1478 		 */
1479 
1480 		/* Close the components and free their pathnames. */
1481 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1482 			/*
1483 			 * XXX: this close could potentially fail and
1484 			 * cause Bad Things.  Maybe we need to force
1485 			 * the close to happen?
1486 			 */
1487 #ifdef DEBUG
1488 			if (ccddebug & CCDB_VNODE)
1489 				vprint("CCDIOCCLR: vnode info",
1490 				    cs->sc_cinfo[i].ci_vp);
1491 #endif
1492 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1493 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1494 		}
1495 
1496 		/* Free interleave index. */
1497 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1498 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1499 
1500 		/* Free component info and interleave table. */
1501 		kfree(cs->sc_cinfo, M_DEVBUF);
1502 		kfree(cs->sc_itable, M_DEVBUF);
1503 		cs->sc_cinfo = NULL;
1504 		cs->sc_itable = NULL;
1505 		cs->sc_flags &= ~CCDF_INITED;
1506 
1507 		/*
1508 		 * Free ccddevice information and clear entry.
1509 		 */
1510 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1511 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1512 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1513 
1514 		/*
1515 		 * And remove the devstat entry.
1516 		 */
1517 		devstat_remove_entry(&cs->device_stats);
1518 
1519 		/* This must be atomic. */
1520 		crit_enter();
1521 		ccdunlock(cs);
1522 		crit_exit();
1523 
1524 		break;
1525 
1526 	default:
1527 		return (ENOTTY);
1528 	}
1529 
1530 	return (0);
1531 }
1532 
/*
 * Crash-dump entry point.  Dumping to a ccd is not implemented, so the
 * request is always refused with ENXIO; 'ap' is not examined.
 */
static int
ccddump(struct dev_dump_args *ap)
{
	return (ENXIO);
}
1539 
1540 /*
1541  * Lookup the provided name in the filesystem.  If the file exists,
1542  * is a valid block device, and isn't being used by anyone else,
1543  * set *vpp to the file's vnode.
1544  */
1545 static int
1546 ccdlookup(char *path, struct vnode **vpp)
1547 {
1548 	struct nlookupdata nd;
1549 	struct vnode *vp;
1550 	int error;
1551 
1552 	*vpp = NULL;
1553 
1554 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1555 	if (error)
1556 		return (error);
1557 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1558 #ifdef DEBUG
1559 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1560 			kprintf("ccdlookup: vn_open error = %d\n", error);
1561 #endif
1562 		goto done;
1563 	}
1564 	vp = nd.nl_open_vp;
1565 
1566 	if (vp->v_opencount > 1) {
1567 		error = EBUSY;
1568 		goto done;
1569 	}
1570 
1571 	if (!vn_isdisk(vp, &error))
1572 		goto done;
1573 
1574 #ifdef DEBUG
1575 	if (ccddebug & CCDB_VNODE)
1576 		vprint("ccdlookup: vnode info", vp);
1577 #endif
1578 
1579 	vn_unlock(vp);
1580 	nd.nl_open_vp = NULL;
1581 	nlookup_done(&nd);
1582 	*vpp = vp;				/* leave ref intact  */
1583 	return (0);
1584 done:
1585 	nlookup_done(&nd);
1586 	return (error);
1587 }
1588 
1589 /*
1590  * Wait interruptibly for an exclusive lock.
1591  *
1592  * XXX
1593  * Several drivers do this; it should be abstracted and made MP-safe.
1594  */
1595 static int
1596 ccdlock(struct ccd_softc *cs)
1597 {
1598 	int error;
1599 
1600 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1601 		cs->sc_flags |= CCDF_WANTED;
1602 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1603 			return (error);
1604 	}
1605 	cs->sc_flags |= CCDF_LOCKED;
1606 	return (0);
1607 }
1608 
1609 /*
1610  * Unlock and wake up any waiters.
1611  */
1612 static void
1613 ccdunlock(struct ccd_softc *cs)
1614 {
1615 
1616 	cs->sc_flags &= ~CCDF_LOCKED;
1617 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1618 		cs->sc_flags &= ~CCDF_WANTED;
1619 		wakeup(cs);
1620 	}
1621 }
1622 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table, one line per
 * entry, followed by the component indices of that entry.  The table is
 * walked until an entry with ii_ndisk == 0 is reached.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;
	int slot;

	while (ii->ii_ndisk) {
		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		slot = 0;
		while (slot < ii->ii_ndisk) {
			kprintf(" %d", ii->ii_index[slot]);
			slot++;
		}
		kprintf("\n");

		entry++;
		ii++;
	}
}
#endif
1638 
1639 
1640 /* Local Variables: */
1641 /* c-argdecl-indent: 8 */
1642 /* c-continued-statement-offset: 8 */
1643 /* c-indent-level: 8 */
1644 /* End: */
1645