xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision b71f52a9)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
35  */
36 /*
37  * Copyright (c) 1995 Jason R. Thorpe.
38  * All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed for the NetBSD Project
51  *	by Jason R. Thorpe.
52  * 4. The name of the author may not be used to endorse or promote products
53  *    derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  * Copyright (c) 1990, 1993
71  *	The Regents of the University of California.  All rights reserved.
72  *
73  * This code is derived from software contributed to Berkeley by
74  * the Systems Programming Group of the University of Utah Computer
75  * Science Department.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  * 3. All advertising materials mentioning features or use of this software
86  *    must display the following acknowledgement:
87  *	This product includes software developed by the University of
88  *	California, Berkeley and its contributors.
89  * 4. Neither the name of the University nor the names of its contributors
90  *    may be used to endorse or promote products derived from this software
91  *    without specific prior written permission.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
103  * SUCH DAMAGE.
104  *
105  * from: Utah $Hdr: cd.c 1.6 90/11/28$
106  */
107 /*
108  * @(#)cd.c	8.2 (Berkeley) 11/16/93
109  * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
112  */
113 
114 /*
115  * "Concatenated" disk driver.
116  *
117  * Original dynamic configuration support by:
118  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
119  *	Numerical Aerodynamic Simulation Facility
120  *	Mail Stop 258-6
121  *	NASA Ames Research Center
122  *	Moffett Field, CA 94035
123  */
124 
125 #include "use_ccd.h"
126 
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
132 #include <sys/buf.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/buf2.h>
145 #include <sys/ccdvar.h>
146 
147 #include <vm/vm_zone.h>
148 
149 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
150 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
151 
152 #include <sys/thread2.h>
153 
/*
 * Building with CCDDEBUG switches on DEBUG for the declarations that
 * immediately follow.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/*
 * Bits of the ccddebug trace mask.  In the code visible in this file
 * CCDB_FOLLOW gates function-entry traces, CCDB_INIT gates configuration
 * traces and CCDB_IO gates per-request traces.
 */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* Trace mask (every category enabled by default), exported via sysctl. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * NOTE: DEBUG is undefined again right here, so the "#ifdef DEBUG"
 * sections further down this file are compiled out unless something
 * defines DEBUG again after this point.
 */
#undef DEBUG
#endif

/* Unit and partition extraction from a device; aliases of dkunit()/dkpart(). */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
172 
173 /*
174    This is how mirroring works (only writes are special):
175 
176    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
177    linked together by the cb_mirror field.  "cb_pflags &
178    CCDPF_MIRROR_DONE" is set to 0 on both of them.
179 
180    When a component returns to ccdiodone(), it checks if "cb_pflags &
181    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
182    flag and returns.  If it is, it means its partner has already
183    returned, so it will go to the regular cleanup.
184 
185  */
186 
/*
 * Per-component I/O request wrapper.
 *
 * ccdstart() passes &cb_buf.b_bio1 to vn_strategy() on cb_vp.  Idle
 * wrappers are chained through cb_freenext on the ccdfreebufs list by
 * putccdbuf() and handed out again by getccdbuf().  For mirrored writes
 * two wrappers reference each other through cb_mirror.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
200 
/* Device entry points implemented by this driver. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;

/* putccdbuf() stops caching idle ccdbufs once this many are on the list. */
#define NCCDFREEHIWAT	16

/* Major number recorded in the ccd_ops header below. */
#define CDEV_MAJOR 74

/*
 * Operations vector handed to disk_create() for every unit.  Reads and
 * writes go through physread()/physwrite(); everything else is local.
 */
static struct dev_ops ccd_ops = {
	{ "ccd", CDEV_MAJOR, D_DISK },
	.d_open =	ccdopen,
	.d_close =	ccdclose,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_ioctl =	ccdioctl,
	.d_strategy =	ccdstrategy,
	.d_dump =	ccddump
};
221 
/* called during module initialization */
static	void ccdattach (void);
static	int ccddetach (void);
static	int ccd_modevent (module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone (struct bio *bio);

/* internal helpers for configuration and request dispatch */
static	void ccdstart (struct ccd_softc *, struct bio *);
static	void ccdinterleave (struct ccd_softc *, int);
static	void ccdintr (struct ccd_softc *, struct bio *);
static	int ccdinit (struct ccddevice *, char **, struct ucred *);
static	int ccdlookup (char *, struct vnode **);
static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, off_t, caddr_t, long);
static	int ccdlock (struct ccd_softc *);
static	void ccdunlock (struct ccd_softc *);

#ifdef DEBUG
static	void printiinfo (struct ccdiinfo *);
#endif

/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;	/* per-unit state, numccd entries (ccdattach) */
struct	ccddevice *ccddevs;	/* per-unit configuration, numccd entries */
struct	ccdbuf *ccdfreebufs;	/* head of the idle ccdbuf cache */
static	int numccdfreebufs;	/* number of entries on ccdfreebufs */
static	int numccd = 0;		/* number of units allocated by ccdattach() */
250 
251 /*
252  * getccdbuf() -	Allocate and zero a ccd buffer.
253  *
254  *	This routine is called at splbio().
255  */
256 
257 static __inline
258 struct ccdbuf *
259 getccdbuf(void)
260 {
261 	struct ccdbuf *cbp;
262 
263 	/*
264 	 * Allocate from freelist or malloc as necessary
265 	 */
266 	if ((cbp = ccdfreebufs) != NULL) {
267 		ccdfreebufs = cbp->cb_freenext;
268 		--numccdfreebufs;
269 		reinitbufbio(&cbp->cb_buf);
270 	} else {
271 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
272 		initbufbio(&cbp->cb_buf);
273 	}
274 
275 	/*
276 	 * independant struct buf initialization
277 	 */
278 	buf_dep_init(&cbp->cb_buf);
279 	BUF_LOCKINIT(&cbp->cb_buf);
280 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
281 	BUF_KERNPROC(&cbp->cb_buf);
282 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
283 
284 	return(cbp);
285 }
286 
287 /*
288  * putccdbuf() -	Free a ccd buffer.
289  *
290  *	This routine is called at splbio().
291  */
292 
293 static __inline
294 void
295 putccdbuf(struct ccdbuf *cbp)
296 {
297 	BUF_UNLOCK(&cbp->cb_buf);
298 	BUF_LOCKFREE(&cbp->cb_buf);
299 
300 	if (numccdfreebufs < NCCDFREEHIWAT) {
301 		cbp->cb_freenext = ccdfreebufs;
302 		ccdfreebufs = cbp;
303 		++numccdfreebufs;
304 	} else {
305 		kfree((caddr_t)cbp, M_DEVBUF);
306 	}
307 }
308 
309 /*
310  * Called by main() during pseudo-device attachment.  All we need
311  * to do is allocate enough space for devices to be configured later, and
312  * add devsw entries.
313  */
314 static void
315 ccdattach(void)
316 {
317 	struct disk_info info;
318 	struct ccd_softc *cs;
319 	int i;
320 	int num = NCCD;
321 
322 	if (num > 1)
323 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
324 	else
325 		kprintf("ccd0: Concatenated disk driver\n");
326 
327 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
328 			    M_WAITOK | M_ZERO);
329 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
330 			    M_WAITOK | M_ZERO);
331 	numccd = num;
332 
333 	/*
334 	 * With normal disk devices the open simply fails if the media
335 	 * is not present.  With CCD we have to be able to open the
336 	 * raw disk to use the ioctl's to set it up, so create a dummy
337 	 * disk info structure so dscheck() doesn't blow up.
338 	 */
339 	bzero(&info, sizeof(info));
340 	info.d_media_blksize = DEV_BSIZE;
341 
342 	for (i = 0; i < numccd; ++i) {
343 		cs = &ccd_softc[i];
344 		cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
345 		cs->sc_dev->si_drv1 = cs;
346 		cs->sc_dev->si_iosize_max = 256 * 512;	/* XXX */
347 		disk_setdiskinfo(&cs->sc_disk, &info);
348 	}
349 }
350 
351 static int
352 ccddetach(void)
353 {
354 	struct ccd_softc *cs;
355 	struct dev_ioctl_args ioctl_args;
356 	int i;
357 	int error = 0;
358 	int eval;
359 
360 	bzero(&ioctl_args, sizeof(ioctl_args));
361 
362 	for (i = 0; i < numccd; ++i) {
363 		cs = &ccd_softc[i];
364 		if (cs->sc_dev == NULL)
365 			continue;
366 		ioctl_args.a_head.a_dev = cs->sc_dev;
367 		ioctl_args.a_cmd = CCDIOCCLR;
368 		ioctl_args.a_fflag = FWRITE;
369 		eval = ccdioctl(&ioctl_args);
370 		if (eval && eval != ENXIO) {
371 			kprintf("ccd%d: In use, cannot detach\n", i);
372 			error = EBUSY;
373 		}
374 	}
375 	if (error == 0) {
376 		for (i = 0; i < numccd; ++i) {
377 			cs = &ccd_softc[i];
378 			if (cs->sc_dev == NULL)
379 				continue;
380 			disk_destroy(&cs->sc_disk);
381 			cs->sc_dev = NULL;
382 		}
383 		if (ccd_softc)
384 			kfree(ccd_softc, M_DEVBUF);
385 		if (ccddevs)
386 			kfree(ccddevs, M_DEVBUF);
387 	}
388 	return (error);
389 }
390 
391 static int
392 ccd_modevent(module_t mod, int type, void *data)
393 {
394 	int error = 0;
395 
396 	switch (type) {
397 	case MOD_LOAD:
398 		ccdattach();
399 		break;
400 
401 	case MOD_UNLOAD:
402 		error = ccddetach();
403 		break;
404 
405 	default:	/* MOD_SHUTDOWN etc */
406 		break;
407 	}
408 	return (error);
409 }
410 
411 DEV_MODULE(ccd, ccd_modevent, NULL);
412 
413 static int
414 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
415 {
416 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
417 	struct ccdcinfo *ci = NULL;	/* XXX */
418 	int ix;
419 	struct vnode *vp;
420 	u_int64_t skip;
421 	u_int64_t size;
422 	u_int64_t minsize;
423 	int maxsecsize;
424 	struct partinfo dpart;
425 	struct ccdgeom *ccg = &cs->sc_geom;
426 	char tmppath[MAXPATHLEN];
427 	int error = 0;
428 
429 #ifdef DEBUG
430 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
432 #endif
433 
434 	cs->sc_size = 0;
435 	cs->sc_ileave = ccd->ccd_interleave;
436 	cs->sc_nccdisks = ccd->ccd_ndev;
437 
438 	/* Allocate space for the component info. */
439 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
440 				M_DEVBUF, M_WAITOK);
441 	cs->sc_maxiosize = MAXPHYS;
442 
443 	/*
444 	 * Verify that each component piece exists and record
445 	 * relevant information about it.
446 	 */
447 	maxsecsize = 0;
448 	minsize = 0;
449 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
450 		vp = ccd->ccd_vpp[ix];
451 		ci = &cs->sc_cinfo[ix];
452 		ci->ci_vp = vp;
453 
454 		/*
455 		 * Copy in the pathname of the component.
456 		 */
457 		bzero(tmppath, sizeof(tmppath));	/* sanity */
458 		if ((error = copyinstr(cpaths[ix], tmppath,
459 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
460 #ifdef DEBUG
461 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
462 				kprintf("ccd%d: can't copy path, error = %d\n",
463 				    ccd->ccd_unit, error);
464 #endif
465 			goto fail;
466 		}
467 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
468 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
469 
470 		ci->ci_dev = vn_todev(vp);
471 		if (ci->ci_dev->si_iosize_max &&
472 		    cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
473 			cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
474 		}
475 
476 		/*
477 		 * Get partition information for the component.
478 		 */
479 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD, cred);
480 		if (error) {
481 #ifdef DEBUG
482 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
483 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
484 				     ccd->ccd_unit, ci->ci_path, error);
485 #endif
486 			goto fail;
487 		}
488 		if (dpart.fstype != FS_CCD &&
489 		    !kuuid_is_ccd(&dpart.fstype_uuid)) {
490 			kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
491 				ccd->ccd_unit, ci->ci_path);
492 			error = EFTYPE;
493 			goto fail;
494 		}
495 		if (maxsecsize < dpart.media_blksize)
496 			maxsecsize = dpart.media_blksize;
497 
498 		/*
499 		 * Skip a certain amount of storage at the beginning of
500 		 * the component to make sure we don't infringe on any
501 		 * reserved sectors.  This is handled entirely by
502 		 * dpart.reserved_blocks but we also impose a minimum
503 		 * of 16 sectors for backwards compatibility.
504 		 */
505 		skip = 16;
506 		if (skip < dpart.reserved_blocks)
507 			skip = dpart.reserved_blocks;
508 		size = dpart.media_blocks - skip;
509 
510 		/*
511 		 * Calculate the size, truncating to an interleave
512 		 * boundary if necessary.
513 		 */
514 		if (cs->sc_ileave > 1)
515 			size -= size % cs->sc_ileave;
516 
517 		if ((int64_t)size <= 0) {
518 #ifdef DEBUG
519 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
520 				kprintf("ccd%d: %s: size == 0\n",
521 				    ccd->ccd_unit, ci->ci_path);
522 #endif
523 			error = ENODEV;
524 			goto fail;
525 		}
526 
527 		/*
528 		 * Calculate the smallest uniform component, used
529 		 * elsewhere.
530 		 */
531 		if (minsize == 0 || minsize > size)
532 			minsize = size;
533 		ci->ci_skip = skip;
534 		ci->ci_size = size;
535 		cs->sc_size += size;
536 	}
537 	kprintf("ccd%d: max component iosize is %d total blocks %lld\n",
538 		cs->sc_unit, cs->sc_maxiosize, (long long)cs->sc_size);
539 
540 	/*
541 	 * Don't allow the interleave to be smaller than
542 	 * the biggest component sector.
543 	 */
544 	if ((cs->sc_ileave > 0) &&
545 	    (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
546 #ifdef DEBUG
547 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
548 			kprintf("ccd%d: interleave must be at least %d\n",
549 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
550 #endif
551 		error = EINVAL;
552 		goto fail;
553 	}
554 
555 	/*
556 	 * If uniform interleave is desired set all sizes to that of
557 	 * the smallest component.  This will guarentee that a single
558 	 * interleave table is generated.
559 	 *
560 	 * Lost space must be taken into account when calculating the
561 	 * overall size.  Half the space is lost when CCDF_MIRROR is
562 	 * specified.  One disk is lost when CCDF_PARITY is specified.
563 	 */
564 	if (ccd->ccd_flags & CCDF_UNIFORM) {
565 		for (ci = cs->sc_cinfo;
566 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
567 			ci->ci_size = minsize;
568 		}
569 		if (ccd->ccd_flags & CCDF_MIRROR) {
570 			/*
571 			 * Check to see if an even number of components
572 			 * have been specified.  The interleave must also
573 			 * be non-zero in order for us to be able to
574 			 * guarentee the topology.
575 			 */
576 			if (cs->sc_nccdisks % 2) {
577 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
578 				error = EINVAL;
579 				goto fail;
580 			}
581 			if (cs->sc_ileave == 0) {
582 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
583 				error = EINVAL;
584 				goto fail;
585 			}
586 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
587 		} else if (ccd->ccd_flags & CCDF_PARITY) {
588 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
589 		} else {
590 			if (cs->sc_ileave == 0) {
591 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
592 				error = EINVAL;
593 				goto fail;
594 			}
595 			cs->sc_size = cs->sc_nccdisks * minsize;
596 		}
597 	}
598 
599 	/*
600 	 * Construct the interleave table.
601 	 */
602 	ccdinterleave(cs, ccd->ccd_unit);
603 
604 	/*
605 	 * Create pseudo-geometry based on 1MB cylinders.  It's
606 	 * pretty close.
607 	 */
608 	ccg->ccg_secsize = maxsecsize;
609 	ccg->ccg_ntracks = 1;
610 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
611 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
612 
613 	/*
614 	 * Add an devstat entry for this device.
615 	 */
616 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
617 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
618 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
619 			  DEVSTAT_PRIORITY_ARRAY);
620 
621 	cs->sc_flags |= CCDF_INITED;
622 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
623 	cs->sc_unit = ccd->ccd_unit;
624 	return (0);
625 fail:
626 	while (ci > cs->sc_cinfo) {
627 		ci--;
628 		kfree(ci->ci_path, M_DEVBUF);
629 	}
630 	kfree(cs->sc_cinfo, M_DEVBUF);
631 	cs->sc_cinfo = NULL;
632 	return (error);
633 }
634 
635 static void
636 ccdinterleave(struct ccd_softc *cs, int unit)
637 {
638 	struct ccdcinfo *ci, *smallci;
639 	struct ccdiinfo *ii;
640 	u_int64_t bn;
641 	u_int64_t lbn;
642 	u_int64_t size;
643 	int icount;
644 	int ix;
645 
646 #ifdef DEBUG
647 	if (ccddebug & CCDB_INIT)
648 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
649 #endif
650 
651 	/*
652 	 * Allocate an interleave table.  The worst case occurs when each
653 	 * of N disks is of a different size, resulting in N interleave
654 	 * tables.
655 	 *
656 	 * Chances are this is too big, but we don't care.
657 	 */
658 	icount = cs->sc_nccdisks + 1;
659 	cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
660 				M_DEVBUF, M_WAITOK|M_ZERO);
661 
662 	/*
663 	 * Trivial case: no interleave (actually interleave of disk size).
664 	 * Each table entry represents a single component in its entirety.
665 	 *
666 	 * An interleave of 0 may not be used with a mirror or parity setup.
667 	 */
668 	if (cs->sc_ileave == 0) {
669 		bn = 0;
670 		ii = cs->sc_itable;
671 
672 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
673 			/* Allocate space for ii_index. */
674 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
675 			ii->ii_ndisk = 1;
676 			ii->ii_startblk = bn;
677 			ii->ii_startoff = 0;
678 			ii->ii_index[0] = ix;
679 			bn += cs->sc_cinfo[ix].ci_size;
680 			ii++;
681 		}
682 		ii->ii_ndisk = 0;
683 #ifdef DEBUG
684 		if (ccddebug & CCDB_INIT)
685 			printiinfo(cs->sc_itable);
686 #endif
687 		return;
688 	}
689 
690 	/*
691 	 * The following isn't fast or pretty; it doesn't have to be.
692 	 */
693 	size = 0;
694 	bn = lbn = 0;
695 	for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
696 		/*
697 		 * Allocate space for ii_index.  We might allocate more then
698 		 * we use.
699 		 */
700 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
701 					M_DEVBUF, M_WAITOK);
702 
703 		/*
704 		 * Locate the smallest of the remaining components
705 		 */
706 		smallci = NULL;
707 		ci = cs->sc_cinfo;
708 		while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
709 			if (ci->ci_size > size &&
710 			    (smallci == NULL ||
711 			     ci->ci_size < smallci->ci_size)) {
712 				smallci = ci;
713 			}
714 			++ci;
715 		}
716 
717 		/*
718 		 * Nobody left, all done
719 		 */
720 		if (smallci == NULL) {
721 			ii->ii_ndisk = 0;
722 			break;
723 		}
724 
725 		/*
726 		 * Record starting logical block using an sc_ileave blocksize.
727 		 */
728 		ii->ii_startblk = bn / cs->sc_ileave;
729 
730 		/*
731 		 * Record starting component block using an sc_ileave
732 		 * blocksize.  This value is relative to the beginning of
733 		 * a component disk.
734 		 */
735 		ii->ii_startoff = lbn;
736 
737 		/*
738 		 * Determine how many disks take part in this interleave
739 		 * and record their indices.
740 		 */
741 		ix = 0;
742 		for (ci = cs->sc_cinfo;
743 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
744 			if (ci->ci_size >= smallci->ci_size) {
745 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
746 			}
747 		}
748 		ii->ii_ndisk = ix;
749 
750 		/*
751 		 * Adjust for loop
752 		 */
753 		bn += ix * (smallci->ci_size - size);
754 		lbn = smallci->ci_size / cs->sc_ileave;
755 		size = smallci->ci_size;
756 	}
757 	if (ii == &cs->sc_itable[icount])
758 		panic("ccdinterlave software bug!  table exhausted");
759 #ifdef DEBUG
760 	if (ccddebug & CCDB_INIT)
761 		printiinfo(cs->sc_itable);
762 #endif
763 }
764 
765 /* ARGSUSED */
766 static int
767 ccdopen(struct dev_open_args *ap)
768 {
769 	cdev_t dev = ap->a_head.a_dev;
770 	int unit = ccdunit(dev);
771 	struct ccd_softc *cs;
772 	int error = 0;
773 
774 #ifdef DEBUG
775 	if (ccddebug & CCDB_FOLLOW)
776 		kprintf("ccdopen(%x, %x)\n", dev, flags);
777 #endif
778 	if (unit >= numccd)
779 		return (ENXIO);
780 	cs = &ccd_softc[unit];
781 
782 	if ((error = ccdlock(cs)) == 0) {
783 		ccdunlock(cs);
784 	}
785 	return (error);
786 }
787 
788 /* ARGSUSED */
789 static int
790 ccdclose(struct dev_close_args *ap)
791 {
792 	cdev_t dev = ap->a_head.a_dev;
793 	int unit = ccdunit(dev);
794 	struct ccd_softc *cs;
795 	int error = 0;
796 
797 #ifdef DEBUG
798 	if (ccddebug & CCDB_FOLLOW)
799 		kprintf("ccdclose(%x, %x)\n", dev, flags);
800 #endif
801 
802 	if (unit >= numccd)
803 		return (ENXIO);
804 	cs = &ccd_softc[unit];
805 	if ((error = ccdlock(cs)) == 0) {
806 		ccdunlock(cs);
807 	}
808 	return (error);
809 }
810 
811 static int
812 ccdstrategy(struct dev_strategy_args *ap)
813 {
814 	cdev_t dev = ap->a_head.a_dev;
815 	struct bio *bio = ap->a_bio;
816 	int unit = ccdunit(dev);
817 	struct bio *nbio;
818 	struct buf *bp = bio->bio_buf;
819 	struct ccd_softc *cs = &ccd_softc[unit];
820 	u_int64_t pbn;	/* in sc_secsize chunks */
821 	u_int32_t sz;	/* in sc_secsize chunks */
822 
823 #ifdef DEBUG
824 	if (ccddebug & CCDB_FOLLOW)
825 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
826 #endif
827 	if ((cs->sc_flags & CCDF_INITED) == 0) {
828 		bp->b_error = ENXIO;
829 		goto error;
830 	}
831 
832 	/* If it's a nil transfer, wake up the top half now. */
833 	if (bp->b_bcount == 0) {
834 		bp->b_resid = 0;
835 		goto done;
836 	}
837 
838 	/*
839 	 * Do bounds checking and adjust transfer.  If there's an
840 	 * error, the bounds check will flag that for us.
841 	 */
842 
843 	pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
844 	sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
845 
846 	/*
847 	 * If out of bounds return an error.  If the request goes
848 	 * past EOF, clip the request as appropriate.  If exactly
849 	 * at EOF, return success (don't clip), but with 0 bytes
850 	 * of I/O.
851 	 *
852 	 * Mark EOF B_INVAL (just like bad), indicating that the
853 	 * contents of the buffer, if any, is invalid.
854 	 */
855 	if ((int64_t)pbn < 0)
856 		goto bad;
857 	if (pbn + sz > cs->sc_size) {
858 		if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
859 			goto bad;
860 		if (pbn == cs->sc_size) {
861 			bp->b_resid = bp->b_bcount;
862 			bp->b_flags |= B_INVAL;
863 			goto done;
864 		}
865 		sz = (long)(cs->sc_size - pbn);
866 		bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
867 	}
868 	nbio = bio;
869 
870 	bp->b_resid = bp->b_bcount;
871 	nbio->bio_driver_info = dev;
872 
873 	/*
874 	 * "Start" the unit.
875 	 */
876 	crit_enter();
877 	ccdstart(cs, nbio);
878 	crit_exit();
879 	return(0);
880 
881 	/*
882 	 * note: bio, not nbio, is valid at the done label.
883 	 */
884 bad:
885 	bp->b_error = EINVAL;
886 error:
887 	bp->b_resid = bp->b_bcount;
888 	bp->b_flags |= B_ERROR | B_INVAL;
889 done:
890 	biodone(bio);
891 	return(0);
892 }
893 
894 static void
895 ccdstart(struct ccd_softc *cs, struct bio *bio)
896 {
897 	long bcount, rcount;
898 	struct ccdbuf *cbp[4];
899 	struct buf *bp = bio->bio_buf;
900 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
901 	caddr_t addr;
902 	off_t doffset;
903 
904 #ifdef DEBUG
905 	if (ccddebug & CCDB_FOLLOW)
906 		kprintf("ccdstart(%x, %x)\n", cs, bp);
907 #endif
908 
909 	/* Record the transaction start  */
910 	devstat_start_transaction(&cs->device_stats);
911 
912 	/*
913 	 * Allocate component buffers and fire off the requests
914 	 */
915 	doffset = bio->bio_offset;
916 	addr = bp->b_data;
917 
918 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
919 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
920 		rcount = cbp[0]->cb_buf.b_bcount;
921 
922 		if (cs->sc_cflags & CCDF_MIRROR) {
923 			/*
924 			 * Mirroring.  Writes go to both disks, reads are
925 			 * taken from whichever disk seems most appropriate.
926 			 *
927 			 * We attempt to localize reads to the disk whos arm
928 			 * is nearest the read request.  We ignore seeks due
929 			 * to writes when making this determination and we
930 			 * also try to avoid hogging.
931 			 */
932 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
933 				vn_strategy(cbp[0]->cb_vp,
934 					    &cbp[0]->cb_buf.b_bio1);
935 				vn_strategy(cbp[1]->cb_vp,
936 					    &cbp[1]->cb_buf.b_bio1);
937 			} else {
938 				int pick = cs->sc_pick;
939 				daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
940 				if (doffset < cs->sc_blk[pick] - range ||
941 				    doffset > cs->sc_blk[pick] + range
942 				) {
943 					cs->sc_pick = pick = 1 - pick;
944 				}
945 				cs->sc_blk[pick] = doffset + rcount;
946 				vn_strategy(cbp[pick]->cb_vp,
947 					    &cbp[pick]->cb_buf.b_bio1);
948 			}
949 		} else {
950 			/*
951 			 * Not mirroring
952 			 */
953 			vn_strategy(cbp[0]->cb_vp,
954 				     &cbp[0]->cb_buf.b_bio1);
955 		}
956 		doffset += rcount;
957 		addr += rcount;
958 	}
959 }
960 
961 /*
962  * Build a component buffer header.
963  */
964 static void
965 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
966 	  off_t doffset, caddr_t addr, long bcount)
967 {
968 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
969 	struct ccdbuf *cbp;
970 	u_int64_t bn;
971 	u_int64_t cbn;
972 	u_int64_t cboff;
973 	off_t cbc;
974 
975 #ifdef DEBUG
976 	if (ccddebug & CCDB_IO)
977 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
978 		       cs, bp, bn, addr, bcount);
979 #endif
980 	/*
981 	 * Determine which component bn falls in.
982 	 */
983 	bn = doffset / cs->sc_geom.ccg_secsize;
984 	cbn = bn;
985 	cboff = 0;
986 
987 	if (cs->sc_ileave == 0) {
988 		/*
989 		 * Serially concatenated and neither a mirror nor a parity
990 		 * config.  This is a special case.
991 		 */
992 		daddr_t sblk;
993 
994 		sblk = 0;
995 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
996 			sblk += ci->ci_size;
997 		cbn -= sblk;
998 	} else {
999 		struct ccdiinfo *ii;
1000 		int ccdisk, off;
1001 
1002 		/*
1003 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
1004 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
1005 		 * to cbn.
1006 		 */
1007 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
1008 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
1009 
1010 		/*
1011 		 * Figure out which interleave table to use.
1012 		 */
1013 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1014 			if (ii->ii_startblk > cbn)
1015 				break;
1016 		}
1017 		ii--;
1018 
1019 		/*
1020 		 * off is the logical superblock relative to the beginning
1021 		 * of this interleave block.
1022 		 */
1023 		off = cbn - ii->ii_startblk;
1024 
1025 		/*
1026 		 * We must calculate which disk component to use (ccdisk),
1027 		 * and recalculate cbn to be the superblock relative to
1028 		 * the beginning of the component.  This is typically done by
1029 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
1030 		 * must typically be divided by the number of components in
1031 		 * this interleave array to be properly convert it from a
1032 		 * CCD-relative logical superblock number to a
1033 		 * component-relative superblock number.
1034 		 */
1035 		if (ii->ii_ndisk == 1) {
1036 			/*
1037 			 * When we have just one disk, it can't be a mirror
1038 			 * or a parity config.
1039 			 */
1040 			ccdisk = ii->ii_index[0];
1041 			cbn = ii->ii_startoff + off;
1042 		} else {
1043 			if (cs->sc_cflags & CCDF_MIRROR) {
1044 				/*
1045 				 * We have forced a uniform mapping, resulting
1046 				 * in a single interleave array.  We double
1047 				 * up on the first half of the available
1048 				 * components and our mirror is in the second
1049 				 * half.  This only works with a single
1050 				 * interleave array because doubling up
1051 				 * doubles the number of sectors, so there
1052 				 * cannot be another interleave array because
1053 				 * the next interleave array's calculations
1054 				 * would be off.
1055 				 */
1056 				int ndisk2 = ii->ii_ndisk / 2;
1057 				ccdisk = ii->ii_index[off % ndisk2];
1058 				cbn = ii->ii_startoff + off / ndisk2;
1059 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1060 			} else if (cs->sc_cflags & CCDF_PARITY) {
1061 				/*
1062 				 * XXX not implemented yet
1063 				 */
1064 				int ndisk2 = ii->ii_ndisk - 1;
1065 				ccdisk = ii->ii_index[off % ndisk2];
1066 				cbn = ii->ii_startoff + off / ndisk2;
1067 				if (cbn % ii->ii_ndisk <= ccdisk)
1068 					ccdisk++;
1069 			} else {
1070 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1071 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1072 			}
1073 		}
1074 
1075 		ci = &cs->sc_cinfo[ccdisk];
1076 
1077 		/*
1078 		 * Convert cbn from a superblock to a normal block so it
1079 		 * can be used to calculate (along with cboff) the normal
1080 		 * block index into this particular disk.
1081 		 */
1082 		cbn *= cs->sc_ileave;
1083 	}
1084 
1085 	/*
1086 	 * Fill in the component buf structure.
1087 	 *
1088 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1089 	 * will be truncated on device EOF so we use b_bufsize to detect
1090 	 * the case.
1091 	 */
1092 	cbp = getccdbuf();
1093 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1094 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1095 	cbp->cb_buf.b_data = addr;
1096 	cbp->cb_vp = ci->ci_vp;
1097 	if (cs->sc_ileave == 0)
1098 		cbc = dbtob((off_t)(ci->ci_size - cbn));
1099 	else
1100 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1101 	if (cbc > cs->sc_maxiosize)
1102 		cbc = cs->sc_maxiosize;
1103 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1104  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1105 
1106 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1107 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1108 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1109 
1110 	/*
1111 	 * context for ccdiodone
1112 	 */
1113 	cbp->cb_obio = bio;
1114 	cbp->cb_unit = cs - ccd_softc;
1115 	cbp->cb_comp = ci - cs->sc_cinfo;
1116 
1117 #ifdef DEBUG
1118 	if (ccddebug & CCDB_IO)
1119 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1120 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1121 		       cbp->cb_buf.b_bio1.bio_offset,
1122 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1123 #endif
1124 	cb[0] = cbp;
1125 
1126 	/*
1127 	 * Note: both I/O's setup when reading from mirror, but only one
1128 	 * will be executed.
1129 	 */
1130 	if (cs->sc_cflags & CCDF_MIRROR) {
1131 		/* mirror, setup second I/O */
1132 		cbp = getccdbuf();
1133 
1134 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1135 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1136 		cbp->cb_buf.b_data = addr;
1137 		cbp->cb_vp = ci2->ci_vp;
1138 		if (cs->sc_ileave == 0)
1139 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1140 		else
1141 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1142 		if (cbc > cs->sc_maxiosize)
1143 			cbc = cs->sc_maxiosize;
1144 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1145 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1146 
1147 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1148 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1149 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1150 
1151 		/*
1152 		 * context for ccdiodone
1153 		 */
1154 		cbp->cb_obio = bio;
1155 		cbp->cb_unit = cs - ccd_softc;
1156 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1157 		cb[1] = cbp;
1158 		/* link together the ccdbuf's and clear "mirror done" flag */
1159 		cb[0]->cb_mirror = cb[1];
1160 		cb[1]->cb_mirror = cb[0];
1161 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1162 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1163 	}
1164 }
1165 
1166 static void
1167 ccdintr(struct ccd_softc *cs, struct bio *bio)
1168 {
1169 	struct buf *bp = bio->bio_buf;
1170 
1171 #ifdef DEBUG
1172 	if (ccddebug & CCDB_FOLLOW)
1173 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1174 #endif
1175 	/*
1176 	 * Request is done for better or worse, wakeup the top half.
1177 	 */
1178 	if (bp->b_flags & B_ERROR)
1179 		bp->b_resid = bp->b_bcount;
1180 	devstat_end_transaction_buf(&cs->device_stats, bp);
1181 	biodone(bio);
1182 }
1183 
1184 /*
1185  * Called at interrupt time.
1186  * Mark the component as done and if all components are done,
1187  * take a ccd interrupt.
1188  */
1189 static void
1190 ccdiodone(struct bio *bio)
1191 {
1192 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1193 	struct bio *obio = cbp->cb_obio;
1194 	struct buf *obp = obio->bio_buf;
1195 	int unit = cbp->cb_unit;
1196 	int count;
1197 
1198 	/*
1199 	 * Since we do not have exclusive access to underlying devices,
1200 	 * we can't keep cache translations around.
1201 	 */
1202 	clearbiocache(bio->bio_next);
1203 
1204 	crit_enter();
1205 #ifdef DEBUG
1206 	if (ccddebug & CCDB_FOLLOW)
1207 		kprintf("ccdiodone(%x)\n", cbp);
1208 	if (ccddebug & CCDB_IO) {
1209 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1210 		       obp, obp->b_bcount, obp->b_resid);
1211 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1212 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1213 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1214 		       cbp->cb_buf.b_bcount);
1215 	}
1216 #endif
1217 
1218 	/*
1219 	 * If an error occured, report it.  If this is a mirrored
1220 	 * configuration and the first of two possible reads, do not
1221 	 * set the error in the bp yet because the second read may
1222 	 * succeed.
1223 	 */
1224 	if (cbp->cb_buf.b_flags & B_ERROR) {
1225 		const char *msg = "";
1226 
1227 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1228 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1229 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1230 			/*
1231 			 * We will try our read on the other disk down
1232 			 * below, also reverse the default pick so if we
1233 			 * are doing a scan we do not keep hitting the
1234 			 * bad disk first.
1235 			 */
1236 			struct ccd_softc *cs = &ccd_softc[unit];
1237 
1238 			msg = ", trying other disk";
1239 			cs->sc_pick = 1 - cs->sc_pick;
1240 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1241 		} else {
1242 			obp->b_flags |= B_ERROR;
1243 			obp->b_error = cbp->cb_buf.b_error ?
1244 			    cbp->cb_buf.b_error : EIO;
1245 		}
1246 		kprintf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1247 		       unit, obp->b_error, cbp->cb_comp,
1248 		       cbp->cb_buf.b_bio2.bio_offset,
1249 		       obio->bio_offset, msg);
1250 	}
1251 
1252 	/*
1253 	 * Process mirror.  If we are writing, I/O has been initiated on both
1254 	 * buffers and we fall through only after both are finished.
1255 	 *
1256 	 * If we are reading only one I/O is initiated at a time.  If an
1257 	 * error occurs we initiate the second I/O and return, otherwise
1258 	 * we free the second I/O without initiating it.
1259 	 */
1260 
1261 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1262 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1263 			/*
1264 			 * When writing, handshake with the second buffer
1265 			 * to determine when both are done.  If both are not
1266 			 * done, return here.
1267 			 */
1268 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1269 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1270 				putccdbuf(cbp);
1271 				crit_exit();
1272 				return;
1273 			}
1274 		} else {
1275 			/*
1276 			 * When reading, either dispose of the second buffer
1277 			 * or initiate I/O on the second buffer if an error
1278 			 * occured with this one.
1279 			 */
1280 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1281 				if (cbp->cb_buf.b_flags & B_ERROR) {
1282 					cbp->cb_mirror->cb_pflags |=
1283 					    CCDPF_MIRROR_DONE;
1284 					vn_strategy(
1285 					    cbp->cb_mirror->cb_vp,
1286 					    &cbp->cb_mirror->cb_buf.b_bio1
1287 					);
1288 					putccdbuf(cbp);
1289 					crit_exit();
1290 					return;
1291 				} else {
1292 					putccdbuf(cbp->cb_mirror);
1293 					/* fall through */
1294 				}
1295 			}
1296 		}
1297 	}
1298 
1299 	/*
1300 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1301 	 */
1302 	count = cbp->cb_buf.b_bufsize;
1303 	putccdbuf(cbp);
1304 
1305 	/*
1306 	 * If all done, "interrupt".
1307 	 */
1308 	obp->b_resid -= count;
1309 	if (obp->b_resid < 0)
1310 		panic("ccdiodone: count");
1311 	if (obp->b_resid == 0)
1312 		ccdintr(&ccd_softc[unit], obio);
1313 	crit_exit();
1314 }
1315 
1316 static int
1317 ccdioctl(struct dev_ioctl_args *ap)
1318 {
1319 	cdev_t dev = ap->a_head.a_dev;
1320 	int unit = ccdunit(dev);
1321 	int i, j, lookedup = 0, error = 0;
1322 	struct ccd_softc *cs;
1323 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1324 	struct ccddevice ccd;
1325 	struct disk_info info;
1326 	char **cpp;
1327 	struct vnode **vpp;
1328 
1329 	if (unit >= numccd)
1330 		return (ENXIO);
1331 	cs = &ccd_softc[unit];
1332 
1333 	bzero(&ccd, sizeof(ccd));
1334 
1335 	switch (ap->a_cmd) {
1336 	case CCDIOCSET:
1337 		if (cs->sc_flags & CCDF_INITED)
1338 			return (EBUSY);
1339 
1340 		if ((ap->a_fflag & FWRITE) == 0)
1341 			return (EBADF);
1342 
1343 		if ((error = ccdlock(cs)) != 0)
1344 			return (error);
1345 
1346 		if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1347 			ccdunlock(cs);
1348 			return (EINVAL);
1349 		}
1350 
1351 		/* Fill in some important bits. */
1352 		ccd.ccd_unit = unit;
1353 		ccd.ccd_interleave = ccio->ccio_ileave;
1354 		if (ccd.ccd_interleave == 0 &&
1355 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1356 		     (ccio->ccio_flags & CCDF_PARITY))) {
1357 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1358 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1359 		}
1360 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1361 		    (ccio->ccio_flags & CCDF_PARITY)) {
1362 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1363 			ccio->ccio_flags &= ~CCDF_PARITY;
1364 		}
1365 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1366 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1367 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1368 			       unit);
1369 			ccio->ccio_flags |= CCDF_UNIFORM;
1370 		}
1371 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1372 
1373 		/*
1374 		 * Allocate space for and copy in the array of
1375 		 * componet pathnames and device numbers.
1376 		 */
1377 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1378 		    M_DEVBUF, M_WAITOK);
1379 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1380 		    M_DEVBUF, M_WAITOK);
1381 
1382 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1383 				ccio->ccio_ndisks * sizeof(char **));
1384 		if (error) {
1385 			kfree(vpp, M_DEVBUF);
1386 			kfree(cpp, M_DEVBUF);
1387 			ccdunlock(cs);
1388 			return (error);
1389 		}
1390 
1391 #ifdef DEBUG
1392 		if (ccddebug & CCDB_INIT) {
1393 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1394 				kprintf("ccdioctl: component %d: 0x%x\n",
1395 				    i, cpp[i]);
1396 		}
1397 #endif
1398 
1399 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1400 #ifdef DEBUG
1401 			if (ccddebug & CCDB_INIT)
1402 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1403 #endif
1404 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1405 				for (j = 0; j < lookedup; ++j)
1406 					(void)vn_close(vpp[j], FREAD|FWRITE);
1407 				kfree(vpp, M_DEVBUF);
1408 				kfree(cpp, M_DEVBUF);
1409 				ccdunlock(cs);
1410 				return (error);
1411 			}
1412 			++lookedup;
1413 		}
1414 		ccd.ccd_cpp = cpp;
1415 		ccd.ccd_vpp = vpp;
1416 		ccd.ccd_ndev = ccio->ccio_ndisks;
1417 
1418 		/*
1419 		 * Initialize the ccd.  Fills in the softc for us.
1420 		 */
1421 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1422 			for (j = 0; j < lookedup; ++j)
1423 				(void)vn_close(vpp[j], FREAD|FWRITE);
1424 			kfree(vpp, M_DEVBUF);
1425 			kfree(cpp, M_DEVBUF);
1426 			ccdunlock(cs);
1427 			return (error);
1428 		}
1429 
1430 		/*
1431 		 * The ccd has been successfully initialized, so
1432 		 * we can place it into the array and read the disklabel.
1433 		 */
1434 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1435 		ccio->ccio_unit = unit;
1436 		ccio->ccio_size = cs->sc_size;
1437 
1438 		bzero(&info, sizeof(info));
1439 		info.d_media_blksize = cs->sc_geom.ccg_secsize;
1440 		info.d_media_blocks  = cs->sc_size;
1441 		info.d_nheads	     = cs->sc_geom.ccg_ntracks;
1442 		info.d_secpertrack   = cs->sc_geom.ccg_nsectors;
1443 		info.d_ncylinders    = cs->sc_geom.ccg_ncylinders;
1444 		info.d_secpercyl     = info.d_nheads * info.d_secpertrack;
1445 
1446 		/*
1447 		 * For cases where a label is directly applied to the ccd,
1448 		 * without slices, DSO_COMPATMBR forces one sector be
1449 		 * reserved for backwards compatibility.
1450 		 */
1451 		info.d_dsflags	     = DSO_COMPATMBR;
1452 		disk_setdiskinfo(&cs->sc_disk, &info);
1453 
1454 		ccdunlock(cs);
1455 
1456 		break;
1457 
1458 	case CCDIOCCLR:
1459 		if ((cs->sc_flags & CCDF_INITED) == 0)
1460 			return (ENXIO);
1461 
1462 		if ((ap->a_fflag & FWRITE) == 0)
1463 			return (EBADF);
1464 
1465 		if ((error = ccdlock(cs)) != 0)
1466 			return (error);
1467 
1468 		if (dev_drefs(cs->sc_dev) > 1) {
1469 			ccdunlock(cs);
1470 			return (EBUSY);
1471 		}
1472 
1473 		/*
1474 		 * Free ccd_softc information and clear entry.
1475 		 */
1476 
1477 		/* Close the components and free their pathnames. */
1478 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1479 			/*
1480 			 * XXX: this close could potentially fail and
1481 			 * cause Bad Things.  Maybe we need to force
1482 			 * the close to happen?
1483 			 */
1484 #ifdef DEBUG
1485 			if (ccddebug & CCDB_VNODE)
1486 				vprint("CCDIOCCLR: vnode info",
1487 				    cs->sc_cinfo[i].ci_vp);
1488 #endif
1489 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1490 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1491 		}
1492 
1493 		/* Free interleave index. */
1494 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1495 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1496 
1497 		/* Free component info and interleave table. */
1498 		kfree(cs->sc_cinfo, M_DEVBUF);
1499 		kfree(cs->sc_itable, M_DEVBUF);
1500 		cs->sc_cinfo = NULL;
1501 		cs->sc_itable = NULL;
1502 		cs->sc_flags &= ~CCDF_INITED;
1503 
1504 		/*
1505 		 * Free ccddevice information and clear entry.
1506 		 */
1507 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1508 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1509 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1510 
1511 		/*
1512 		 * And remove the devstat entry.
1513 		 */
1514 		devstat_remove_entry(&cs->device_stats);
1515 
1516 		/* This must be atomic. */
1517 		crit_enter();
1518 		ccdunlock(cs);
1519 		crit_exit();
1520 
1521 		break;
1522 
1523 	default:
1524 		return (ENOTTY);
1525 	}
1526 
1527 	return (0);
1528 }
1529 
/*
 * Dump entry point.  Dumping to a ccd is not implemented, so the
 * argument is ignored and every call fails with ENXIO.
 */
static int
ccddump(struct dev_dump_args *ap)
{
	return (ENXIO);
}
1536 
1537 /*
1538  * Lookup the provided name in the filesystem.  If the file exists,
1539  * is a valid block device, and isn't being used by anyone else,
1540  * set *vpp to the file's vnode.
1541  */
1542 static int
1543 ccdlookup(char *path, struct vnode **vpp)
1544 {
1545 	struct nlookupdata nd;
1546 	struct vnode *vp;
1547 	int error;
1548 
1549 	*vpp = NULL;
1550 
1551 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1552 	if (error)
1553 		return (error);
1554 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1555 #ifdef DEBUG
1556 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1557 			kprintf("ccdlookup: vn_open error = %d\n", error);
1558 #endif
1559 		goto done;
1560 	}
1561 	vp = nd.nl_open_vp;
1562 
1563 	if (vp->v_opencount > 1) {
1564 		error = EBUSY;
1565 		goto done;
1566 	}
1567 
1568 	if (!vn_isdisk(vp, &error))
1569 		goto done;
1570 
1571 #ifdef DEBUG
1572 	if (ccddebug & CCDB_VNODE)
1573 		vprint("ccdlookup: vnode info", vp);
1574 #endif
1575 
1576 	vn_unlock(vp);
1577 	nd.nl_open_vp = NULL;
1578 	nlookup_done(&nd);
1579 	*vpp = vp;				/* leave ref intact  */
1580 	return (0);
1581 done:
1582 	nlookup_done(&nd);
1583 	return (error);
1584 }
1585 
1586 /*
1587  * Wait interruptibly for an exclusive lock.
1588  *
1589  * XXX
1590  * Several drivers do this; it should be abstracted and made MP-safe.
1591  */
1592 static int
1593 ccdlock(struct ccd_softc *cs)
1594 {
1595 	int error;
1596 
1597 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1598 		cs->sc_flags |= CCDF_WANTED;
1599 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1600 			return (error);
1601 	}
1602 	cs->sc_flags |= CCDF_LOCKED;
1603 	return (0);
1604 }
1605 
1606 /*
1607  * Unlock and wake up any waiters.
1608  */
1609 static void
1610 ccdunlock(struct ccd_softc *cs)
1611 {
1612 
1613 	cs->sc_flags &= ~CCDF_LOCKED;
1614 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1615 		cs->sc_flags &= ~CCDF_WANTED;
1616 		wakeup(cs);
1617 	}
1618 }
1619 
#ifdef DEBUG
/*
 * Debug aid: print each entry of an interleave table, one per line,
 * followed by the entry's component indices.  The walk stops at the
 * first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;
	int slot;

	while (ii->ii_ndisk) {
		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);

		slot = 0;
		while (slot < ii->ii_ndisk) {
			kprintf(" %d", ii->ii_index[slot]);
			slot++;
		}
		kprintf("\n");

		entry++;
		ii++;
	}
}
#endif
1635 
1636 
1637 /* Local Variables: */
1638 /* c-argdecl-indent: 8 */
1639 /* c-continued-statement-offset: 8 */
1640 /* c-indent-level: 8 */
1641 /* End: */
1642