xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision ed5d5720)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
35  */
36 /*
37  * Copyright (c) 1995 Jason R. Thorpe.
38  * All rights reserved.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed for the NetBSD Project
51  *	by Jason R. Thorpe.
52  * 4. The name of the author may not be used to endorse or promote products
53  *    derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
56  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
57  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
58  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
59  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
60  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
61  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
62  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
63  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  * Copyright (c) 1990, 1993
71  *	The Regents of the University of California.  All rights reserved.
72  *
73  * This code is derived from software contributed to Berkeley by
74  * the Systems Programming Group of the University of Utah Computer
75  * Science Department.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  * 3. All advertising materials mentioning features or use of this software
86  *    must display the following acknowledgement:
87  *	This product includes software developed by the University of
88  *	California, Berkeley and its contributors.
89  * 4. Neither the name of the University nor the names of its contributors
90  *    may be used to endorse or promote products derived from this software
91  *    without specific prior written permission.
92  *
93  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
94  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
95  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
96  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
97  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
98  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
99  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
101  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
102  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
103  * SUCH DAMAGE.
104  *
105  * from: Utah $Hdr: cd.c 1.6 90/11/28$
106  */
107 /*
108  * @(#)cd.c	8.2 (Berkeley) 11/16/93
109  * $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $
110  * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
111  * $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.50 2007/11/06 03:50:02 dillon Exp $
112  */
113 
114 /*
115  * "Concatenated" disk driver.
116  *
117  * Original dynamic configuration support by:
118  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
119  *	Numerical Aerodynamic Simulation Facility
120  *	Mail Stop 258-6
121  *	NASA Ames Research Center
122  *	Moffett Field, CA 94035
123  */
124 
125 #include "use_ccd.h"
126 
127 #include <sys/param.h>
128 #include <sys/systm.h>
129 #include <sys/kernel.h>
130 #include <sys/module.h>
131 #include <sys/proc.h>
132 #include <sys/buf.h>
133 #include <sys/malloc.h>
134 #include <sys/nlookup.h>
135 #include <sys/conf.h>
136 #include <sys/stat.h>
137 #include <sys/sysctl.h>
138 #include <sys/disk.h>
139 #include <sys/dtype.h>
140 #include <sys/diskslice.h>
141 #include <sys/devicestat.h>
142 #include <sys/fcntl.h>
143 #include <sys/vnode.h>
144 #include <sys/buf2.h>
145 #include <sys/ccdvar.h>
146 
147 #include <vm/vm_zone.h>
148 
149 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
150 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
151 
152 #include <sys/thread2.h>
153 
/*
 * Defining CCDDEBUG implies DEBUG for the declarations just below.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
/* Bit values for the ccddebug mask tested by the debug kprintf()s. */
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
/* Every category is enabled by default; exported read-write as a sysctl int. */
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * NOTE: DEBUG is undefined again right here, so every "#ifdef DEBUG"
 * section further down is compiled out unless DEBUG gets redefined later.
 * NOTE(review): presumably intentional, since some of the debug kprintf()s
 * reference names that are not in scope (e.g. 'flags' in ccdopen()) --
 * confirm before removing.
 */
#undef DEBUG
#endif

/* Unit / partition number of a ccd device, via the generic disk macros. */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
172 
173 /*
174    This is how mirroring works (only writes are special):
175 
176    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
177    linked together by the cb_mirror field.  "cb_pflags &
178    CCDPF_MIRROR_DONE" is set to 0 on both of them.
179 
180    When a component returns to ccdiodone(), it checks if "cb_pflags &
181    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
182    flag and returns.  If it is, it means its partner has already
183    returned, so it will go to the regular cleanup.
184 
185  */
186 
/*
 * Per-component I/O request.  ccdbuffer() obtains one from getccdbuf() for
 * each piece of an original request that is sent to a component device.
 * Idle instances are chained through cb_freenext on the ccdfreebufs list
 * (see putccdbuf()).
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
200 
/* Device entry points; wired into ccd_ops below. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;

/* Upper bound on the number of idle ccdbufs putccdbuf() keeps cached. */
#define NCCDFREEHIWAT	16

/* Major device number placed in the dev_ops header below. */
#define CDEV_MAJOR 74

/*
 * Operations vector handed to disk_create() for every ccd unit in
 * ccdattach().  Reads and writes are routed through physread/physwrite;
 * everything else goes to the driver's own handlers.
 */
static struct dev_ops ccd_ops = {
	{ "ccd", CDEV_MAJOR, D_DISK },
	.d_open =	ccdopen,
	.d_close =	ccdclose,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_ioctl =	ccdioctl,
	.d_strategy =	ccdstrategy,
	.d_dump =	ccddump
};
221 
222 /* called during module initialization */
223 static	void ccdattach (void);
224 static	int ccd_modevent (module_t, int, void *);
225 
226 /* called by biodone() at interrupt time */
227 static	void ccdiodone (struct bio *bio);
228 
229 static	void ccdstart (struct ccd_softc *, struct bio *);
230 static	void ccdinterleave (struct ccd_softc *, int);
231 static	void ccdintr (struct ccd_softc *, struct bio *);
232 static	int ccdinit (struct ccddevice *, char **, struct ucred *);
233 static	int ccdlookup (char *, struct vnode **);
234 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
235 		struct bio *, off_t, caddr_t, long);
236 static	int ccdlock (struct ccd_softc *);
237 static	void ccdunlock (struct ccd_softc *);
238 
239 #ifdef DEBUG
240 static	void printiinfo (struct ccdiinfo *);
241 #endif
242 
243 /* Non-private for the benefit of libkvm. */
244 struct	ccd_softc *ccd_softc;
245 struct	ccddevice *ccddevs;
246 struct	ccdbuf *ccdfreebufs;
247 static	int numccdfreebufs;
248 static	int numccd = 0;
249 
250 /*
251  * getccdbuf() -	Allocate and zero a ccd buffer.
252  *
253  *	This routine is called at splbio().
254  */
255 
256 static __inline
257 struct ccdbuf *
258 getccdbuf(void)
259 {
260 	struct ccdbuf *cbp;
261 
262 	/*
263 	 * Allocate from freelist or malloc as necessary
264 	 */
265 	if ((cbp = ccdfreebufs) != NULL) {
266 		ccdfreebufs = cbp->cb_freenext;
267 		--numccdfreebufs;
268 		reinitbufbio(&cbp->cb_buf);
269 	} else {
270 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
271 		initbufbio(&cbp->cb_buf);
272 	}
273 
274 	/*
275 	 * independant struct buf initialization
276 	 */
277 	buf_dep_init(&cbp->cb_buf);
278 	BUF_LOCKINIT(&cbp->cb_buf);
279 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
280 	BUF_KERNPROC(&cbp->cb_buf);
281 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
282 
283 	return(cbp);
284 }
285 
286 /*
287  * putccdbuf() -	Free a ccd buffer.
288  *
289  *	This routine is called at splbio().
290  */
291 
292 static __inline
293 void
294 putccdbuf(struct ccdbuf *cbp)
295 {
296 	BUF_UNLOCK(&cbp->cb_buf);
297 	BUF_LOCKFREE(&cbp->cb_buf);
298 
299 	if (numccdfreebufs < NCCDFREEHIWAT) {
300 		cbp->cb_freenext = ccdfreebufs;
301 		ccdfreebufs = cbp;
302 		++numccdfreebufs;
303 	} else {
304 		kfree((caddr_t)cbp, M_DEVBUF);
305 	}
306 }
307 
308 /*
309  * Called by main() during pseudo-device attachment.  All we need
310  * to do is allocate enough space for devices to be configured later, and
311  * add devsw entries.
312  */
313 static void
314 ccdattach(void)
315 {
316 	struct disk_info info;
317 	struct ccd_softc *cs;
318 	int i;
319 	int num = NCCD;
320 
321 	if (num > 1)
322 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
323 	else
324 		kprintf("ccd0: Concatenated disk driver\n");
325 
326 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
327 			    M_WAITOK | M_ZERO);
328 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
329 			    M_WAITOK | M_ZERO);
330 	numccd = num;
331 
332 	/*
333 	 * With normal disk devices the open simply fails if the media
334 	 * is not present.  With CCD we have to be able to open the
335 	 * raw disk to use the ioctl's to set it up, so create a dummy
336 	 * disk info structure so dscheck() doesn't blow up.
337 	 */
338 	bzero(&info, sizeof(info));
339 	info.d_media_blksize = DEV_BSIZE;
340 
341 	for (i = 0; i < numccd; ++i) {
342 		cs = &ccd_softc[i];
343 		cs->sc_dev = disk_create(i, &cs->sc_disk, &ccd_ops);
344 		cs->sc_dev->si_drv1 = cs;
345 		cs->sc_dev->si_iosize_max = 256 * 512;	/* XXX */
346 		disk_setdiskinfo(&cs->sc_disk, &info);
347 	}
348 }
349 
350 static int
351 ccd_modevent(module_t mod, int type, void *data)
352 {
353 	int error = 0;
354 
355 	switch (type) {
356 	case MOD_LOAD:
357 		ccdattach();
358 		break;
359 
360 	case MOD_UNLOAD:
361 		kprintf("ccd0: Unload not supported!\n");
362 		error = EOPNOTSUPP;
363 		break;
364 
365 	default:	/* MOD_SHUTDOWN etc */
366 		break;
367 	}
368 	return (error);
369 }
370 
371 DEV_MODULE(ccd, ccd_modevent, NULL);
372 
373 static int
374 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
375 {
376 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
377 	struct ccdcinfo *ci = NULL;	/* XXX */
378 	int ix;
379 	struct vnode *vp;
380 	u_int64_t skip;
381 	u_int64_t size;
382 	u_int64_t minsize;
383 	int maxsecsize;
384 	struct partinfo dpart;
385 	struct ccdgeom *ccg = &cs->sc_geom;
386 	char tmppath[MAXPATHLEN];
387 	int error = 0;
388 
389 #ifdef DEBUG
390 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
391 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
392 #endif
393 
394 	cs->sc_size = 0;
395 	cs->sc_ileave = ccd->ccd_interleave;
396 	cs->sc_nccdisks = ccd->ccd_ndev;
397 
398 	/* Allocate space for the component info. */
399 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
400 				M_DEVBUF, M_WAITOK);
401 	cs->sc_maxiosize = MAXPHYS;
402 
403 	/*
404 	 * Verify that each component piece exists and record
405 	 * relevant information about it.
406 	 */
407 	maxsecsize = 0;
408 	minsize = 0;
409 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
410 		vp = ccd->ccd_vpp[ix];
411 		ci = &cs->sc_cinfo[ix];
412 		ci->ci_vp = vp;
413 
414 		/*
415 		 * Copy in the pathname of the component.
416 		 */
417 		bzero(tmppath, sizeof(tmppath));	/* sanity */
418 		if ((error = copyinstr(cpaths[ix], tmppath,
419 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
420 #ifdef DEBUG
421 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
422 				kprintf("ccd%d: can't copy path, error = %d\n",
423 				    ccd->ccd_unit, error);
424 #endif
425 			goto fail;
426 		}
427 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
428 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
429 
430 		ci->ci_dev = vn_todev(vp);
431 		if (ci->ci_dev->si_iosize_max &&
432 		    cs->sc_maxiosize > ci->ci_dev->si_iosize_max) {
433 			cs->sc_maxiosize = ci->ci_dev->si_iosize_max;
434 		}
435 
436 		/*
437 		 * Get partition information for the component.
438 		 */
439 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, FREAD, cred);
440 		if (error) {
441 #ifdef DEBUG
442 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
443 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
444 				     ccd->ccd_unit, ci->ci_path, error);
445 #endif
446 			goto fail;
447 		}
448 		if (dpart.fstype != FS_CCD &&
449 		    !kuuid_is_ccd(&dpart.fstype_uuid)) {
450 			kprintf("ccd%d: %s: filesystem type must be 'ccd'\n",
451 				ccd->ccd_unit, ci->ci_path);
452 			error = EFTYPE;
453 			goto fail;
454 		}
455 		if (maxsecsize < dpart.media_blksize)
456 			maxsecsize = dpart.media_blksize;
457 
458 		/*
459 		 * Skip a certain amount of storage at the beginning of
460 		 * the component to make sure we don't infringe on any
461 		 * reserved sectors.  This is handled entirely by
462 		 * dpart.reserved_blocks but we also impose a minimum
463 		 * of 16 sectors for backwards compatibility.
464 		 */
465 		skip = 16;
466 		if (skip < dpart.reserved_blocks)
467 			skip = dpart.reserved_blocks;
468 		size = dpart.media_blocks - skip;
469 
470 		/*
471 		 * Calculate the size, truncating to an interleave
472 		 * boundary if necessary.
473 		 */
474 		if (cs->sc_ileave > 1)
475 			size -= size % cs->sc_ileave;
476 
477 		if ((int64_t)size <= 0) {
478 #ifdef DEBUG
479 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
480 				kprintf("ccd%d: %s: size == 0\n",
481 				    ccd->ccd_unit, ci->ci_path);
482 #endif
483 			error = ENODEV;
484 			goto fail;
485 		}
486 
487 		/*
488 		 * Calculate the smallest uniform component, used
489 		 * elsewhere.
490 		 */
491 		if (minsize == 0 || minsize > size)
492 			minsize = size;
493 		ci->ci_skip = skip;
494 		ci->ci_size = size;
495 		cs->sc_size += size;
496 	}
497 	kprintf("ccd%d: max component iosize is %d\n",
498 		cs->sc_unit, cs->sc_maxiosize);
499 
500 	/*
501 	 * Don't allow the interleave to be smaller than
502 	 * the biggest component sector.
503 	 */
504 	if ((cs->sc_ileave > 0) &&
505 	    (cs->sc_ileave % (maxsecsize / DEV_BSIZE))) {
506 #ifdef DEBUG
507 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
508 			kprintf("ccd%d: interleave must be at least %d\n",
509 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
510 #endif
511 		error = EINVAL;
512 		goto fail;
513 	}
514 
515 	/*
516 	 * If uniform interleave is desired set all sizes to that of
517 	 * the smallest component.  This will guarentee that a single
518 	 * interleave table is generated.
519 	 *
520 	 * Lost space must be taken into account when calculating the
521 	 * overall size.  Half the space is lost when CCDF_MIRROR is
522 	 * specified.  One disk is lost when CCDF_PARITY is specified.
523 	 */
524 	if (ccd->ccd_flags & CCDF_UNIFORM) {
525 		for (ci = cs->sc_cinfo;
526 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
527 			ci->ci_size = minsize;
528 		}
529 		if (ccd->ccd_flags & CCDF_MIRROR) {
530 			/*
531 			 * Check to see if an even number of components
532 			 * have been specified.  The interleave must also
533 			 * be non-zero in order for us to be able to
534 			 * guarentee the topology.
535 			 */
536 			if (cs->sc_nccdisks % 2) {
537 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
538 				error = EINVAL;
539 				goto fail;
540 			}
541 			if (cs->sc_ileave == 0) {
542 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
543 				error = EINVAL;
544 				goto fail;
545 			}
546 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
547 		} else if (ccd->ccd_flags & CCDF_PARITY) {
548 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
549 		} else {
550 			if (cs->sc_ileave == 0) {
551 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
552 				error = EINVAL;
553 				goto fail;
554 			}
555 			cs->sc_size = cs->sc_nccdisks * minsize;
556 		}
557 	}
558 
559 	/*
560 	 * Construct the interleave table.
561 	 */
562 	ccdinterleave(cs, ccd->ccd_unit);
563 
564 	/*
565 	 * Create pseudo-geometry based on 1MB cylinders.  It's
566 	 * pretty close.
567 	 */
568 	ccg->ccg_secsize = maxsecsize;
569 	ccg->ccg_ntracks = 1;
570 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
571 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
572 
573 	/*
574 	 * Add an devstat entry for this device.
575 	 */
576 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
577 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
578 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
579 			  DEVSTAT_PRIORITY_ARRAY);
580 
581 	cs->sc_flags |= CCDF_INITED;
582 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
583 	cs->sc_unit = ccd->ccd_unit;
584 	return (0);
585 fail:
586 	while (ci > cs->sc_cinfo) {
587 		ci--;
588 		kfree(ci->ci_path, M_DEVBUF);
589 	}
590 	kfree(cs->sc_cinfo, M_DEVBUF);
591 	cs->sc_cinfo = NULL;
592 	return (error);
593 }
594 
595 static void
596 ccdinterleave(struct ccd_softc *cs, int unit)
597 {
598 	struct ccdcinfo *ci, *smallci;
599 	struct ccdiinfo *ii;
600 	u_int64_t bn;
601 	u_int64_t lbn;
602 	u_int64_t size;
603 	int icount;
604 	int ix;
605 
606 #ifdef DEBUG
607 	if (ccddebug & CCDB_INIT)
608 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
609 #endif
610 
611 	/*
612 	 * Allocate an interleave table.  The worst case occurs when each
613 	 * of N disks is of a different size, resulting in N interleave
614 	 * tables.
615 	 *
616 	 * Chances are this is too big, but we don't care.
617 	 */
618 	icount = cs->sc_nccdisks + 1;
619 	cs->sc_itable = kmalloc(icount * sizeof(struct ccdiinfo),
620 				M_DEVBUF, M_WAITOK|M_ZERO);
621 
622 	/*
623 	 * Trivial case: no interleave (actually interleave of disk size).
624 	 * Each table entry represents a single component in its entirety.
625 	 *
626 	 * An interleave of 0 may not be used with a mirror or parity setup.
627 	 */
628 	if (cs->sc_ileave == 0) {
629 		bn = 0;
630 		ii = cs->sc_itable;
631 
632 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
633 			/* Allocate space for ii_index. */
634 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
635 			ii->ii_ndisk = 1;
636 			ii->ii_startblk = bn;
637 			ii->ii_startoff = 0;
638 			ii->ii_index[0] = ix;
639 			bn += cs->sc_cinfo[ix].ci_size;
640 			ii++;
641 		}
642 		ii->ii_ndisk = 0;
643 #ifdef DEBUG
644 		if (ccddebug & CCDB_INIT)
645 			printiinfo(cs->sc_itable);
646 #endif
647 		return;
648 	}
649 
650 	/*
651 	 * The following isn't fast or pretty; it doesn't have to be.
652 	 */
653 	size = 0;
654 	bn = lbn = 0;
655 	for (ii = cs->sc_itable; ii < &cs->sc_itable[icount]; ++ii) {
656 		/*
657 		 * Allocate space for ii_index.  We might allocate more then
658 		 * we use.
659 		 */
660 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
661 					M_DEVBUF, M_WAITOK);
662 
663 		/*
664 		 * Locate the smallest of the remaining components
665 		 */
666 		smallci = NULL;
667 		ci = cs->sc_cinfo;
668 		while (ci < &cs->sc_cinfo[cs->sc_nccdisks]) {
669 			if (ci->ci_size > size &&
670 			    (smallci == NULL ||
671 			     ci->ci_size < smallci->ci_size)) {
672 				smallci = ci;
673 			}
674 			++ci;
675 		}
676 
677 		/*
678 		 * Nobody left, all done
679 		 */
680 		if (smallci == NULL) {
681 			ii->ii_ndisk = 0;
682 			break;
683 		}
684 
685 		/*
686 		 * Record starting logical block using an sc_ileave blocksize.
687 		 */
688 		ii->ii_startblk = bn / cs->sc_ileave;
689 
690 		/*
691 		 * Record starting component block using an sc_ileave
692 		 * blocksize.  This value is relative to the beginning of
693 		 * a component disk.
694 		 */
695 		ii->ii_startoff = lbn;
696 
697 		/*
698 		 * Determine how many disks take part in this interleave
699 		 * and record their indices.
700 		 */
701 		ix = 0;
702 		for (ci = cs->sc_cinfo;
703 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
704 			if (ci->ci_size >= smallci->ci_size) {
705 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
706 			}
707 		}
708 		ii->ii_ndisk = ix;
709 
710 		/*
711 		 * Adjust for loop
712 		 */
713 		bn += ix * (smallci->ci_size - size);
714 		lbn = smallci->ci_size / cs->sc_ileave;
715 		size = smallci->ci_size;
716 	}
717 	if (ii == &cs->sc_itable[icount])
718 		panic("ccdinterlave software bug!  table exhausted");
719 #ifdef DEBUG
720 	if (ccddebug & CCDB_INIT)
721 		printiinfo(cs->sc_itable);
722 #endif
723 }
724 
725 /* ARGSUSED */
726 static int
727 ccdopen(struct dev_open_args *ap)
728 {
729 	cdev_t dev = ap->a_head.a_dev;
730 	int unit = ccdunit(dev);
731 	struct ccd_softc *cs;
732 	int error = 0;
733 
734 #ifdef DEBUG
735 	if (ccddebug & CCDB_FOLLOW)
736 		kprintf("ccdopen(%x, %x)\n", dev, flags);
737 #endif
738 	if (unit >= numccd)
739 		return (ENXIO);
740 	cs = &ccd_softc[unit];
741 
742 	if ((error = ccdlock(cs)) == 0) {
743 		ccdunlock(cs);
744 	}
745 	return (error);
746 }
747 
748 /* ARGSUSED */
749 static int
750 ccdclose(struct dev_close_args *ap)
751 {
752 	cdev_t dev = ap->a_head.a_dev;
753 	int unit = ccdunit(dev);
754 	struct ccd_softc *cs;
755 	int error = 0;
756 
757 #ifdef DEBUG
758 	if (ccddebug & CCDB_FOLLOW)
759 		kprintf("ccdclose(%x, %x)\n", dev, flags);
760 #endif
761 
762 	if (unit >= numccd)
763 		return (ENXIO);
764 	cs = &ccd_softc[unit];
765 	if ((error = ccdlock(cs)) == 0) {
766 		ccdunlock(cs);
767 	}
768 	return (error);
769 }
770 
771 static int
772 ccdstrategy(struct dev_strategy_args *ap)
773 {
774 	cdev_t dev = ap->a_head.a_dev;
775 	struct bio *bio = ap->a_bio;
776 	int unit = ccdunit(dev);
777 	struct bio *nbio;
778 	struct buf *bp = bio->bio_buf;
779 	struct ccd_softc *cs = &ccd_softc[unit];
780 	u_int64_t pbn;	/* in sc_secsize chunks */
781 	u_int32_t sz;	/* in sc_secsize chunks */
782 
783 #ifdef DEBUG
784 	if (ccddebug & CCDB_FOLLOW)
785 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
786 #endif
787 	if ((cs->sc_flags & CCDF_INITED) == 0) {
788 		bp->b_error = ENXIO;
789 		goto error;
790 	}
791 
792 	/* If it's a nil transfer, wake up the top half now. */
793 	if (bp->b_bcount == 0) {
794 		bp->b_resid = 0;
795 		goto done;
796 	}
797 
798 	/*
799 	 * Do bounds checking and adjust transfer.  If there's an
800 	 * error, the bounds check will flag that for us.
801 	 */
802 
803 	pbn = bio->bio_offset / cs->sc_geom.ccg_secsize;
804 	sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
805 
806 	/*
807 	 * If out of bounds return an error.  If the request goes
808 	 * past EOF, clip the request as appropriate.  If exactly
809 	 * at EOF, return success (don't clip), but with 0 bytes
810 	 * of I/O.
811 	 *
812 	 * Mark EOF B_INVAL (just like bad), indicating that the
813 	 * contents of the buffer, if any, is invalid.
814 	 */
815 	if ((int64_t)pbn < 0)
816 		goto bad;
817 	if (pbn + sz > cs->sc_size) {
818 		if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
819 			goto bad;
820 		if (pbn == cs->sc_size) {
821 			bp->b_resid = bp->b_bcount;
822 			bp->b_flags |= B_INVAL;
823 			goto done;
824 		}
825 		sz = (long)(cs->sc_size - pbn);
826 		bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
827 	}
828 	nbio = bio;
829 
830 	bp->b_resid = bp->b_bcount;
831 	nbio->bio_driver_info = dev;
832 
833 	/*
834 	 * "Start" the unit.
835 	 */
836 	crit_enter();
837 	ccdstart(cs, nbio);
838 	crit_exit();
839 	return(0);
840 
841 	/*
842 	 * note: bio, not nbio, is valid at the done label.
843 	 */
844 bad:
845 	bp->b_error = EINVAL;
846 error:
847 	bp->b_resid = bp->b_bcount;
848 	bp->b_flags |= B_ERROR | B_INVAL;
849 done:
850 	biodone(bio);
851 	return(0);
852 }
853 
854 static void
855 ccdstart(struct ccd_softc *cs, struct bio *bio)
856 {
857 	long bcount, rcount;
858 	struct ccdbuf *cbp[4];
859 	struct buf *bp = bio->bio_buf;
860 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
861 	caddr_t addr;
862 	off_t doffset;
863 
864 #ifdef DEBUG
865 	if (ccddebug & CCDB_FOLLOW)
866 		kprintf("ccdstart(%x, %x)\n", cs, bp);
867 #endif
868 
869 	/* Record the transaction start  */
870 	devstat_start_transaction(&cs->device_stats);
871 
872 	/*
873 	 * Allocate component buffers and fire off the requests
874 	 */
875 	doffset = bio->bio_offset;
876 	addr = bp->b_data;
877 
878 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
879 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
880 		rcount = cbp[0]->cb_buf.b_bcount;
881 
882 		if (cs->sc_cflags & CCDF_MIRROR) {
883 			/*
884 			 * Mirroring.  Writes go to both disks, reads are
885 			 * taken from whichever disk seems most appropriate.
886 			 *
887 			 * We attempt to localize reads to the disk whos arm
888 			 * is nearest the read request.  We ignore seeks due
889 			 * to writes when making this determination and we
890 			 * also try to avoid hogging.
891 			 */
892 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
893 				vn_strategy(cbp[0]->cb_vp,
894 					    &cbp[0]->cb_buf.b_bio1);
895 				vn_strategy(cbp[1]->cb_vp,
896 					    &cbp[1]->cb_buf.b_bio1);
897 			} else {
898 				int pick = cs->sc_pick;
899 				daddr_t range = cs->sc_size / 16 * cs->sc_geom.ccg_secsize;
900 				if (doffset < cs->sc_blk[pick] - range ||
901 				    doffset > cs->sc_blk[pick] + range
902 				) {
903 					cs->sc_pick = pick = 1 - pick;
904 				}
905 				cs->sc_blk[pick] = doffset + rcount;
906 				vn_strategy(cbp[pick]->cb_vp,
907 					    &cbp[pick]->cb_buf.b_bio1);
908 			}
909 		} else {
910 			/*
911 			 * Not mirroring
912 			 */
913 			vn_strategy(cbp[0]->cb_vp,
914 				     &cbp[0]->cb_buf.b_bio1);
915 		}
916 		doffset += rcount;
917 		addr += rcount;
918 	}
919 }
920 
921 /*
922  * Build a component buffer header.
923  */
924 static void
925 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
926 	  off_t doffset, caddr_t addr, long bcount)
927 {
928 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
929 	struct ccdbuf *cbp;
930 	u_int64_t bn;
931 	u_int64_t cbn;
932 	u_int64_t cboff;
933 	off_t cbc;
934 
935 #ifdef DEBUG
936 	if (ccddebug & CCDB_IO)
937 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
938 		       cs, bp, bn, addr, bcount);
939 #endif
940 	/*
941 	 * Determine which component bn falls in.
942 	 */
943 	bn = doffset / cs->sc_geom.ccg_secsize;
944 	cbn = bn;
945 	cboff = 0;
946 
947 	if (cs->sc_ileave == 0) {
948 		/*
949 		 * Serially concatenated and neither a mirror nor a parity
950 		 * config.  This is a special case.
951 		 */
952 		daddr_t sblk;
953 
954 		sblk = 0;
955 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
956 			sblk += ci->ci_size;
957 		cbn -= sblk;
958 	} else {
959 		struct ccdiinfo *ii;
960 		int ccdisk, off;
961 
962 		/*
963 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
964 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
965 		 * to cbn.
966 		 */
967 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
968 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
969 
970 		/*
971 		 * Figure out which interleave table to use.
972 		 */
973 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
974 			if (ii->ii_startblk > cbn)
975 				break;
976 		}
977 		ii--;
978 
979 		/*
980 		 * off is the logical superblock relative to the beginning
981 		 * of this interleave block.
982 		 */
983 		off = cbn - ii->ii_startblk;
984 
985 		/*
986 		 * We must calculate which disk component to use (ccdisk),
987 		 * and recalculate cbn to be the superblock relative to
988 		 * the beginning of the component.  This is typically done by
989 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
990 		 * must typically be divided by the number of components in
991 		 * this interleave array to be properly convert it from a
992 		 * CCD-relative logical superblock number to a
993 		 * component-relative superblock number.
994 		 */
995 		if (ii->ii_ndisk == 1) {
996 			/*
997 			 * When we have just one disk, it can't be a mirror
998 			 * or a parity config.
999 			 */
1000 			ccdisk = ii->ii_index[0];
1001 			cbn = ii->ii_startoff + off;
1002 		} else {
1003 			if (cs->sc_cflags & CCDF_MIRROR) {
1004 				/*
1005 				 * We have forced a uniform mapping, resulting
1006 				 * in a single interleave array.  We double
1007 				 * up on the first half of the available
1008 				 * components and our mirror is in the second
1009 				 * half.  This only works with a single
1010 				 * interleave array because doubling up
1011 				 * doubles the number of sectors, so there
1012 				 * cannot be another interleave array because
1013 				 * the next interleave array's calculations
1014 				 * would be off.
1015 				 */
1016 				int ndisk2 = ii->ii_ndisk / 2;
1017 				ccdisk = ii->ii_index[off % ndisk2];
1018 				cbn = ii->ii_startoff + off / ndisk2;
1019 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1020 			} else if (cs->sc_cflags & CCDF_PARITY) {
1021 				/*
1022 				 * XXX not implemented yet
1023 				 */
1024 				int ndisk2 = ii->ii_ndisk - 1;
1025 				ccdisk = ii->ii_index[off % ndisk2];
1026 				cbn = ii->ii_startoff + off / ndisk2;
1027 				if (cbn % ii->ii_ndisk <= ccdisk)
1028 					ccdisk++;
1029 			} else {
1030 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1031 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1032 			}
1033 		}
1034 
1035 		ci = &cs->sc_cinfo[ccdisk];
1036 
1037 		/*
1038 		 * Convert cbn from a superblock to a normal block so it
1039 		 * can be used to calculate (along with cboff) the normal
1040 		 * block index into this particular disk.
1041 		 */
1042 		cbn *= cs->sc_ileave;
1043 	}
1044 
1045 	/*
1046 	 * Fill in the component buf structure.
1047 	 *
1048 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1049 	 * will be truncated on device EOF so we use b_bufsize to detect
1050 	 * the case.
1051 	 */
1052 	cbp = getccdbuf();
1053 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1054 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1055 	cbp->cb_buf.b_data = addr;
1056 	cbp->cb_vp = ci->ci_vp;
1057 	if (cs->sc_ileave == 0)
1058 		cbc = dbtob((off_t)(ci->ci_size - cbn));
1059 	else
1060 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1061 	if (cbc > cs->sc_maxiosize)
1062 		cbc = cs->sc_maxiosize;
1063 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1064  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1065 
1066 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1067 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1068 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci->ci_skip);
1069 
1070 	/*
1071 	 * context for ccdiodone
1072 	 */
1073 	cbp->cb_obio = bio;
1074 	cbp->cb_unit = cs - ccd_softc;
1075 	cbp->cb_comp = ci - cs->sc_cinfo;
1076 
1077 #ifdef DEBUG
1078 	if (ccddebug & CCDB_IO)
1079 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1080 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1081 		       cbp->cb_buf.b_bio1.bio_offset,
1082 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1083 #endif
1084 	cb[0] = cbp;
1085 
1086 	/*
1087 	 * Note: both I/O's setup when reading from mirror, but only one
1088 	 * will be executed.
1089 	 */
1090 	if (cs->sc_cflags & CCDF_MIRROR) {
1091 		/* mirror, setup second I/O */
1092 		cbp = getccdbuf();
1093 
1094 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1095 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1096 		cbp->cb_buf.b_data = addr;
1097 		cbp->cb_vp = ci2->ci_vp;
1098 		if (cs->sc_ileave == 0)
1099 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1100 		else
1101 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1102 		if (cbc > cs->sc_maxiosize)
1103 			cbc = cs->sc_maxiosize;
1104 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1105 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1106 
1107 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1108 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1109 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + ci2->ci_skip);
1110 
1111 		/*
1112 		 * context for ccdiodone
1113 		 */
1114 		cbp->cb_obio = bio;
1115 		cbp->cb_unit = cs - ccd_softc;
1116 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1117 		cb[1] = cbp;
1118 		/* link together the ccdbuf's and clear "mirror done" flag */
1119 		cb[0]->cb_mirror = cb[1];
1120 		cb[1]->cb_mirror = cb[0];
1121 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1122 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1123 	}
1124 }
1125 
1126 static void
1127 ccdintr(struct ccd_softc *cs, struct bio *bio)
1128 {
1129 	struct buf *bp = bio->bio_buf;
1130 
1131 #ifdef DEBUG
1132 	if (ccddebug & CCDB_FOLLOW)
1133 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1134 #endif
1135 	/*
1136 	 * Request is done for better or worse, wakeup the top half.
1137 	 */
1138 	if (bp->b_flags & B_ERROR)
1139 		bp->b_resid = bp->b_bcount;
1140 	devstat_end_transaction_buf(&cs->device_stats, bp);
1141 	biodone(bio);
1142 }
1143 
1144 /*
1145  * Called at interrupt time.
1146  * Mark the component as done and if all components are done,
1147  * take a ccd interrupt.
1148  */
1149 static void
1150 ccdiodone(struct bio *bio)
1151 {
1152 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1153 	struct bio *obio = cbp->cb_obio;
1154 	struct buf *obp = obio->bio_buf;
1155 	int unit = cbp->cb_unit;
1156 	int count;
1157 
1158 	/*
1159 	 * Since we do not have exclusive access to underlying devices,
1160 	 * we can't keep cache translations around.
1161 	 */
1162 	clearbiocache(bio->bio_next);
1163 
1164 	crit_enter();
1165 #ifdef DEBUG
1166 	if (ccddebug & CCDB_FOLLOW)
1167 		kprintf("ccdiodone(%x)\n", cbp);
1168 	if (ccddebug & CCDB_IO) {
1169 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1170 		       obp, obp->b_bcount, obp->b_resid);
1171 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1172 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1173 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1174 		       cbp->cb_buf.b_bcount);
1175 	}
1176 #endif
1177 
1178 	/*
1179 	 * If an error occured, report it.  If this is a mirrored
1180 	 * configuration and the first of two possible reads, do not
1181 	 * set the error in the bp yet because the second read may
1182 	 * succeed.
1183 	 */
1184 	if (cbp->cb_buf.b_flags & B_ERROR) {
1185 		const char *msg = "";
1186 
1187 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1188 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1189 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1190 			/*
1191 			 * We will try our read on the other disk down
1192 			 * below, also reverse the default pick so if we
1193 			 * are doing a scan we do not keep hitting the
1194 			 * bad disk first.
1195 			 */
1196 			struct ccd_softc *cs = &ccd_softc[unit];
1197 
1198 			msg = ", trying other disk";
1199 			cs->sc_pick = 1 - cs->sc_pick;
1200 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1201 		} else {
1202 			obp->b_flags |= B_ERROR;
1203 			obp->b_error = cbp->cb_buf.b_error ?
1204 			    cbp->cb_buf.b_error : EIO;
1205 		}
1206 		kprintf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1207 		       unit, obp->b_error, cbp->cb_comp,
1208 		       cbp->cb_buf.b_bio2.bio_offset,
1209 		       obio->bio_offset, msg);
1210 	}
1211 
1212 	/*
1213 	 * Process mirror.  If we are writing, I/O has been initiated on both
1214 	 * buffers and we fall through only after both are finished.
1215 	 *
1216 	 * If we are reading only one I/O is initiated at a time.  If an
1217 	 * error occurs we initiate the second I/O and return, otherwise
1218 	 * we free the second I/O without initiating it.
1219 	 */
1220 
1221 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1222 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1223 			/*
1224 			 * When writing, handshake with the second buffer
1225 			 * to determine when both are done.  If both are not
1226 			 * done, return here.
1227 			 */
1228 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1229 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1230 				putccdbuf(cbp);
1231 				crit_exit();
1232 				return;
1233 			}
1234 		} else {
1235 			/*
1236 			 * When reading, either dispose of the second buffer
1237 			 * or initiate I/O on the second buffer if an error
1238 			 * occured with this one.
1239 			 */
1240 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1241 				if (cbp->cb_buf.b_flags & B_ERROR) {
1242 					cbp->cb_mirror->cb_pflags |=
1243 					    CCDPF_MIRROR_DONE;
1244 					vn_strategy(
1245 					    cbp->cb_mirror->cb_vp,
1246 					    &cbp->cb_mirror->cb_buf.b_bio1
1247 					);
1248 					putccdbuf(cbp);
1249 					crit_exit();
1250 					return;
1251 				} else {
1252 					putccdbuf(cbp->cb_mirror);
1253 					/* fall through */
1254 				}
1255 			}
1256 		}
1257 	}
1258 
1259 	/*
1260 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1261 	 */
1262 	count = cbp->cb_buf.b_bufsize;
1263 	putccdbuf(cbp);
1264 
1265 	/*
1266 	 * If all done, "interrupt".
1267 	 */
1268 	obp->b_resid -= count;
1269 	if (obp->b_resid < 0)
1270 		panic("ccdiodone: count");
1271 	if (obp->b_resid == 0)
1272 		ccdintr(&ccd_softc[unit], obio);
1273 	crit_exit();
1274 }
1275 
1276 static int
1277 ccdioctl(struct dev_ioctl_args *ap)
1278 {
1279 	cdev_t dev = ap->a_head.a_dev;
1280 	int unit = ccdunit(dev);
1281 	int i, j, lookedup = 0, error = 0;
1282 	struct ccd_softc *cs;
1283 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1284 	struct ccddevice ccd;
1285 	struct disk_info info;
1286 	char **cpp;
1287 	struct vnode **vpp;
1288 
1289 	if (unit >= numccd)
1290 		return (ENXIO);
1291 	cs = &ccd_softc[unit];
1292 
1293 	bzero(&ccd, sizeof(ccd));
1294 
1295 	switch (ap->a_cmd) {
1296 	case CCDIOCSET:
1297 		if (cs->sc_flags & CCDF_INITED)
1298 			return (EBUSY);
1299 
1300 		if ((ap->a_fflag & FWRITE) == 0)
1301 			return (EBADF);
1302 
1303 		if ((error = ccdlock(cs)) != 0)
1304 			return (error);
1305 
1306 		if (ccio->ccio_ndisks > CCD_MAXNDISKS) {
1307 			ccdunlock(cs);
1308 			return (EINVAL);
1309 		}
1310 
1311 		/* Fill in some important bits. */
1312 		ccd.ccd_unit = unit;
1313 		ccd.ccd_interleave = ccio->ccio_ileave;
1314 		if (ccd.ccd_interleave == 0 &&
1315 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1316 		     (ccio->ccio_flags & CCDF_PARITY))) {
1317 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1318 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1319 		}
1320 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1321 		    (ccio->ccio_flags & CCDF_PARITY)) {
1322 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1323 			ccio->ccio_flags &= ~CCDF_PARITY;
1324 		}
1325 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1326 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1327 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1328 			       unit);
1329 			ccio->ccio_flags |= CCDF_UNIFORM;
1330 		}
1331 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1332 
1333 		/*
1334 		 * Allocate space for and copy in the array of
1335 		 * componet pathnames and device numbers.
1336 		 */
1337 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1338 		    M_DEVBUF, M_WAITOK);
1339 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1340 		    M_DEVBUF, M_WAITOK);
1341 
1342 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1343 				ccio->ccio_ndisks * sizeof(char **));
1344 		if (error) {
1345 			kfree(vpp, M_DEVBUF);
1346 			kfree(cpp, M_DEVBUF);
1347 			ccdunlock(cs);
1348 			return (error);
1349 		}
1350 
1351 #ifdef DEBUG
1352 		if (ccddebug & CCDB_INIT) {
1353 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1354 				kprintf("ccdioctl: component %d: 0x%x\n",
1355 				    i, cpp[i]);
1356 		}
1357 #endif
1358 
1359 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1360 #ifdef DEBUG
1361 			if (ccddebug & CCDB_INIT)
1362 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1363 #endif
1364 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1365 				for (j = 0; j < lookedup; ++j)
1366 					(void)vn_close(vpp[j], FREAD|FWRITE);
1367 				kfree(vpp, M_DEVBUF);
1368 				kfree(cpp, M_DEVBUF);
1369 				ccdunlock(cs);
1370 				return (error);
1371 			}
1372 			++lookedup;
1373 		}
1374 		ccd.ccd_cpp = cpp;
1375 		ccd.ccd_vpp = vpp;
1376 		ccd.ccd_ndev = ccio->ccio_ndisks;
1377 
1378 		/*
1379 		 * Initialize the ccd.  Fills in the softc for us.
1380 		 */
1381 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1382 			for (j = 0; j < lookedup; ++j)
1383 				(void)vn_close(vpp[j], FREAD|FWRITE);
1384 			kfree(vpp, M_DEVBUF);
1385 			kfree(cpp, M_DEVBUF);
1386 			ccdunlock(cs);
1387 			return (error);
1388 		}
1389 
1390 		/*
1391 		 * The ccd has been successfully initialized, so
1392 		 * we can place it into the array and read the disklabel.
1393 		 */
1394 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1395 		ccio->ccio_unit = unit;
1396 		ccio->ccio_size = cs->sc_size;
1397 
1398 		bzero(&info, sizeof(info));
1399 		info.d_media_blksize = cs->sc_geom.ccg_secsize;
1400 		info.d_media_blocks  = cs->sc_size;
1401 		info.d_nheads	     = cs->sc_geom.ccg_ntracks;
1402 		info.d_secpertrack   = cs->sc_geom.ccg_nsectors;
1403 		info.d_ncylinders    = cs->sc_geom.ccg_ncylinders;
1404 		info.d_secpercyl     = info.d_nheads * info.d_secpertrack;
1405 
1406 		/*
1407 		 * For cases where a label is directly applied to the ccd,
1408 		 * without slices, DSO_COMPATMBR forces one sector be
1409 		 * reserved for backwards compatibility.
1410 		 */
1411 		info.d_dsflags	     = DSO_COMPATMBR;
1412 		disk_setdiskinfo(&cs->sc_disk, &info);
1413 
1414 		ccdunlock(cs);
1415 
1416 		break;
1417 
1418 	case CCDIOCCLR:
1419 		if ((cs->sc_flags & CCDF_INITED) == 0)
1420 			return (ENXIO);
1421 
1422 		if ((ap->a_fflag & FWRITE) == 0)
1423 			return (EBADF);
1424 
1425 		if ((error = ccdlock(cs)) != 0)
1426 			return (error);
1427 
1428 		if (dev_drefs(cs->sc_dev) > 1) {
1429 			ccdunlock(cs);
1430 			return (EBUSY);
1431 		}
1432 
1433 		/*
1434 		 * Free ccd_softc information and clear entry.
1435 		 */
1436 
1437 		/* Close the components and free their pathnames. */
1438 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1439 			/*
1440 			 * XXX: this close could potentially fail and
1441 			 * cause Bad Things.  Maybe we need to force
1442 			 * the close to happen?
1443 			 */
1444 #ifdef DEBUG
1445 			if (ccddebug & CCDB_VNODE)
1446 				vprint("CCDIOCCLR: vnode info",
1447 				    cs->sc_cinfo[i].ci_vp);
1448 #endif
1449 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1450 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1451 		}
1452 
1453 		/* Free interleave index. */
1454 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1455 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1456 
1457 		/* Free component info and interleave table. */
1458 		kfree(cs->sc_cinfo, M_DEVBUF);
1459 		kfree(cs->sc_itable, M_DEVBUF);
1460 		cs->sc_cinfo = NULL;
1461 		cs->sc_itable = NULL;
1462 		cs->sc_flags &= ~CCDF_INITED;
1463 
1464 		/*
1465 		 * Free ccddevice information and clear entry.
1466 		 */
1467 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1468 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1469 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1470 
1471 		/*
1472 		 * And remove the devstat entry.
1473 		 */
1474 		devstat_remove_entry(&cs->device_stats);
1475 
1476 		/* This must be atomic. */
1477 		crit_enter();
1478 		ccdunlock(cs);
1479 		crit_exit();
1480 
1481 		break;
1482 
1483 	default:
1484 		return (ENOTTY);
1485 	}
1486 
1487 	return (0);
1488 }
1489 
/*
 * Handler for dev_dump_args requests.  Nothing is implemented for the
 * ccd device, so every call is refused with ENXIO and 'ap' is ignored.
 */
static int
ccddump(struct dev_dump_args *ap)
{
	return (ENXIO);
}
1496 
1497 /*
1498  * Lookup the provided name in the filesystem.  If the file exists,
1499  * is a valid block device, and isn't being used by anyone else,
1500  * set *vpp to the file's vnode.
1501  */
1502 static int
1503 ccdlookup(char *path, struct vnode **vpp)
1504 {
1505 	struct nlookupdata nd;
1506 	struct vnode *vp;
1507 	int error;
1508 
1509 	*vpp = NULL;
1510 
1511 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1512 	if (error)
1513 		return (error);
1514 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1515 #ifdef DEBUG
1516 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1517 			kprintf("ccdlookup: vn_open error = %d\n", error);
1518 #endif
1519 		goto done;
1520 	}
1521 	vp = nd.nl_open_vp;
1522 
1523 	if (vp->v_opencount > 1) {
1524 		error = EBUSY;
1525 		goto done;
1526 	}
1527 
1528 	if (!vn_isdisk(vp, &error))
1529 		goto done;
1530 
1531 #ifdef DEBUG
1532 	if (ccddebug & CCDB_VNODE)
1533 		vprint("ccdlookup: vnode info", vp);
1534 #endif
1535 
1536 	vn_unlock(vp);
1537 	nd.nl_open_vp = NULL;
1538 	nlookup_done(&nd);
1539 	*vpp = vp;				/* leave ref intact  */
1540 	return (0);
1541 done:
1542 	nlookup_done(&nd);
1543 	return (error);
1544 }
1545 
1546 /*
1547  * Wait interruptibly for an exclusive lock.
1548  *
1549  * XXX
1550  * Several drivers do this; it should be abstracted and made MP-safe.
1551  */
1552 static int
1553 ccdlock(struct ccd_softc *cs)
1554 {
1555 	int error;
1556 
1557 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1558 		cs->sc_flags |= CCDF_WANTED;
1559 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1560 			return (error);
1561 	}
1562 	cs->sc_flags |= CCDF_LOCKED;
1563 	return (0);
1564 }
1565 
1566 /*
1567  * Unlock and wake up any waiters.
1568  */
1569 static void
1570 ccdunlock(struct ccd_softc *cs)
1571 {
1572 
1573 	cs->sc_flags &= ~CCDF_LOCKED;
1574 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1575 		cs->sc_flags &= ~CCDF_WANTED;
1576 		wakeup(cs);
1577 	}
1578 }
1579 
#ifdef DEBUG
/*
 * Debug helper: dump an interleave table with kprintf().
 *
 * Walks the array starting at 'ii' until it reaches an entry whose
 * ii_ndisk is zero.  For each entry it prints the slot number, the disk
 * count, ii_startblk, ii_startoff and the ii_index[] values.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int slot = 0;
	int d;

	while (ii->ii_ndisk) {
		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
		       slot, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		for (d = 0; d < ii->ii_ndisk; d++)
			kprintf(" %d", ii->ii_index[d]);
		kprintf("\n");

		slot++;
		ii++;
	}
}
#endif
1595 
1596 
1597 /* Local Variables: */
1598 /* c-argdecl-indent: 8 */
1599 /* c-continued-statement-offset: 8 */
1600 /* c-indent-level: 8 */
1601 /* End: */
1602