xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision d600454b)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.22 2006/02/17 19:17:55 dillon Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <vfs/ufs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
109 #include <sys/buf2.h>
110 
111 #include <sys/ccdvar.h>
112 
113 #include <sys/thread2.h>
114 
115 #include <vm/vm_zone.h>
116 
117 #if defined(CCDDEBUG) && !defined(DEBUG)
118 #define DEBUG
119 #endif
120 
121 #ifdef DEBUG
122 #define CCDB_FOLLOW	0x01
123 #define CCDB_INIT	0x02
124 #define CCDB_IO		0x04
125 #define CCDB_LABEL	0x08
126 #define CCDB_VNODE	0x10
127 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
128     CCDB_VNODE;
129 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
130 #undef DEBUG
131 #endif
132 
133 #define	ccdunit(x)	dkunit(x)
134 #define ccdpart(x)	dkpart(x)
135 
136 /*
137    This is how mirroring works (only writes are special):
138 
139    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
140    linked together by the cb_mirror field.  "cb_pflags &
141    CCDPF_MIRROR_DONE" is set to 0 on both of them.
142 
143    When a component returns to ccdiodone(), it checks if "cb_pflags &
144    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
145    flag and returns.  If it is, it means its partner has already
146    returned, so it will go to the regular cleanup.
147 
148  */
149 
/*
 * Per-component I/O descriptor.  ccdbuffer() fills one in for each piece
 * of an original request that is handed to a component; when mirroring
 * it builds two and cross-links them through cb_mirror.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* buf/bio actually issued to the component */
	struct bio	*cb_obio;	/* bio of the original request */
	struct ccdbuf	*cb_freenext;	/* next entry on the ccdfreebufs list */
	int		cb_unit;	/* target unit (index into ccd_softc) */
	int		cb_comp;	/* target component (index into sc_cinfo) */
	int		cb_pflags;	/* mirror/parity status, CCDPF_* bits */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
159 
160 /* bits in cb_pflags */
161 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
162 
163 #define CCDLABELDEV(dev)	\
164 	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
165 
166 static d_open_t ccdopen;
167 static d_close_t ccdclose;
168 static d_strategy_t ccdstrategy;
169 static d_ioctl_t ccdioctl;
170 static d_dump_t ccddump;
171 static d_psize_t ccdsize;
172 
173 #define NCCDFREEHIWAT	16
174 
175 #define CDEV_MAJOR 74
176 
177 static struct cdevsw ccd_cdevsw = {
178 	/* name */	"ccd",
179 	/* maj */	CDEV_MAJOR,
180 	/* flags */	D_DISK,
181 	/* port */      NULL,
182 	/* clone */	NULL,
183 
184 	/* open */	ccdopen,
185 	/* close */	ccdclose,
186 	/* read */	physread,
187 	/* write */	physwrite,
188 	/* ioctl */	ccdioctl,
189 	/* poll */	nopoll,
190 	/* mmap */	nommap,
191 	/* strategy */	ccdstrategy,
192 	/* dump */	ccddump,
193 	/* psize */	ccdsize
194 };
195 
196 /* called during module initialization */
197 static	void ccdattach (void);
198 static	int ccd_modevent (module_t, int, void *);
199 
200 /* called by biodone() at interrupt time */
201 static	void ccdiodone (struct bio *bio);
202 
203 static	void ccdstart (struct ccd_softc *, struct bio *);
204 static	void ccdinterleave (struct ccd_softc *, int);
205 static	void ccdintr (struct ccd_softc *, struct bio *);
206 static	int ccdinit (struct ccddevice *, char **, struct thread *);
207 static	int ccdlookup (char *, struct thread *td, struct vnode **);
208 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
209 		struct bio *, daddr_t, caddr_t, long);
210 static	void ccdgetdisklabel (dev_t);
211 static	void ccdmakedisklabel (struct ccd_softc *);
212 static	int ccdlock (struct ccd_softc *);
213 static	void ccdunlock (struct ccd_softc *);
214 
215 #ifdef DEBUG
216 static	void printiinfo (struct ccdiinfo *);
217 #endif
218 
219 /* Non-private for the benefit of libkvm. */
220 struct	ccd_softc *ccd_softc;
221 struct	ccddevice *ccddevs;
222 struct	ccdbuf *ccdfreebufs;
223 static	int numccdfreebufs;
224 static	int numccd = 0;
225 
226 /*
227  * getccdbuf() -	Allocate and zero a ccd buffer.
228  *
229  *	This routine is called at splbio().
230  */
231 
232 static __inline
233 struct ccdbuf *
234 getccdbuf(struct ccdbuf *cpy)
235 {
236 	struct ccdbuf *cbp;
237 
238 	/*
239 	 * Allocate from freelist or malloc as necessary
240 	 */
241 	if ((cbp = ccdfreebufs) != NULL) {
242 		ccdfreebufs = cbp->cb_freenext;
243 		--numccdfreebufs;
244 		reinitbufbio(&cbp->cb_buf);
245 	} else {
246 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
247 		initbufbio(&cbp->cb_buf);
248 	}
249 
250 	/*
251 	 * Used by mirroring code
252 	 */
253 	if (cpy)
254 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
255 	else
256 		bzero(cbp, sizeof(struct ccdbuf));
257 
258 	/*
259 	 * independant struct buf initialization
260 	 */
261 	LIST_INIT(&cbp->cb_buf.b_dep);
262 	BUF_LOCKINIT(&cbp->cb_buf);
263 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
264 	BUF_KERNPROC(&cbp->cb_buf);
265 
266 	return(cbp);
267 }
268 
269 /*
270  * putccdbuf() -	Free a ccd buffer.
271  *
272  *	This routine is called at splbio().
273  */
274 
275 static __inline
276 void
277 putccdbuf(struct ccdbuf *cbp)
278 {
279 	BUF_UNLOCK(&cbp->cb_buf);
280 	BUF_LOCKFREE(&cbp->cb_buf);
281 
282 	if (numccdfreebufs < NCCDFREEHIWAT) {
283 		cbp->cb_freenext = ccdfreebufs;
284 		ccdfreebufs = cbp;
285 		++numccdfreebufs;
286 	} else {
287 		free((caddr_t)cbp, M_DEVBUF);
288 	}
289 }
290 
291 
292 /*
293  * Number of blocks to leave untouched in front of a component partition.
294  * This is to avoid violating its disklabel area when it starts at the
295  * beginning of the slice.
296  */
297 #if !defined(CCD_OFFSET)
298 #define CCD_OFFSET 16
299 #endif
300 
301 /*
302  * Called by main() during pseudo-device attachment.  All we need
303  * to do is allocate enough space for devices to be configured later, and
304  * add devsw entries.
305  */
306 static void
307 ccdattach(void)
308 {
309 	int i;
310 	int num = NCCD;
311 
312 	if (num > 1)
313 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
314 	else
315 		printf("ccd0: Concatenated disk driver\n");
316 
317 	ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
318 			    M_WAITOK | M_ZERO);
319 	ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
320 			    M_WAITOK | M_ZERO);
321 	numccd = num;
322 
323 	cdevsw_add(&ccd_cdevsw, 0, 0);
324 	/* XXX: is this necessary? */
325 	for (i = 0; i < numccd; ++i)
326 		ccddevs[i].ccd_dk = -1;
327 }
328 
329 static int
330 ccd_modevent(module_t mod, int type, void *data)
331 {
332 	int error = 0;
333 
334 	switch (type) {
335 	case MOD_LOAD:
336 		ccdattach();
337 		break;
338 
339 	case MOD_UNLOAD:
340 		printf("ccd0: Unload not supported!\n");
341 		error = EOPNOTSUPP;
342 		break;
343 
344 	default:	/* MOD_SHUTDOWN etc */
345 		break;
346 	}
347 	return (error);
348 }
349 
350 DEV_MODULE(ccd, ccd_modevent, NULL);
351 
352 static int
353 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
354 {
355 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
356 	struct ccdcinfo *ci = NULL;	/* XXX */
357 	size_t size;
358 	int ix;
359 	struct vnode *vp;
360 	size_t minsize;
361 	int maxsecsize;
362 	struct partinfo dpart;
363 	struct ccdgeom *ccg = &cs->sc_geom;
364 	char tmppath[MAXPATHLEN];
365 	int error = 0;
366 	struct ucred *cred;
367 
368 	KKASSERT(td->td_proc);
369 	cred = td->td_proc->p_ucred;
370 
371 #ifdef DEBUG
372 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
373 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
374 #endif
375 
376 	cs->sc_size = 0;
377 	cs->sc_ileave = ccd->ccd_interleave;
378 	cs->sc_nccdisks = ccd->ccd_ndev;
379 
380 	/* Allocate space for the component info. */
381 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
382 	    M_DEVBUF, M_WAITOK);
383 
384 	/*
385 	 * Verify that each component piece exists and record
386 	 * relevant information about it.
387 	 */
388 	maxsecsize = 0;
389 	minsize = 0;
390 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
391 		vp = ccd->ccd_vpp[ix];
392 		ci = &cs->sc_cinfo[ix];
393 		ci->ci_vp = vp;
394 
395 		/*
396 		 * Copy in the pathname of the component.
397 		 */
398 		bzero(tmppath, sizeof(tmppath));	/* sanity */
399 		if ((error = copyinstr(cpaths[ix], tmppath,
400 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
401 #ifdef DEBUG
402 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
403 				printf("ccd%d: can't copy path, error = %d\n",
404 				    ccd->ccd_unit, error);
405 #endif
406 			goto fail;
407 		}
408 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
409 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
410 
411 		ci->ci_dev = vn_todev(vp);
412 
413 		/*
414 		 * Get partition information for the component.
415 		 */
416 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
417 		    FREAD, cred, td)) != 0) {
418 #ifdef DEBUG
419 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
420 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
421 				     ccd->ccd_unit, ci->ci_path, error);
422 #endif
423 			goto fail;
424 		}
425 		if (dpart.part->p_fstype == FS_BSDFFS) {
426 			maxsecsize =
427 			    ((dpart.disklab->d_secsize > maxsecsize) ?
428 			    dpart.disklab->d_secsize : maxsecsize);
429 			size = dpart.part->p_size - CCD_OFFSET;
430 		} else {
431 #ifdef DEBUG
432 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
433 				printf("ccd%d: %s: incorrect partition type\n",
434 				    ccd->ccd_unit, ci->ci_path);
435 #endif
436 			error = EFTYPE;
437 			goto fail;
438 		}
439 
440 		/*
441 		 * Calculate the size, truncating to an interleave
442 		 * boundary if necessary.
443 		 */
444 
445 		if (cs->sc_ileave > 1)
446 			size -= size % cs->sc_ileave;
447 
448 		if (size == 0) {
449 #ifdef DEBUG
450 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
451 				printf("ccd%d: %s: size == 0\n",
452 				    ccd->ccd_unit, ci->ci_path);
453 #endif
454 			error = ENODEV;
455 			goto fail;
456 		}
457 
458 		if (minsize == 0 || size < minsize)
459 			minsize = size;
460 		ci->ci_size = size;
461 		cs->sc_size += size;
462 	}
463 
464 	/*
465 	 * Don't allow the interleave to be smaller than
466 	 * the biggest component sector.
467 	 */
468 	if ((cs->sc_ileave > 0) &&
469 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
470 #ifdef DEBUG
471 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
472 			printf("ccd%d: interleave must be at least %d\n",
473 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
474 #endif
475 		error = EINVAL;
476 		goto fail;
477 	}
478 
479 	/*
480 	 * If uniform interleave is desired set all sizes to that of
481 	 * the smallest component.  This will guarentee that a single
482 	 * interleave table is generated.
483 	 *
484 	 * Lost space must be taken into account when calculating the
485 	 * overall size.  Half the space is lost when CCDF_MIRROR is
486 	 * specified.  One disk is lost when CCDF_PARITY is specified.
487 	 */
488 	if (ccd->ccd_flags & CCDF_UNIFORM) {
489 		for (ci = cs->sc_cinfo;
490 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
491 			ci->ci_size = minsize;
492 		}
493 		if (ccd->ccd_flags & CCDF_MIRROR) {
494 			/*
495 			 * Check to see if an even number of components
496 			 * have been specified.  The interleave must also
497 			 * be non-zero in order for us to be able to
498 			 * guarentee the topology.
499 			 */
500 			if (cs->sc_nccdisks % 2) {
501 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
502 				error = EINVAL;
503 				goto fail;
504 			}
505 			if (cs->sc_ileave == 0) {
506 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
507 				error = EINVAL;
508 				goto fail;
509 			}
510 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
511 		} else if (ccd->ccd_flags & CCDF_PARITY) {
512 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
513 		} else {
514 			if (cs->sc_ileave == 0) {
515 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
516 				error = EINVAL;
517 				goto fail;
518 			}
519 			cs->sc_size = cs->sc_nccdisks * minsize;
520 		}
521 	}
522 
523 	/*
524 	 * Construct the interleave table.
525 	 */
526 	ccdinterleave(cs, ccd->ccd_unit);
527 
528 	/*
529 	 * Create pseudo-geometry based on 1MB cylinders.  It's
530 	 * pretty close.
531 	 */
532 	ccg->ccg_secsize = maxsecsize;
533 	ccg->ccg_ntracks = 1;
534 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
535 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
536 
537 	/*
538 	 * Add an devstat entry for this device.
539 	 */
540 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
541 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
542 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
543 			  DEVSTAT_PRIORITY_ARRAY);
544 
545 	cs->sc_flags |= CCDF_INITED;
546 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
547 	cs->sc_unit = ccd->ccd_unit;
548 	return (0);
549 fail:
550 	while (ci > cs->sc_cinfo) {
551 		ci--;
552 		free(ci->ci_path, M_DEVBUF);
553 	}
554 	free(cs->sc_cinfo, M_DEVBUF);
555 	return (error);
556 }
557 
558 static void
559 ccdinterleave(struct ccd_softc *cs, int unit)
560 {
561 	struct ccdcinfo *ci, *smallci;
562 	struct ccdiinfo *ii;
563 	daddr_t bn, lbn;
564 	int ix;
565 	u_long size;
566 
567 #ifdef DEBUG
568 	if (ccddebug & CCDB_INIT)
569 		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
570 #endif
571 
572 	/*
573 	 * Allocate an interleave table.  The worst case occurs when each
574 	 * of N disks is of a different size, resulting in N interleave
575 	 * tables.
576 	 *
577 	 * Chances are this is too big, but we don't care.
578 	 */
579 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
580 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
581 	bzero((caddr_t)cs->sc_itable, size);
582 
583 	/*
584 	 * Trivial case: no interleave (actually interleave of disk size).
585 	 * Each table entry represents a single component in its entirety.
586 	 *
587 	 * An interleave of 0 may not be used with a mirror or parity setup.
588 	 */
589 	if (cs->sc_ileave == 0) {
590 		bn = 0;
591 		ii = cs->sc_itable;
592 
593 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
594 			/* Allocate space for ii_index. */
595 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
596 			ii->ii_ndisk = 1;
597 			ii->ii_startblk = bn;
598 			ii->ii_startoff = 0;
599 			ii->ii_index[0] = ix;
600 			bn += cs->sc_cinfo[ix].ci_size;
601 			ii++;
602 		}
603 		ii->ii_ndisk = 0;
604 #ifdef DEBUG
605 		if (ccddebug & CCDB_INIT)
606 			printiinfo(cs->sc_itable);
607 #endif
608 		return;
609 	}
610 
611 	/*
612 	 * The following isn't fast or pretty; it doesn't have to be.
613 	 */
614 	size = 0;
615 	bn = lbn = 0;
616 	for (ii = cs->sc_itable; ; ii++) {
617 		/*
618 		 * Allocate space for ii_index.  We might allocate more then
619 		 * we use.
620 		 */
621 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
622 		    M_DEVBUF, M_WAITOK);
623 
624 		/*
625 		 * Locate the smallest of the remaining components
626 		 */
627 		smallci = NULL;
628 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
629 		    ci++) {
630 			if (ci->ci_size > size &&
631 			    (smallci == NULL ||
632 			     ci->ci_size < smallci->ci_size)) {
633 				smallci = ci;
634 			}
635 		}
636 
637 		/*
638 		 * Nobody left, all done
639 		 */
640 		if (smallci == NULL) {
641 			ii->ii_ndisk = 0;
642 			break;
643 		}
644 
645 		/*
646 		 * Record starting logical block using an sc_ileave blocksize.
647 		 */
648 		ii->ii_startblk = bn / cs->sc_ileave;
649 
650 		/*
651 		 * Record starting comopnent block using an sc_ileave
652 		 * blocksize.  This value is relative to the beginning of
653 		 * a component disk.
654 		 */
655 		ii->ii_startoff = lbn;
656 
657 		/*
658 		 * Determine how many disks take part in this interleave
659 		 * and record their indices.
660 		 */
661 		ix = 0;
662 		for (ci = cs->sc_cinfo;
663 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
664 			if (ci->ci_size >= smallci->ci_size) {
665 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
666 			}
667 		}
668 		ii->ii_ndisk = ix;
669 		bn += ix * (smallci->ci_size - size);
670 		lbn = smallci->ci_size / cs->sc_ileave;
671 		size = smallci->ci_size;
672 	}
673 #ifdef DEBUG
674 	if (ccddebug & CCDB_INIT)
675 		printiinfo(cs->sc_itable);
676 #endif
677 }
678 
679 /* ARGSUSED */
680 static int
681 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
682 {
683 	int unit = ccdunit(dev);
684 	struct ccd_softc *cs;
685 	struct disklabel *lp;
686 	int error = 0, part, pmask;
687 
688 #ifdef DEBUG
689 	if (ccddebug & CCDB_FOLLOW)
690 		printf("ccdopen(%x, %x)\n", dev, flags);
691 #endif
692 	if (unit >= numccd)
693 		return (ENXIO);
694 	cs = &ccd_softc[unit];
695 
696 	if ((error = ccdlock(cs)) != 0)
697 		return (error);
698 
699 	lp = &cs->sc_label;
700 
701 	part = ccdpart(dev);
702 	pmask = (1 << part);
703 
704 	/*
705 	 * If we're initialized, check to see if there are any other
706 	 * open partitions.  If not, then it's safe to update
707 	 * the in-core disklabel.
708 	 */
709 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
710 		ccdgetdisklabel(dev);
711 
712 	/* Check that the partition exists. */
713 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
714 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
715 		error = ENXIO;
716 		goto done;
717 	}
718 
719 	cs->sc_openmask |= pmask;
720  done:
721 	ccdunlock(cs);
722 	return (0);
723 }
724 
725 /* ARGSUSED */
726 static int
727 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
728 {
729 	int unit = ccdunit(dev);
730 	struct ccd_softc *cs;
731 	int error = 0, part;
732 
733 #ifdef DEBUG
734 	if (ccddebug & CCDB_FOLLOW)
735 		printf("ccdclose(%x, %x)\n", dev, flags);
736 #endif
737 
738 	if (unit >= numccd)
739 		return (ENXIO);
740 	cs = &ccd_softc[unit];
741 
742 	if ((error = ccdlock(cs)) != 0)
743 		return (error);
744 
745 	part = ccdpart(dev);
746 
747 	/* ...that much closer to allowing unconfiguration... */
748 	cs->sc_openmask &= ~(1 << part);
749 	ccdunlock(cs);
750 	return (0);
751 }
752 
753 static void
754 ccdstrategy(dev_t dev, struct bio *bio)
755 {
756 	int unit = ccdunit(dev);
757 	struct bio *nbio;
758 	struct buf *bp = bio->bio_buf;
759 	struct ccd_softc *cs = &ccd_softc[unit];
760 	int wlabel;
761 	struct disklabel *lp;
762 
763 #ifdef DEBUG
764 	if (ccddebug & CCDB_FOLLOW)
765 		printf("ccdstrategy(%x): unit %d\n", bp, unit);
766 #endif
767 	if ((cs->sc_flags & CCDF_INITED) == 0) {
768 		bp->b_error = ENXIO;
769 		bp->b_flags |= B_ERROR;
770 		goto done;
771 	}
772 
773 	/* If it's a nil transfer, wake up the top half now. */
774 	if (bp->b_bcount == 0)
775 		goto done;
776 
777 	lp = &cs->sc_label;
778 
779 	/*
780 	 * Do bounds checking and adjust transfer.  If there's an
781 	 * error, the bounds check will flag that for us.
782 	 */
783 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
784 	if (ccdpart(dev) != RAW_PART) {
785 		nbio = bounds_check_with_label(dev, bio, lp, wlabel);
786 		if (nbio == NULL)
787 			goto done;
788 	} else {
789 		int pbn;        /* in sc_secsize chunks */
790 		long sz;        /* in sc_secsize chunks */
791 
792 		pbn = bio->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
793 		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
794 
795 		/*
796 		 * If out of bounds return an error. If at the EOF point,
797 		 * simply read or write less.
798 		 */
799 
800 		if (pbn < 0 || pbn >= cs->sc_size) {
801 			bp->b_resid = bp->b_bcount;
802 			if (pbn != cs->sc_size) {
803 				bp->b_error = EINVAL;
804 				bp->b_flags |= B_ERROR | B_INVAL;
805 			}
806 			goto done;
807 		}
808 
809 		/*
810 		 * If the request crosses EOF, truncate the request.
811 		 */
812 		if (pbn + sz > cs->sc_size) {
813 			bp->b_bcount = (cs->sc_size - pbn) *
814 			    cs->sc_geom.ccg_secsize;
815 		}
816 		nbio = bio;
817 	}
818 
819 	bp->b_resid = bp->b_bcount;
820 	nbio->bio_driver_info = dev;
821 
822 	/*
823 	 * "Start" the unit.
824 	 */
825 	crit_enter();
826 	ccdstart(cs, nbio);
827 	crit_exit();
828 	return;
829 
830 	/*
831 	 * note: bio, not nbio, is valid at the done label.
832 	 */
833 done:
834 	biodone(bio);
835 }
836 
837 static void
838 ccdstart(struct ccd_softc *cs, struct bio *bio)
839 {
840 	long bcount, rcount;
841 	struct ccdbuf *cbp[4];
842 	struct buf *bp = bio->bio_buf;
843 	dev_t dev = bio->bio_driver_info;
844 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
845 	caddr_t addr;
846 	daddr_t bn;
847 	struct partition *pp;
848 
849 #ifdef DEBUG
850 	if (ccddebug & CCDB_FOLLOW)
851 		printf("ccdstart(%x, %x)\n", cs, bp);
852 #endif
853 
854 	/* Record the transaction start  */
855 	devstat_start_transaction(&cs->device_stats);
856 
857 	/*
858 	 * Translate the partition-relative block number to an absolute.
859 	 */
860 	bn = bio->bio_blkno;
861 	if (ccdpart(dev) != RAW_PART) {
862 		pp = &cs->sc_label.d_partitions[ccdpart(dev)];
863 		bn += pp->p_offset;
864 	}
865 
866 	/*
867 	 * Allocate component buffers and fire off the requests
868 	 */
869 	addr = bp->b_data;
870 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
871 		ccdbuffer(cbp, cs, bio, bn, addr, bcount);
872 		rcount = cbp[0]->cb_buf.b_bcount;
873 
874 		if (cs->sc_cflags & CCDF_MIRROR) {
875 			/*
876 			 * Mirroring.  Writes go to both disks, reads are
877 			 * taken from whichever disk seems most appropriate.
878 			 *
879 			 * We attempt to localize reads to the disk whos arm
880 			 * is nearest the read request.  We ignore seeks due
881 			 * to writes when making this determination and we
882 			 * also try to avoid hogging.
883 			 */
884 			if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
885 				vn_strategy(cbp[0]->cb_buf.b_vp,
886 				    &cbp[0]->cb_buf.b_bio1);
887 				vn_strategy(cbp[1]->cb_buf.b_vp,
888 				    &cbp[1]->cb_buf.b_bio1);
889 			} else {
890 				int pick = cs->sc_pick;
891 				daddr_t range = cs->sc_size / 16;
892 
893 				if (bn < cs->sc_blk[pick] - range ||
894 				    bn > cs->sc_blk[pick] + range
895 				) {
896 					cs->sc_pick = pick = 1 - pick;
897 				}
898 				cs->sc_blk[pick] = bn + btodb(rcount);
899 				vn_strategy(cbp[pick]->cb_buf.b_vp,
900 				    &cbp[pick]->cb_buf.b_bio1);
901 			}
902 		} else {
903 			/*
904 			 * Not mirroring
905 			 */
906 			vn_strategy(cbp[0]->cb_buf.b_vp,
907 				     &cbp[0]->cb_buf.b_bio1);
908 		}
909 		bn += btodb(rcount);
910 		addr += rcount;
911 	}
912 }
913 
914 /*
915  * Build a component buffer header.
916  */
917 static void
918 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio, daddr_t bn,
919 	  caddr_t addr, long bcount)
920 {
921 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
922 	struct ccdbuf *cbp;
923 	daddr_t cbn, cboff;
924 	off_t cbc;
925 
926 #ifdef DEBUG
927 	if (ccddebug & CCDB_IO)
928 		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
929 		       cs, bp, bn, addr, bcount);
930 #endif
931 	/*
932 	 * Determine which component bn falls in.
933 	 */
934 	cbn = bn;
935 	cboff = 0;
936 
937 	if (cs->sc_ileave == 0) {
938 		/*
939 		 * Serially concatenated and neither a mirror nor a parity
940 		 * config.  This is a special case.
941 		 */
942 		daddr_t sblk;
943 
944 		sblk = 0;
945 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
946 			sblk += ci->ci_size;
947 		cbn -= sblk;
948 	} else {
949 		struct ccdiinfo *ii;
950 		int ccdisk, off;
951 
952 		/*
953 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
954 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
955 		 * to cbn.
956 		 */
957 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
958 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
959 
960 		/*
961 		 * Figure out which interleave table to use.
962 		 */
963 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
964 			if (ii->ii_startblk > cbn)
965 				break;
966 		}
967 		ii--;
968 
969 		/*
970 		 * off is the logical superblock relative to the beginning
971 		 * of this interleave block.
972 		 */
973 		off = cbn - ii->ii_startblk;
974 
975 		/*
976 		 * We must calculate which disk component to use (ccdisk),
977 		 * and recalculate cbn to be the superblock relative to
978 		 * the beginning of the component.  This is typically done by
979 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
980 		 * must typically be divided by the number of components in
981 		 * this interleave array to be properly convert it from a
982 		 * CCD-relative logical superblock number to a
983 		 * component-relative superblock number.
984 		 */
985 		if (ii->ii_ndisk == 1) {
986 			/*
987 			 * When we have just one disk, it can't be a mirror
988 			 * or a parity config.
989 			 */
990 			ccdisk = ii->ii_index[0];
991 			cbn = ii->ii_startoff + off;
992 		} else {
993 			if (cs->sc_cflags & CCDF_MIRROR) {
994 				/*
995 				 * We have forced a uniform mapping, resulting
996 				 * in a single interleave array.  We double
997 				 * up on the first half of the available
998 				 * components and our mirror is in the second
999 				 * half.  This only works with a single
1000 				 * interleave array because doubling up
1001 				 * doubles the number of sectors, so there
1002 				 * cannot be another interleave array because
1003 				 * the next interleave array's calculations
1004 				 * would be off.
1005 				 */
1006 				int ndisk2 = ii->ii_ndisk / 2;
1007 				ccdisk = ii->ii_index[off % ndisk2];
1008 				cbn = ii->ii_startoff + off / ndisk2;
1009 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1010 			} else if (cs->sc_cflags & CCDF_PARITY) {
1011 				/*
1012 				 * XXX not implemented yet
1013 				 */
1014 				int ndisk2 = ii->ii_ndisk - 1;
1015 				ccdisk = ii->ii_index[off % ndisk2];
1016 				cbn = ii->ii_startoff + off / ndisk2;
1017 				if (cbn % ii->ii_ndisk <= ccdisk)
1018 					ccdisk++;
1019 			} else {
1020 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1021 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1022 			}
1023 		}
1024 
1025 		ci = &cs->sc_cinfo[ccdisk];
1026 
1027 		/*
1028 		 * Convert cbn from a superblock to a normal block so it
1029 		 * can be used to calculate (along with cboff) the normal
1030 		 * block index into this particular disk.
1031 		 */
1032 		cbn *= cs->sc_ileave;
1033 	}
1034 
1035 	/*
1036 	 * Fill in the component buf structure.
1037 	 */
1038 	cbp = getccdbuf(NULL);
1039 	cbp->cb_buf.b_flags = bio->bio_buf->b_flags;
1040 	/*cbp->cb_buf.b_dev = ci->ci_dev; */
1041 	cbp->cb_buf.b_data = addr;
1042 	cbp->cb_buf.b_vp = ci->ci_vp;
1043 	if (cs->sc_ileave == 0)
1044               cbc = dbtob((off_t)(ci->ci_size - cbn));
1045 	else
1046               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1047 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1048  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1049 
1050 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1051 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1052 	cbp->cb_buf.b_bio1.bio_blkno = cbn + cboff + CCD_OFFSET;
1053 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1054 
1055 	/*
1056 	 * context for ccdiodone
1057 	 */
1058 	cbp->cb_obio = bio;
1059 	cbp->cb_unit = cs - ccd_softc;
1060 	cbp->cb_comp = ci - cs->sc_cinfo;
1061 
1062 #ifdef DEBUG
1063 	if (ccddebug & CCDB_IO)
1064 		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1065 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1066 		       cbp->cb_buf.b_bio1.bio_blkno,
1067 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1068 #endif
1069 	cb[0] = cbp;
1070 
1071 	/*
1072 	 * Note: both I/O's setup when reading from mirror, but only one
1073 	 * will be executed.
1074 	 */
1075 	if (cs->sc_cflags & CCDF_MIRROR) {
1076 		/* mirror, setup second I/O */
1077 		cbp = getccdbuf(cb[0]);
1078 		/* cbp->cb_buf.b_dev = ci2->ci_dev; */
1079 		cbp->cb_buf.b_vp = ci2->ci_vp;
1080 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1081 		cb[1] = cbp;
1082 		/* link together the ccdbuf's and clear "mirror done" flag */
1083 		cb[0]->cb_mirror = cb[1];
1084 		cb[1]->cb_mirror = cb[0];
1085 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1086 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1087 	}
1088 }
1089 
1090 static void
1091 ccdintr(struct ccd_softc *cs, struct bio *bio)
1092 {
1093 	struct buf *bp = bio->bio_buf;
1094 
1095 #ifdef DEBUG
1096 	if (ccddebug & CCDB_FOLLOW)
1097 		printf("ccdintr(%x, %x)\n", cs, bp);
1098 #endif
1099 	/*
1100 	 * Request is done for better or worse, wakeup the top half.
1101 	 */
1102 	if (bp->b_flags & B_ERROR)
1103 		bp->b_resid = bp->b_bcount;
1104 	devstat_end_transaction_buf(&cs->device_stats, bp);
1105 	biodone(bio);
1106 }
1107 
1108 /*
1109  * Called at interrupt time.
1110  * Mark the component as done and if all components are done,
1111  * take a ccd interrupt.
1112  */
1113 static void
1114 ccdiodone(struct bio *bio)
1115 {
1116 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1117 	struct bio *obio = cbp->cb_obio;
1118 	struct buf *obp = obio->bio_buf;
1119 	int unit = cbp->cb_unit;
1120 	int count;
1121 
1122 	/*
1123 	 * Since we do not have exclusive access to underlying devices,
1124 	 * we can't keep cache translations around.
1125 	 */
1126 	clearbiocache(bio->bio_next);
1127 
1128 	crit_enter();
1129 #ifdef DEBUG
1130 	if (ccddebug & CCDB_FOLLOW)
1131 		printf("ccdiodone(%x)\n", cbp);
1132 	if (ccddebug & CCDB_IO) {
1133 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1134 		       obp, obp->b_bcount, obp->b_resid);
1135 		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1136 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1137 		       cbp->cb_buf.b_lblkno, cbp->cb_buf.b_data,
1138 		       cbp->cb_buf.b_bcount);
1139 	}
1140 #endif
1141 	/*
1142 	 * If an error occured, report it.  If this is a mirrored
1143 	 * configuration and the first of two possible reads, do not
1144 	 * set the error in the bp yet because the second read may
1145 	 * succeed.
1146 	 */
1147 	if (cbp->cb_buf.b_flags & B_ERROR) {
1148 		const char *msg = "";
1149 
1150 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1151 		    (cbp->cb_buf.b_flags & B_READ) &&
1152 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1153 			/*
1154 			 * We will try our read on the other disk down
1155 			 * below, also reverse the default pick so if we
1156 			 * are doing a scan we do not keep hitting the
1157 			 * bad disk first.
1158 			 */
1159 			struct ccd_softc *cs = &ccd_softc[unit];
1160 
1161 			msg = ", trying other disk";
1162 			cs->sc_pick = 1 - cs->sc_pick;
1163 			cs->sc_blk[cs->sc_pick] = obio->bio_blkno;
1164 		} else {
1165 			obp->b_flags |= B_ERROR;
1166 			obp->b_error = cbp->cb_buf.b_error ?
1167 			    cbp->cb_buf.b_error : EIO;
1168 		}
1169 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1170 		       unit, obp->b_error, cbp->cb_comp,
1171 		       (int)cbp->cb_buf.b_bio2.bio_blkno,
1172 		       obio->bio_blkno, msg);
1173 	}
1174 
1175 	/*
1176 	 * Process mirror.  If we are writing, I/O has been initiated on both
1177 	 * buffers and we fall through only after both are finished.
1178 	 *
1179 	 * If we are reading only one I/O is initiated at a time.  If an
1180 	 * error occurs we initiate the second I/O and return, otherwise
1181 	 * we free the second I/O without initiating it.
1182 	 */
1183 
1184 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1185 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1186 			/*
1187 			 * When writing, handshake with the second buffer
1188 			 * to determine when both are done.  If both are not
1189 			 * done, return here.
1190 			 */
1191 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1192 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1193 				putccdbuf(cbp);
1194 				crit_exit();
1195 				return;
1196 			}
1197 		} else {
1198 			/*
1199 			 * When reading, either dispose of the second buffer
1200 			 * or initiate I/O on the second buffer if an error
1201 			 * occured with this one.
1202 			 */
1203 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1204 				if (cbp->cb_buf.b_flags & B_ERROR) {
1205 					cbp->cb_mirror->cb_pflags |=
1206 					    CCDPF_MIRROR_DONE;
1207 					vn_strategy(
1208 					    cbp->cb_mirror->cb_buf.b_vp,
1209 					    &cbp->cb_mirror->cb_buf.b_bio1
1210 					);
1211 					putccdbuf(cbp);
1212 					crit_exit();
1213 					return;
1214 				} else {
1215 					putccdbuf(cbp->cb_mirror);
1216 					/* fall through */
1217 				}
1218 			}
1219 		}
1220 	}
1221 
1222 	/*
1223 	 * use b_bufsize to determine how big the original request was rather
1224 	 * then b_bcount, because b_bcount may have been truncated for EOF.
1225 	 *
1226 	 * XXX We check for an error, but we do not test the resid for an
1227 	 * aligned EOF condition.  This may result in character & block
1228 	 * device access not recognizing EOF properly when read or written
1229 	 * sequentially, but will not effect filesystems.
1230 	 */
1231 	count = cbp->cb_buf.b_bufsize;
1232 	putccdbuf(cbp);
1233 
1234 	/*
1235 	 * If all done, "interrupt".
1236 	 */
1237 	obp->b_resid -= count;
1238 	if (obp->b_resid < 0)
1239 		panic("ccdiodone: count");
1240 	if (obp->b_resid == 0)
1241 		ccdintr(&ccd_softc[unit], obio);
1242 	crit_exit();
1243 }
1244 
1245 static int
1246 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1247 {
1248 	int unit = ccdunit(dev);
1249 	int i, j, lookedup = 0, error = 0;
1250 	int part, pmask;
1251 	struct ccd_softc *cs;
1252 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1253 	struct ccddevice ccd;
1254 	char **cpp;
1255 	struct vnode **vpp;
1256 	struct ucred *cred;
1257 
1258 	KKASSERT(td->td_proc != NULL);
1259 	cred = td->td_proc->p_ucred;
1260 
1261 	if (unit >= numccd)
1262 		return (ENXIO);
1263 	cs = &ccd_softc[unit];
1264 
1265 	bzero(&ccd, sizeof(ccd));
1266 
1267 	switch (cmd) {
1268 	case CCDIOCSET:
1269 		if (cs->sc_flags & CCDF_INITED)
1270 			return (EBUSY);
1271 
1272 		if ((flag & FWRITE) == 0)
1273 			return (EBADF);
1274 
1275 		if ((error = ccdlock(cs)) != 0)
1276 			return (error);
1277 
1278 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1279 			return (EINVAL);
1280 
1281 		/* Fill in some important bits. */
1282 		ccd.ccd_unit = unit;
1283 		ccd.ccd_interleave = ccio->ccio_ileave;
1284 		if (ccd.ccd_interleave == 0 &&
1285 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1286 		     (ccio->ccio_flags & CCDF_PARITY))) {
1287 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1288 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1289 		}
1290 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1291 		    (ccio->ccio_flags & CCDF_PARITY)) {
1292 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1293 			ccio->ccio_flags &= ~CCDF_PARITY;
1294 		}
1295 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1296 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1297 			printf("ccd%d: mirror/parity forces uniform flag\n",
1298 			       unit);
1299 			ccio->ccio_flags |= CCDF_UNIFORM;
1300 		}
1301 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1302 
1303 		/*
1304 		 * Allocate space for and copy in the array of
1305 		 * componet pathnames and device numbers.
1306 		 */
1307 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1308 		    M_DEVBUF, M_WAITOK);
1309 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1310 		    M_DEVBUF, M_WAITOK);
1311 
1312 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1313 		    ccio->ccio_ndisks * sizeof(char **));
1314 		if (error) {
1315 			free(vpp, M_DEVBUF);
1316 			free(cpp, M_DEVBUF);
1317 			ccdunlock(cs);
1318 			return (error);
1319 		}
1320 
1321 #ifdef DEBUG
1322 		if (ccddebug & CCDB_INIT)
1323 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1324 				printf("ccdioctl: component %d: 0x%x\n",
1325 				    i, cpp[i]);
1326 #endif
1327 
1328 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1329 #ifdef DEBUG
1330 			if (ccddebug & CCDB_INIT)
1331 				printf("ccdioctl: lookedup = %d\n", lookedup);
1332 #endif
1333 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1334 				for (j = 0; j < lookedup; ++j)
1335 					(void)vn_close(vpp[j], FREAD|FWRITE, td);
1336 				free(vpp, M_DEVBUF);
1337 				free(cpp, M_DEVBUF);
1338 				ccdunlock(cs);
1339 				return (error);
1340 			}
1341 			++lookedup;
1342 		}
1343 		ccd.ccd_cpp = cpp;
1344 		ccd.ccd_vpp = vpp;
1345 		ccd.ccd_ndev = ccio->ccio_ndisks;
1346 
1347 		/*
1348 		 * Initialize the ccd.  Fills in the softc for us.
1349 		 */
1350 		if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1351 			for (j = 0; j < lookedup; ++j)
1352 				(void)vn_close(vpp[j], FREAD|FWRITE, td);
1353 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1354 			free(vpp, M_DEVBUF);
1355 			free(cpp, M_DEVBUF);
1356 			ccdunlock(cs);
1357 			return (error);
1358 		}
1359 
1360 		/*
1361 		 * The ccd has been successfully initialized, so
1362 		 * we can place it into the array and read the disklabel.
1363 		 */
1364 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1365 		ccio->ccio_unit = unit;
1366 		ccio->ccio_size = cs->sc_size;
1367 		ccdgetdisklabel(dev);
1368 
1369 		ccdunlock(cs);
1370 
1371 		break;
1372 
1373 	case CCDIOCCLR:
1374 		if ((cs->sc_flags & CCDF_INITED) == 0)
1375 			return (ENXIO);
1376 
1377 		if ((flag & FWRITE) == 0)
1378 			return (EBADF);
1379 
1380 		if ((error = ccdlock(cs)) != 0)
1381 			return (error);
1382 
1383 		/* Don't unconfigure if any other partitions are open */
1384 		part = ccdpart(dev);
1385 		pmask = (1 << part);
1386 		if ((cs->sc_openmask & ~pmask)) {
1387 			ccdunlock(cs);
1388 			return (EBUSY);
1389 		}
1390 
1391 		/*
1392 		 * Free ccd_softc information and clear entry.
1393 		 */
1394 
1395 		/* Close the components and free their pathnames. */
1396 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1397 			/*
1398 			 * XXX: this close could potentially fail and
1399 			 * cause Bad Things.  Maybe we need to force
1400 			 * the close to happen?
1401 			 */
1402 #ifdef DEBUG
1403 			if (ccddebug & CCDB_VNODE)
1404 				vprint("CCDIOCCLR: vnode info",
1405 				    cs->sc_cinfo[i].ci_vp);
1406 #endif
1407 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1408 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1409 		}
1410 
1411 		/* Free interleave index. */
1412 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1413 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1414 
1415 		/* Free component info and interleave table. */
1416 		free(cs->sc_cinfo, M_DEVBUF);
1417 		free(cs->sc_itable, M_DEVBUF);
1418 		cs->sc_flags &= ~CCDF_INITED;
1419 
1420 		/*
1421 		 * Free ccddevice information and clear entry.
1422 		 */
1423 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1424 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1425 		ccd.ccd_dk = -1;
1426 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1427 
1428 		/*
1429 		 * And remove the devstat entry.
1430 		 */
1431 		devstat_remove_entry(&cs->device_stats);
1432 
1433 		/* This must be atomic. */
1434 		crit_enter();
1435 		ccdunlock(cs);
1436 		bzero(cs, sizeof(struct ccd_softc));
1437 		crit_exit();
1438 
1439 		break;
1440 
1441 	case DIOCGDINFO:
1442 		if ((cs->sc_flags & CCDF_INITED) == 0)
1443 			return (ENXIO);
1444 
1445 		*(struct disklabel *)data = cs->sc_label;
1446 		break;
1447 
1448 	case DIOCGPART:
1449 		if ((cs->sc_flags & CCDF_INITED) == 0)
1450 			return (ENXIO);
1451 
1452 		((struct partinfo *)data)->disklab = &cs->sc_label;
1453 		((struct partinfo *)data)->part =
1454 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1455 		break;
1456 
1457 	case DIOCWDINFO:
1458 	case DIOCSDINFO:
1459 		if ((cs->sc_flags & CCDF_INITED) == 0)
1460 			return (ENXIO);
1461 
1462 		if ((flag & FWRITE) == 0)
1463 			return (EBADF);
1464 
1465 		if ((error = ccdlock(cs)) != 0)
1466 			return (error);
1467 
1468 		cs->sc_flags |= CCDF_LABELLING;
1469 
1470 		error = setdisklabel(&cs->sc_label,
1471 		    (struct disklabel *)data, 0);
1472 		if (error == 0) {
1473 			if (cmd == DIOCWDINFO) {
1474 				dev_t cdev = CCDLABELDEV(dev);
1475 				error = writedisklabel(cdev, &cs->sc_label);
1476 			}
1477 		}
1478 
1479 		cs->sc_flags &= ~CCDF_LABELLING;
1480 
1481 		ccdunlock(cs);
1482 
1483 		if (error)
1484 			return (error);
1485 		break;
1486 
1487 	case DIOCWLABEL:
1488 		if ((cs->sc_flags & CCDF_INITED) == 0)
1489 			return (ENXIO);
1490 
1491 		if ((flag & FWRITE) == 0)
1492 			return (EBADF);
1493 		if (*(int *)data != 0)
1494 			cs->sc_flags |= CCDF_WLABEL;
1495 		else
1496 			cs->sc_flags &= ~CCDF_WLABEL;
1497 		break;
1498 
1499 	default:
1500 		return (ENOTTY);
1501 	}
1502 
1503 	return (0);
1504 }
1505 
1506 static int
1507 ccdsize(dev_t dev)
1508 {
1509 	struct ccd_softc *cs;
1510 	int part, size;
1511 
1512 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1513 		return (-1);
1514 
1515 	cs = &ccd_softc[ccdunit(dev)];
1516 	part = ccdpart(dev);
1517 
1518 	if ((cs->sc_flags & CCDF_INITED) == 0)
1519 		return (-1);
1520 
1521 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1522 		size = -1;
1523 	else
1524 		size = cs->sc_label.d_partitions[part].p_size;
1525 
1526 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1527 		return (-1);
1528 
1529 	return (size);
1530 }
1531 
/*
 * Dump entry point.  Not implemented: all arguments are ignored and
 * every call is refused with ENXIO.
 */
static int
ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
{
	const int unsupported = ENXIO;

	return (unsupported);
}
1538 
1539 /*
1540  * Lookup the provided name in the filesystem.  If the file exists,
1541  * is a valid block device, and isn't being used by anyone else,
1542  * set *vpp to the file's vnode.
1543  */
1544 static int
1545 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1546 {
1547 	struct nlookupdata nd;
1548 	struct ucred *cred;
1549 	struct vnode *vp;
1550 	int error;
1551 
1552 	KKASSERT(td->td_proc);
1553 	cred = td->td_proc->p_ucred;
1554 	*vpp = NULL;
1555 
1556 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1557 	if (error)
1558 		return (error);
1559 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1560 #ifdef DEBUG
1561 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1562 			printf("ccdlookup: vn_open error = %d\n", error);
1563 #endif
1564 		goto done;
1565 	}
1566 	vp = nd.nl_open_vp;
1567 
1568 	if (vp->v_usecount > 1) {
1569 		error = EBUSY;
1570 		goto done;
1571 	}
1572 
1573 	if (!vn_isdisk(vp, &error))
1574 		goto done;
1575 
1576 #ifdef DEBUG
1577 	if (ccddebug & CCDB_VNODE)
1578 		vprint("ccdlookup: vnode info", vp);
1579 #endif
1580 
1581 	VOP_UNLOCK(vp, 0, td);
1582 	nd.nl_open_vp = NULL;
1583 	nlookup_done(&nd);
1584 	*vpp = vp;				/* leave ref intact  */
1585 	return (0);
1586 done:
1587 	nlookup_done(&nd);
1588 	return (error);
1589 }
1590 
1591 /*
1592  * Read the disklabel from the ccd.  If one is not present, fake one
1593  * up.
1594  */
1595 static void
1596 ccdgetdisklabel(dev_t dev)
1597 {
1598 	int unit = ccdunit(dev);
1599 	struct ccd_softc *cs = &ccd_softc[unit];
1600 	char *errstring;
1601 	struct disklabel *lp = &cs->sc_label;
1602 	struct ccdgeom *ccg = &cs->sc_geom;
1603 	dev_t cdev;
1604 
1605 	bzero(lp, sizeof(*lp));
1606 
1607 	lp->d_secperunit = cs->sc_size;
1608 	lp->d_secsize = ccg->ccg_secsize;
1609 	lp->d_nsectors = ccg->ccg_nsectors;
1610 	lp->d_ntracks = ccg->ccg_ntracks;
1611 	lp->d_ncylinders = ccg->ccg_ncylinders;
1612 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1613 
1614 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1615 	lp->d_type = DTYPE_CCD;
1616 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1617 	lp->d_rpm = 3600;
1618 	lp->d_interleave = 1;
1619 	lp->d_flags = 0;
1620 
1621 	lp->d_partitions[RAW_PART].p_offset = 0;
1622 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1623 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1624 	lp->d_npartitions = RAW_PART + 1;
1625 
1626 	lp->d_bbsize = BBSIZE;				/* XXX */
1627 	lp->d_sbsize = SBSIZE;				/* XXX */
1628 
1629 	lp->d_magic = DISKMAGIC;
1630 	lp->d_magic2 = DISKMAGIC;
1631 	lp->d_checksum = dkcksum(&cs->sc_label);
1632 
1633 	/*
1634 	 * Call the generic disklabel extraction routine.
1635 	 */
1636 	cdev = CCDLABELDEV(dev);
1637 	errstring = readdisklabel(cdev, &cs->sc_label);
1638 	if (errstring != NULL)
1639 		ccdmakedisklabel(cs);
1640 
1641 #ifdef DEBUG
1642 	/* It's actually extremely common to have unlabeled ccds. */
1643 	if (ccddebug & CCDB_LABEL)
1644 		if (errstring != NULL)
1645 			printf("ccd%d: %s\n", unit, errstring);
1646 #endif
1647 }
1648 
1649 /*
1650  * Take care of things one might want to take care of in the event
1651  * that a disklabel isn't present.
1652  */
1653 static void
1654 ccdmakedisklabel(struct ccd_softc *cs)
1655 {
1656 	struct disklabel *lp = &cs->sc_label;
1657 
1658 	/*
1659 	 * For historical reasons, if there's no disklabel present
1660 	 * the raw partition must be marked FS_BSDFFS.
1661 	 */
1662 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1663 
1664 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1665 }
1666 
1667 /*
1668  * Wait interruptibly for an exclusive lock.
1669  *
1670  * XXX
1671  * Several drivers do this; it should be abstracted and made MP-safe.
1672  */
1673 static int
1674 ccdlock(struct ccd_softc *cs)
1675 {
1676 	int error;
1677 
1678 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1679 		cs->sc_flags |= CCDF_WANTED;
1680 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1681 			return (error);
1682 	}
1683 	cs->sc_flags |= CCDF_LOCKED;
1684 	return (0);
1685 }
1686 
1687 /*
1688  * Unlock and wake up any waiters.
1689  */
1690 static void
1691 ccdunlock(struct ccd_softc *cs)
1692 {
1693 
1694 	cs->sc_flags &= ~CCDF_LOCKED;
1695 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1696 		cs->sc_flags &= ~CCDF_WANTED;
1697 		wakeup(cs);
1698 	}
1699 }
1700 
#ifdef DEBUG
/*
 * Debug helper: print an interleave table.  Entries are walked until
 * one with ii_ndisk == 0 is found; each line shows the entry's number,
 * disk count, start block and start offset followed by its index list.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	int entry = 0;

	while (ii->ii_ndisk != 0) {
		int k = 0;

		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       entry, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
		while (k < ii->ii_ndisk) {
			printf(" %d", ii->ii_index[k]);
			k++;
		}
		printf("\n");

		entry++;
		ii++;
	}
}
#endif
1716 
1717 
1718 /* Local Variables: */
1719 /* c-argdecl-indent: 8 */
1720 /* c-continued-statement-offset: 8 */
1721 /* c-indent-level: 8 */
1722 /* End: */
1723