xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision 6fb88001)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.21 2005/12/11 01:54:07 swildner Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <vfs/ufs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
109 #include <sys/buf2.h>
110 
111 #include <sys/ccdvar.h>
112 
113 #include <sys/thread2.h>
114 
115 #include <vm/vm_zone.h>
116 
/*
 * Building with CCDDEBUG implies DEBUG for the section below, which
 * defines the debug flag bits and the debug.ccddebug sysctl.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/*
 * NOTE(review): DEBUG is undefined right here, so every later
 * "#ifdef DEBUG" section in this file is compiled out even though
 * ccddebug and its sysctl are still defined.  Confirm this is intended.
 */
#undef DEBUG
#endif
132 
133 #define	ccdunit(x)	dkunit(x)
134 #define ccdpart(x)	dkpart(x)
135 
136 /*
137    This is how mirroring works (only writes are special):
138 
139    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
140    linked together by the cb_mirror field.  "cb_pflags &
141    CCDPF_MIRROR_DONE" is set to 0 on both of them.
142 
143    When a component returns to ccdiodone(), it checks if "cb_pflags &
144    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
145    flag and returns.  If it is, it means its partner has already
146    returned, so it will go to the regular cleanup.
147 
148  */
149 
/*
 * Per-component I/O request built by ccdbuffer().
 *
 * cb_buf must stay the first member: ccdbuffer() installs ccdiodone()
 * (which takes a struct ccdbuf *) as cb_buf.b_iodone through a
 * function-pointer cast, so the completion callback is presumably handed
 * the address of cb_buf and treats it as the address of the ccdbuf.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct buf	*cb_obp;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
159 
160 /* bits in cb_pflags */
161 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
162 
163 #define CCDLABELDEV(dev)	\
164 	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
165 
166 static d_open_t ccdopen;
167 static d_close_t ccdclose;
168 static d_strategy_t ccdstrategy;
169 static d_ioctl_t ccdioctl;
170 static d_dump_t ccddump;
171 static d_psize_t ccdsize;
172 
173 #define NCCDFREEHIWAT	16
174 
175 #define CDEV_MAJOR 74
176 
/*
 * Device switch for the ccd driver; registered by ccdattach() through
 * cdevsw_add().  Entries are positional, each one is labelled with the
 * slot it fills.
 */
static struct cdevsw ccd_cdevsw = {
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* flags */	D_DISK,
	/* port */      NULL,
	/* clone */	NULL,

	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* dump */	ccddump,
	/* psize */	ccdsize
};
195 
196 /* called during module initialization */
197 static	void ccdattach (void);
198 static	int ccd_modevent (module_t, int, void *);
199 
200 /* called by biodone() at interrupt time */
201 static	void ccdiodone (struct ccdbuf *cbp);
202 
203 static	void ccdstart (struct ccd_softc *, struct buf *);
204 static	void ccdinterleave (struct ccd_softc *, int);
205 static	void ccdintr (struct ccd_softc *, struct buf *);
206 static	int ccdinit (struct ccddevice *, char **, struct thread *);
207 static	int ccdlookup (char *, struct thread *td, struct vnode **);
208 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
209 		struct buf *, daddr_t, caddr_t, long);
210 static	void ccdgetdisklabel (dev_t);
211 static	void ccdmakedisklabel (struct ccd_softc *);
212 static	int ccdlock (struct ccd_softc *);
213 static	void ccdunlock (struct ccd_softc *);
214 
215 #ifdef DEBUG
216 static	void printiinfo (struct ccdiinfo *);
217 #endif
218 
219 /* Non-private for the benefit of libkvm. */
220 struct	ccd_softc *ccd_softc;
221 struct	ccddevice *ccddevs;
222 struct	ccdbuf *ccdfreebufs;
223 static	int numccdfreebufs;
224 static	int numccd = 0;
225 
226 /*
227  * getccdbuf() -	Allocate and zero a ccd buffer.
228  *
229  *	This routine is called at splbio().
230  */
231 
232 static __inline
233 struct ccdbuf *
234 getccdbuf(struct ccdbuf *cpy)
235 {
236 	struct ccdbuf *cbp;
237 
238 	/*
239 	 * Allocate from freelist or malloc as necessary
240 	 */
241 	if ((cbp = ccdfreebufs) != NULL) {
242 		ccdfreebufs = cbp->cb_freenext;
243 		--numccdfreebufs;
244 	} else {
245 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
246 	}
247 
248 	/*
249 	 * Used by mirroring code
250 	 */
251 	if (cpy)
252 		bcopy(cpy, cbp, sizeof(struct ccdbuf));
253 	else
254 		bzero(cbp, sizeof(struct ccdbuf));
255 
256 	/*
257 	 * independant struct buf initialization
258 	 */
259 	LIST_INIT(&cbp->cb_buf.b_dep);
260 	BUF_LOCKINIT(&cbp->cb_buf);
261 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
262 	BUF_KERNPROC(&cbp->cb_buf);
263 
264 	return(cbp);
265 }
266 
267 /*
268  * putccdbuf() -	Free a ccd buffer.
269  *
270  *	This routine is called at splbio().
271  */
272 
273 static __inline
274 void
275 putccdbuf(struct ccdbuf *cbp)
276 {
277 	BUF_UNLOCK(&cbp->cb_buf);
278 	BUF_LOCKFREE(&cbp->cb_buf);
279 
280 	if (numccdfreebufs < NCCDFREEHIWAT) {
281 		cbp->cb_freenext = ccdfreebufs;
282 		ccdfreebufs = cbp;
283 		++numccdfreebufs;
284 	} else {
285 		free((caddr_t)cbp, M_DEVBUF);
286 	}
287 }
288 
289 
290 /*
 * Number of blocks to leave untouched in front of a component partition.
292  * This is to avoid violating its disklabel area when it starts at the
293  * beginning of the slice.
294  */
295 #if !defined(CCD_OFFSET)
296 #define CCD_OFFSET 16
297 #endif
298 
299 /*
300  * Called by main() during pseudo-device attachment.  All we need
301  * to do is allocate enough space for devices to be configured later, and
302  * add devsw entries.
303  */
304 static void
305 ccdattach(void)
306 {
307 	int i;
308 	int num = NCCD;
309 
310 	if (num > 1)
311 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
312 	else
313 		printf("ccd0: Concatenated disk driver\n");
314 
315 	ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
316 			    M_WAITOK | M_ZERO);
317 	ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
318 			    M_WAITOK | M_ZERO);
319 	numccd = num;
320 
321 	cdevsw_add(&ccd_cdevsw, 0, 0);
322 	/* XXX: is this necessary? */
323 	for (i = 0; i < numccd; ++i)
324 		ccddevs[i].ccd_dk = -1;
325 }
326 
327 static int
328 ccd_modevent(module_t mod, int type, void *data)
329 {
330 	int error = 0;
331 
332 	switch (type) {
333 	case MOD_LOAD:
334 		ccdattach();
335 		break;
336 
337 	case MOD_UNLOAD:
338 		printf("ccd0: Unload not supported!\n");
339 		error = EOPNOTSUPP;
340 		break;
341 
342 	default:	/* MOD_SHUTDOWN etc */
343 		break;
344 	}
345 	return (error);
346 }
347 
348 DEV_MODULE(ccd, ccd_modevent, NULL);
349 
350 static int
351 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
352 {
353 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
354 	struct ccdcinfo *ci = NULL;	/* XXX */
355 	size_t size;
356 	int ix;
357 	struct vnode *vp;
358 	size_t minsize;
359 	int maxsecsize;
360 	struct partinfo dpart;
361 	struct ccdgeom *ccg = &cs->sc_geom;
362 	char tmppath[MAXPATHLEN];
363 	int error = 0;
364 	struct ucred *cred;
365 
366 	KKASSERT(td->td_proc);
367 	cred = td->td_proc->p_ucred;
368 
369 #ifdef DEBUG
370 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
371 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
372 #endif
373 
374 	cs->sc_size = 0;
375 	cs->sc_ileave = ccd->ccd_interleave;
376 	cs->sc_nccdisks = ccd->ccd_ndev;
377 
378 	/* Allocate space for the component info. */
379 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
380 	    M_DEVBUF, M_WAITOK);
381 
382 	/*
383 	 * Verify that each component piece exists and record
384 	 * relevant information about it.
385 	 */
386 	maxsecsize = 0;
387 	minsize = 0;
388 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
389 		vp = ccd->ccd_vpp[ix];
390 		ci = &cs->sc_cinfo[ix];
391 		ci->ci_vp = vp;
392 
393 		/*
394 		 * Copy in the pathname of the component.
395 		 */
396 		bzero(tmppath, sizeof(tmppath));	/* sanity */
397 		if ((error = copyinstr(cpaths[ix], tmppath,
398 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
399 #ifdef DEBUG
400 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
401 				printf("ccd%d: can't copy path, error = %d\n",
402 				    ccd->ccd_unit, error);
403 #endif
404 			goto fail;
405 		}
406 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
407 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
408 
409 		ci->ci_dev = vn_todev(vp);
410 
411 		/*
412 		 * Get partition information for the component.
413 		 */
414 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
415 		    FREAD, cred, td)) != 0) {
416 #ifdef DEBUG
417 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
418 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
419 				     ccd->ccd_unit, ci->ci_path, error);
420 #endif
421 			goto fail;
422 		}
423 		if (dpart.part->p_fstype == FS_BSDFFS) {
424 			maxsecsize =
425 			    ((dpart.disklab->d_secsize > maxsecsize) ?
426 			    dpart.disklab->d_secsize : maxsecsize);
427 			size = dpart.part->p_size - CCD_OFFSET;
428 		} else {
429 #ifdef DEBUG
430 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
431 				printf("ccd%d: %s: incorrect partition type\n",
432 				    ccd->ccd_unit, ci->ci_path);
433 #endif
434 			error = EFTYPE;
435 			goto fail;
436 		}
437 
438 		/*
439 		 * Calculate the size, truncating to an interleave
440 		 * boundary if necessary.
441 		 */
442 
443 		if (cs->sc_ileave > 1)
444 			size -= size % cs->sc_ileave;
445 
446 		if (size == 0) {
447 #ifdef DEBUG
448 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
449 				printf("ccd%d: %s: size == 0\n",
450 				    ccd->ccd_unit, ci->ci_path);
451 #endif
452 			error = ENODEV;
453 			goto fail;
454 		}
455 
456 		if (minsize == 0 || size < minsize)
457 			minsize = size;
458 		ci->ci_size = size;
459 		cs->sc_size += size;
460 	}
461 
462 	/*
463 	 * Don't allow the interleave to be smaller than
464 	 * the biggest component sector.
465 	 */
466 	if ((cs->sc_ileave > 0) &&
467 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
468 #ifdef DEBUG
469 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
470 			printf("ccd%d: interleave must be at least %d\n",
471 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
472 #endif
473 		error = EINVAL;
474 		goto fail;
475 	}
476 
477 	/*
478 	 * If uniform interleave is desired set all sizes to that of
479 	 * the smallest component.  This will guarentee that a single
480 	 * interleave table is generated.
481 	 *
482 	 * Lost space must be taken into account when calculating the
483 	 * overall size.  Half the space is lost when CCDF_MIRROR is
484 	 * specified.  One disk is lost when CCDF_PARITY is specified.
485 	 */
486 	if (ccd->ccd_flags & CCDF_UNIFORM) {
487 		for (ci = cs->sc_cinfo;
488 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
489 			ci->ci_size = minsize;
490 		}
491 		if (ccd->ccd_flags & CCDF_MIRROR) {
492 			/*
493 			 * Check to see if an even number of components
494 			 * have been specified.  The interleave must also
495 			 * be non-zero in order for us to be able to
496 			 * guarentee the topology.
497 			 */
498 			if (cs->sc_nccdisks % 2) {
499 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
500 				error = EINVAL;
501 				goto fail;
502 			}
503 			if (cs->sc_ileave == 0) {
504 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
505 				error = EINVAL;
506 				goto fail;
507 			}
508 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
509 		} else if (ccd->ccd_flags & CCDF_PARITY) {
510 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
511 		} else {
512 			if (cs->sc_ileave == 0) {
513 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
514 				error = EINVAL;
515 				goto fail;
516 			}
517 			cs->sc_size = cs->sc_nccdisks * minsize;
518 		}
519 	}
520 
521 	/*
522 	 * Construct the interleave table.
523 	 */
524 	ccdinterleave(cs, ccd->ccd_unit);
525 
526 	/*
527 	 * Create pseudo-geometry based on 1MB cylinders.  It's
528 	 * pretty close.
529 	 */
530 	ccg->ccg_secsize = maxsecsize;
531 	ccg->ccg_ntracks = 1;
532 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
533 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
534 
535 	/*
536 	 * Add an devstat entry for this device.
537 	 */
538 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
539 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
540 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
541 			  DEVSTAT_PRIORITY_ARRAY);
542 
543 	cs->sc_flags |= CCDF_INITED;
544 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
545 	cs->sc_unit = ccd->ccd_unit;
546 	return (0);
547 fail:
548 	while (ci > cs->sc_cinfo) {
549 		ci--;
550 		free(ci->ci_path, M_DEVBUF);
551 	}
552 	free(cs->sc_cinfo, M_DEVBUF);
553 	return (error);
554 }
555 
/*
 * ccdinterleave() - Build the interleave table cs->sc_itable.
 *
 *	The table holds up to sc_nccdisks entries followed by a terminating
 *	entry whose ii_ndisk is 0.  With sc_ileave == 0 there is one
 *	single-disk entry per component; otherwise one entry is produced
 *	per distinct component size, smallest first, listing the components
 *	that are at least that large.
 *
 *	The unit argument is only accepted, it is not used by the code below.
 */
static void
ccdinterleave(struct ccd_softc *cs, int unit)
{
	struct ccdcinfo *ci, *smallci;
	struct ccdiinfo *ii;
	daddr_t bn, lbn;
	int ix;
	u_long size;

#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
#endif

	/*
	 * Allocate an interleave table.  The worst case occurs when each
	 * of N disks is of a different size, resulting in N interleave
	 * tables.
	 *
	 * Chances are this is too big, but we don't care.
	 */
	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
	bzero((caddr_t)cs->sc_itable, size);

	/*
	 * Trivial case: no interleave (actually interleave of disk size).
	 * Each table entry represents a single component in its entirety.
	 *
	 * An interleave of 0 may not be used with a mirror or parity setup.
	 */
	if (cs->sc_ileave == 0) {
		bn = 0;
		ii = cs->sc_itable;

		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
			/* Allocate space for ii_index. */
			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
			ii->ii_ndisk = 1;
			ii->ii_startblk = bn;
			ii->ii_startoff = 0;
			ii->ii_index[0] = ix;
			bn += cs->sc_cinfo[ix].ci_size;
			ii++;
		}
		/* terminating entry */
		ii->ii_ndisk = 0;
#ifdef DEBUG
		if (ccddebug & CCDB_INIT)
			printiinfo(cs->sc_itable);
#endif
		return;
	}

	/*
	 * The following isn't fast or pretty; it doesn't have to be.
	 *
	 * From here on "size" is reused as the component size already
	 * covered by the previous table entries.
	 */
	size = 0;
	bn = lbn = 0;
	for (ii = cs->sc_itable; ; ii++) {
		/*
		 * Allocate space for ii_index.  We might allocate more than
		 * we use.
		 */
		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
		    M_DEVBUF, M_WAITOK);

		/*
		 * Locate the smallest of the remaining components
		 */
		smallci = NULL;
		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
		    ci++) {
			if (ci->ci_size > size &&
			    (smallci == NULL ||
			     ci->ci_size < smallci->ci_size)) {
				smallci = ci;
			}
		}

		/*
		 * Nobody left, all done
		 */
		if (smallci == NULL) {
			ii->ii_ndisk = 0;
			break;
		}

		/*
		 * Record starting logical block using an sc_ileave blocksize.
		 */
		ii->ii_startblk = bn / cs->sc_ileave;

		/*
		 * Record starting component block using an sc_ileave
		 * blocksize.  This value is relative to the beginning of
		 * a component disk.
		 */
		ii->ii_startoff = lbn;

		/*
		 * Determine how many disks take part in this interleave
		 * and record their indices.
		 */
		ix = 0;
		for (ci = cs->sc_cinfo;
		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
			if (ci->ci_size >= smallci->ci_size) {
				ii->ii_index[ix++] = ci - cs->sc_cinfo;
			}
		}
		ii->ii_ndisk = ix;
		/* advance by the space this entry adds across its ix disks */
		bn += ix * (smallci->ci_size - size);
		lbn = smallci->ci_size / cs->sc_ileave;
		size = smallci->ci_size;
	}
#ifdef DEBUG
	if (ccddebug & CCDB_INIT)
		printiinfo(cs->sc_itable);
#endif
}
676 
677 /* ARGSUSED */
678 static int
679 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
680 {
681 	int unit = ccdunit(dev);
682 	struct ccd_softc *cs;
683 	struct disklabel *lp;
684 	int error = 0, part, pmask;
685 
686 #ifdef DEBUG
687 	if (ccddebug & CCDB_FOLLOW)
688 		printf("ccdopen(%x, %x)\n", dev, flags);
689 #endif
690 	if (unit >= numccd)
691 		return (ENXIO);
692 	cs = &ccd_softc[unit];
693 
694 	if ((error = ccdlock(cs)) != 0)
695 		return (error);
696 
697 	lp = &cs->sc_label;
698 
699 	part = ccdpart(dev);
700 	pmask = (1 << part);
701 
702 	/*
703 	 * If we're initialized, check to see if there are any other
704 	 * open partitions.  If not, then it's safe to update
705 	 * the in-core disklabel.
706 	 */
707 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
708 		ccdgetdisklabel(dev);
709 
710 	/* Check that the partition exists. */
711 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
712 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
713 		error = ENXIO;
714 		goto done;
715 	}
716 
717 	cs->sc_openmask |= pmask;
718  done:
719 	ccdunlock(cs);
720 	return (0);
721 }
722 
723 /* ARGSUSED */
724 static int
725 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
726 {
727 	int unit = ccdunit(dev);
728 	struct ccd_softc *cs;
729 	int error = 0, part;
730 
731 #ifdef DEBUG
732 	if (ccddebug & CCDB_FOLLOW)
733 		printf("ccdclose(%x, %x)\n", dev, flags);
734 #endif
735 
736 	if (unit >= numccd)
737 		return (ENXIO);
738 	cs = &ccd_softc[unit];
739 
740 	if ((error = ccdlock(cs)) != 0)
741 		return (error);
742 
743 	part = ccdpart(dev);
744 
745 	/* ...that much closer to allowing unconfiguration... */
746 	cs->sc_openmask &= ~(1 << part);
747 	ccdunlock(cs);
748 	return (0);
749 }
750 
/*
 * ccdstrategy() - Strategy entry point.
 *
 *	Validates bp against the unit state and either the disklabel
 *	(normal partitions) or the overall ccd size (raw partition), then
 *	hands the request to ccdstart() inside a critical section.
 *	Requests that are rejected, fall exactly at EOF, or have a zero
 *	byte count are completed right here through biodone().
 *
 *	NOTE(review): the unit taken from bp->b_dev is used to index
 *	ccd_softc without a range check in this function; presumably
 *	ccdopen() has already rejected bad units - confirm.
 */
static void
ccdstrategy(struct buf *bp)
{
	int unit = ccdunit(bp->b_dev);
	struct ccd_softc *cs = &ccd_softc[unit];
	int wlabel;
	struct disklabel *lp;

#ifdef DEBUG
	if (ccddebug & CCDB_FOLLOW)
		printf("ccdstrategy(%x): unit %d\n", bp, unit);
#endif
	if ((cs->sc_flags & CCDF_INITED) == 0) {
		bp->b_error = ENXIO;
		bp->b_flags |= B_ERROR;
		goto done;
	}

	/* If it's a nil transfer, wake up the top half now. */
	if (bp->b_bcount == 0)
		goto done;

	lp = &cs->sc_label;

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */
	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
	if (ccdpart(bp->b_dev) != RAW_PART) {
		if (bounds_check_with_label(bp, lp, wlabel) <= 0)
			goto done;
	} else {
		int pbn;        /* in sc_secsize chunks */
		long sz;        /* in sc_secsize chunks */

		pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);

		/*
		 * If out of bounds return an error. If at the EOF point,
		 * simply read or write less.
		 */

		if (pbn < 0 || pbn >= cs->sc_size) {
			bp->b_resid = bp->b_bcount;
			/* exactly at EOF is not an error, just a 0 transfer */
			if (pbn != cs->sc_size) {
				bp->b_error = EINVAL;
				bp->b_flags |= B_ERROR | B_INVAL;
			}
			goto done;
		}

		/*
		 * If the request crosses EOF, truncate the request.
		 */
		if (pbn + sz > cs->sc_size) {
			bp->b_bcount = (cs->sc_size - pbn) *
			    cs->sc_geom.ccg_secsize;
		}
	}

	bp->b_resid = bp->b_bcount;

	/*
	 * "Start" the unit.
	 */
	crit_enter();
	ccdstart(cs, bp);
	crit_exit();
	return;
done:
	biodone(bp);
}
825 
/*
 * ccdstart() -	Split bp into component requests and issue them.
 *
 *	Each pass of the loop asks ccdbuffer() for the component buffer(s)
 *	covering the next piece of the transfer, issues them with
 *	VOP_STRATEGY() and advances by the byte count of that piece.
 *	ccdstrategy() calls this inside a critical section.
 */
static void
ccdstart(struct ccd_softc *cs, struct buf *bp)
{
	long bcount, rcount;
	struct ccdbuf *cbp[4];
	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
	caddr_t addr;
	daddr_t bn;
	struct partition *pp;

#ifdef DEBUG
	if (ccddebug & CCDB_FOLLOW)
		printf("ccdstart(%x, %x)\n", cs, bp);
#endif

	/* Record the transaction start  */
	devstat_start_transaction(&cs->device_stats);

	/*
	 * Translate the partition-relative block number to an absolute.
	 */
	bn = bp->b_blkno;
	if (ccdpart(bp->b_dev) != RAW_PART) {
		pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
		bn += pp->p_offset;
	}

	/*
	 * Allocate component buffers and fire off the requests
	 */
	addr = bp->b_data;
	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
		ccdbuffer(cbp, cs, bp, bn, addr, bcount);
		/* bytes covered by this piece (may be less than bcount) */
		rcount = cbp[0]->cb_buf.b_bcount;

		if (cs->sc_cflags & CCDF_MIRROR) {
			/*
			 * Mirroring.  Writes go to both disks, reads are
			 * taken from whichever disk seems most appropriate.
			 *
			 * We attempt to localize reads to the disk whose arm
			 * is nearest the read request.  We ignore seeks due
			 * to writes when making this determination and we
			 * also try to avoid hogging.
			 */
			if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
				cbp[0]->cb_buf.b_vp->v_numoutput++;
				cbp[1]->cb_buf.b_vp->v_numoutput++;
				VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
				    &cbp[0]->cb_buf);
				VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
				    &cbp[1]->cb_buf);
			} else {
				int pick = cs->sc_pick;
				daddr_t range = cs->sc_size / 16;

				/*
				 * Switch to the other half when the request
				 * is more than 1/16th of the ccd size away
				 * from where the current half last read.
				 */
				if (bn < cs->sc_blk[pick] - range ||
				    bn > cs->sc_blk[pick] + range
				) {
					cs->sc_pick = pick = 1 - pick;
				}
				cs->sc_blk[pick] = bn + btodb(rcount);
				/*
				 * Only cbp[pick] is issued.  The other
				 * buffer is not released here; presumably
				 * ccdiodone() takes care of it - confirm.
				 */
				VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
				    &cbp[pick]->cb_buf);
			}
		} else {
			/*
			 * Not mirroring
			 */
			if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
				cbp[0]->cb_buf.b_vp->v_numoutput++;
			VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
		}
		bn += btodb(rcount);
		addr += rcount;
	}
}
903 
904 /*
905  * Build a component buffer header.
906  */
/*
 * ccdbuffer() - Build the component buffer(s) for one piece of bp.
 *
 *	cb	output array; cb[0] is always filled in, cb[1] only when
 *		the unit is mirrored
 *	cs	softc of the ccd unit
 *	bp	original request
 *	bn	absolute ccd block number at which this piece starts
 *	addr	data address for this piece
 *	bcount	bytes of the original request still to be covered
 *
 *	The piece is clipped so it does not run past the end of the
 *	component (no interleave) or of the current interleave stripe.
 */
static void
ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct buf *bp, daddr_t bn,
	  caddr_t addr, long bcount)
{
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
		       cs, bp, bn, addr, bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 */
		daddr_t sblk;

		sblk = 0;
		/*
		 * NOTE(review): this walk has no end-of-array check; it
		 * relies on bn lying inside the ccd - confirm the callers
		 * guarantee that.
		 */
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		/* the loop stops one entry past the one that contains cbn */
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.  CCD_OFFSET skips the
	 * blocks reserved at the front of every component.
	 */
	cbp = getccdbuf(NULL);
	cbp->cb_buf.b_flags = bp->b_flags;
	cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
	cbp->cb_buf.b_dev = ci->ci_dev;		/* XXX */
	cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->cb_buf.b_data = addr;
	cbp->cb_buf.b_vp = ci->ci_vp;
	/* cbc: bytes left in this component / in this interleave stripe */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
 	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obp = bp;
	cbp->cb_unit = cs - ccd_softc;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
		       ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/*
		 * mirror, setup second I/O as a clone of the first that
		 * targets the partner component.  ci2 is only assigned in
		 * the multi-disk mirror branch above.
		 */
		cbp = getccdbuf(cb[0]);
		cbp->cb_buf.b_dev = ci2->ci_dev;
		cbp->cb_buf.b_vp = ci2->ci_vp;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1076 
1077 static void
1078 ccdintr(struct ccd_softc *cs, struct buf *bp)
1079 {
1080 #ifdef DEBUG
1081 	if (ccddebug & CCDB_FOLLOW)
1082 		printf("ccdintr(%x, %x)\n", cs, bp);
1083 #endif
1084 	/*
1085 	 * Request is done for better or worse, wakeup the top half.
1086 	 */
1087 	if (bp->b_flags & B_ERROR)
1088 		bp->b_resid = bp->b_bcount;
1089 	devstat_end_transaction_buf(&cs->device_stats, bp);
1090 	biodone(bp);
1091 }
1092 
1093 /*
1094  * Called at interrupt time.
1095  * Mark the component as done and if all components are done,
1096  * take a ccd interrupt.
1097  */
1098 static void
1099 ccdiodone(struct ccdbuf *cbp)
1100 {
1101 	struct buf *bp = cbp->cb_obp;
1102 	int unit = cbp->cb_unit;
1103 	int count;
1104 
1105 	crit_enter();
1106 #ifdef DEBUG
1107 	if (ccddebug & CCDB_FOLLOW)
1108 		printf("ccdiodone(%x)\n", cbp);
1109 	if (ccddebug & CCDB_IO) {
1110 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1111 		       bp, bp->b_bcount, bp->b_resid);
1112 		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1113 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1114 		       cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1115 		       cbp->cb_buf.b_bcount);
1116 	}
1117 #endif
1118 	/*
1119 	 * If an error occured, report it.  If this is a mirrored
1120 	 * configuration and the first of two possible reads, do not
1121 	 * set the error in the bp yet because the second read may
1122 	 * succeed.
1123 	 */
1124 
1125 	if (cbp->cb_buf.b_flags & B_ERROR) {
1126 		const char *msg = "";
1127 
1128 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1129 		    (cbp->cb_buf.b_flags & B_READ) &&
1130 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1131 			/*
1132 			 * We will try our read on the other disk down
1133 			 * below, also reverse the default pick so if we
1134 			 * are doing a scan we do not keep hitting the
1135 			 * bad disk first.
1136 			 */
1137 			struct ccd_softc *cs = &ccd_softc[unit];
1138 
1139 			msg = ", trying other disk";
1140 			cs->sc_pick = 1 - cs->sc_pick;
1141 			cs->sc_blk[cs->sc_pick] = bp->b_blkno;
1142 		} else {
1143 			bp->b_flags |= B_ERROR;
1144 			bp->b_error = cbp->cb_buf.b_error ?
1145 			    cbp->cb_buf.b_error : EIO;
1146 		}
1147 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1148 		       unit, bp->b_error, cbp->cb_comp,
1149 		       (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1150 	}
1151 
1152 	/*
1153 	 * Process mirror.  If we are writing, I/O has been initiated on both
1154 	 * buffers and we fall through only after both are finished.
1155 	 *
1156 	 * If we are reading only one I/O is initiated at a time.  If an
1157 	 * error occurs we initiate the second I/O and return, otherwise
1158 	 * we free the second I/O without initiating it.
1159 	 */
1160 
1161 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1162 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1163 			/*
1164 			 * When writing, handshake with the second buffer
1165 			 * to determine when both are done.  If both are not
1166 			 * done, return here.
1167 			 */
1168 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1169 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1170 				putccdbuf(cbp);
1171 				crit_exit();
1172 				return;
1173 			}
1174 		} else {
1175 			/*
1176 			 * When reading, either dispose of the second buffer
1177 			 * or initiate I/O on the second buffer if an error
1178 			 * occured with this one.
1179 			 */
1180 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1181 				if (cbp->cb_buf.b_flags & B_ERROR) {
1182 					cbp->cb_mirror->cb_pflags |=
1183 					    CCDPF_MIRROR_DONE;
1184 					VOP_STRATEGY(
1185 					    cbp->cb_mirror->cb_buf.b_vp,
1186 					    &cbp->cb_mirror->cb_buf
1187 					);
1188 					putccdbuf(cbp);
1189 					crit_exit();
1190 					return;
1191 				} else {
1192 					putccdbuf(cbp->cb_mirror);
1193 					/* fall through */
1194 				}
1195 			}
1196 		}
1197 	}
1198 
1199 	/*
1200 	 * use b_bufsize to determine how big the original request was rather
1201 	 * then b_bcount, because b_bcount may have been truncated for EOF.
1202 	 *
1203 	 * XXX We check for an error, but we do not test the resid for an
1204 	 * aligned EOF condition.  This may result in character & block
1205 	 * device access not recognizing EOF properly when read or written
1206 	 * sequentially, but will not effect filesystems.
1207 	 */
1208 	count = cbp->cb_buf.b_bufsize;
1209 	putccdbuf(cbp);
1210 
1211 	/*
1212 	 * If all done, "interrupt".
1213 	 */
1214 	bp->b_resid -= count;
1215 	if (bp->b_resid < 0)
1216 		panic("ccdiodone: count");
1217 	if (bp->b_resid == 0)
1218 		ccdintr(&ccd_softc[unit], bp);
1219 	crit_exit();
1220 }
1221 
1222 static int
1223 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1224 {
1225 	int unit = ccdunit(dev);
1226 	int i, j, lookedup = 0, error = 0;
1227 	int part, pmask;
1228 	struct ccd_softc *cs;
1229 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1230 	struct ccddevice ccd;
1231 	char **cpp;
1232 	struct vnode **vpp;
1233 	struct ucred *cred;
1234 
1235 	KKASSERT(td->td_proc != NULL);
1236 	cred = td->td_proc->p_ucred;
1237 
1238 	if (unit >= numccd)
1239 		return (ENXIO);
1240 	cs = &ccd_softc[unit];
1241 
1242 	bzero(&ccd, sizeof(ccd));
1243 
1244 	switch (cmd) {
1245 	case CCDIOCSET:
1246 		if (cs->sc_flags & CCDF_INITED)
1247 			return (EBUSY);
1248 
1249 		if ((flag & FWRITE) == 0)
1250 			return (EBADF);
1251 
1252 		if ((error = ccdlock(cs)) != 0)
1253 			return (error);
1254 
1255 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1256 			return (EINVAL);
1257 
1258 		/* Fill in some important bits. */
1259 		ccd.ccd_unit = unit;
1260 		ccd.ccd_interleave = ccio->ccio_ileave;
1261 		if (ccd.ccd_interleave == 0 &&
1262 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1263 		     (ccio->ccio_flags & CCDF_PARITY))) {
1264 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1265 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1266 		}
1267 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1268 		    (ccio->ccio_flags & CCDF_PARITY)) {
1269 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1270 			ccio->ccio_flags &= ~CCDF_PARITY;
1271 		}
1272 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1273 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1274 			printf("ccd%d: mirror/parity forces uniform flag\n",
1275 			       unit);
1276 			ccio->ccio_flags |= CCDF_UNIFORM;
1277 		}
1278 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1279 
1280 		/*
1281 		 * Allocate space for and copy in the array of
1282 		 * componet pathnames and device numbers.
1283 		 */
1284 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1285 		    M_DEVBUF, M_WAITOK);
1286 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1287 		    M_DEVBUF, M_WAITOK);
1288 
1289 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1290 		    ccio->ccio_ndisks * sizeof(char **));
1291 		if (error) {
1292 			free(vpp, M_DEVBUF);
1293 			free(cpp, M_DEVBUF);
1294 			ccdunlock(cs);
1295 			return (error);
1296 		}
1297 
1298 #ifdef DEBUG
1299 		if (ccddebug & CCDB_INIT)
1300 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1301 				printf("ccdioctl: component %d: 0x%x\n",
1302 				    i, cpp[i]);
1303 #endif
1304 
1305 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1306 #ifdef DEBUG
1307 			if (ccddebug & CCDB_INIT)
1308 				printf("ccdioctl: lookedup = %d\n", lookedup);
1309 #endif
1310 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1311 				for (j = 0; j < lookedup; ++j)
1312 					(void)vn_close(vpp[j], FREAD|FWRITE, td);
1313 				free(vpp, M_DEVBUF);
1314 				free(cpp, M_DEVBUF);
1315 				ccdunlock(cs);
1316 				return (error);
1317 			}
1318 			++lookedup;
1319 		}
1320 		ccd.ccd_cpp = cpp;
1321 		ccd.ccd_vpp = vpp;
1322 		ccd.ccd_ndev = ccio->ccio_ndisks;
1323 
1324 		/*
1325 		 * Initialize the ccd.  Fills in the softc for us.
1326 		 */
1327 		if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1328 			for (j = 0; j < lookedup; ++j)
1329 				(void)vn_close(vpp[j], FREAD|FWRITE, td);
1330 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1331 			free(vpp, M_DEVBUF);
1332 			free(cpp, M_DEVBUF);
1333 			ccdunlock(cs);
1334 			return (error);
1335 		}
1336 
1337 		/*
1338 		 * The ccd has been successfully initialized, so
1339 		 * we can place it into the array and read the disklabel.
1340 		 */
1341 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1342 		ccio->ccio_unit = unit;
1343 		ccio->ccio_size = cs->sc_size;
1344 		ccdgetdisklabel(dev);
1345 
1346 		ccdunlock(cs);
1347 
1348 		break;
1349 
1350 	case CCDIOCCLR:
1351 		if ((cs->sc_flags & CCDF_INITED) == 0)
1352 			return (ENXIO);
1353 
1354 		if ((flag & FWRITE) == 0)
1355 			return (EBADF);
1356 
1357 		if ((error = ccdlock(cs)) != 0)
1358 			return (error);
1359 
1360 		/* Don't unconfigure if any other partitions are open */
1361 		part = ccdpart(dev);
1362 		pmask = (1 << part);
1363 		if ((cs->sc_openmask & ~pmask)) {
1364 			ccdunlock(cs);
1365 			return (EBUSY);
1366 		}
1367 
1368 		/*
1369 		 * Free ccd_softc information and clear entry.
1370 		 */
1371 
1372 		/* Close the components and free their pathnames. */
1373 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1374 			/*
1375 			 * XXX: this close could potentially fail and
1376 			 * cause Bad Things.  Maybe we need to force
1377 			 * the close to happen?
1378 			 */
1379 #ifdef DEBUG
1380 			if (ccddebug & CCDB_VNODE)
1381 				vprint("CCDIOCCLR: vnode info",
1382 				    cs->sc_cinfo[i].ci_vp);
1383 #endif
1384 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1385 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1386 		}
1387 
1388 		/* Free interleave index. */
1389 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1390 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1391 
1392 		/* Free component info and interleave table. */
1393 		free(cs->sc_cinfo, M_DEVBUF);
1394 		free(cs->sc_itable, M_DEVBUF);
1395 		cs->sc_flags &= ~CCDF_INITED;
1396 
1397 		/*
1398 		 * Free ccddevice information and clear entry.
1399 		 */
1400 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1401 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1402 		ccd.ccd_dk = -1;
1403 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1404 
1405 		/*
1406 		 * And remove the devstat entry.
1407 		 */
1408 		devstat_remove_entry(&cs->device_stats);
1409 
1410 		/* This must be atomic. */
1411 		crit_enter();
1412 		ccdunlock(cs);
1413 		bzero(cs, sizeof(struct ccd_softc));
1414 		crit_exit();
1415 
1416 		break;
1417 
1418 	case DIOCGDINFO:
1419 		if ((cs->sc_flags & CCDF_INITED) == 0)
1420 			return (ENXIO);
1421 
1422 		*(struct disklabel *)data = cs->sc_label;
1423 		break;
1424 
1425 	case DIOCGPART:
1426 		if ((cs->sc_flags & CCDF_INITED) == 0)
1427 			return (ENXIO);
1428 
1429 		((struct partinfo *)data)->disklab = &cs->sc_label;
1430 		((struct partinfo *)data)->part =
1431 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1432 		break;
1433 
1434 	case DIOCWDINFO:
1435 	case DIOCSDINFO:
1436 		if ((cs->sc_flags & CCDF_INITED) == 0)
1437 			return (ENXIO);
1438 
1439 		if ((flag & FWRITE) == 0)
1440 			return (EBADF);
1441 
1442 		if ((error = ccdlock(cs)) != 0)
1443 			return (error);
1444 
1445 		cs->sc_flags |= CCDF_LABELLING;
1446 
1447 		error = setdisklabel(&cs->sc_label,
1448 		    (struct disklabel *)data, 0);
1449 		if (error == 0) {
1450 			if (cmd == DIOCWDINFO) {
1451 				dev_t cdev = CCDLABELDEV(dev);
1452 				error = writedisklabel(cdev, &cs->sc_label);
1453 			}
1454 		}
1455 
1456 		cs->sc_flags &= ~CCDF_LABELLING;
1457 
1458 		ccdunlock(cs);
1459 
1460 		if (error)
1461 			return (error);
1462 		break;
1463 
1464 	case DIOCWLABEL:
1465 		if ((cs->sc_flags & CCDF_INITED) == 0)
1466 			return (ENXIO);
1467 
1468 		if ((flag & FWRITE) == 0)
1469 			return (EBADF);
1470 		if (*(int *)data != 0)
1471 			cs->sc_flags |= CCDF_WLABEL;
1472 		else
1473 			cs->sc_flags &= ~CCDF_WLABEL;
1474 		break;
1475 
1476 	default:
1477 		return (ENOTTY);
1478 	}
1479 
1480 	return (0);
1481 }
1482 
1483 static int
1484 ccdsize(dev_t dev)
1485 {
1486 	struct ccd_softc *cs;
1487 	int part, size;
1488 
1489 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1490 		return (-1);
1491 
1492 	cs = &ccd_softc[ccdunit(dev)];
1493 	part = ccdpart(dev);
1494 
1495 	if ((cs->sc_flags & CCDF_INITED) == 0)
1496 		return (-1);
1497 
1498 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1499 		size = -1;
1500 	else
1501 		size = cs->sc_label.d_partitions[part].p_size;
1502 
1503 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1504 		return (-1);
1505 
1506 	return (size);
1507 }
1508 
1509 static int
1510 ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
1511 {
1512 	/* Not implemented. */
1513 	return ENXIO;
1514 }
1515 
1516 /*
1517  * Lookup the provided name in the filesystem.  If the file exists,
1518  * is a valid block device, and isn't being used by anyone else,
1519  * set *vpp to the file's vnode.
1520  */
1521 static int
1522 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1523 {
1524 	struct nlookupdata nd;
1525 	struct ucred *cred;
1526 	struct vnode *vp;
1527 	int error;
1528 
1529 	KKASSERT(td->td_proc);
1530 	cred = td->td_proc->p_ucred;
1531 	*vpp = NULL;
1532 
1533 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1534 	if (error)
1535 		return (error);
1536 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1537 #ifdef DEBUG
1538 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1539 			printf("ccdlookup: vn_open error = %d\n", error);
1540 #endif
1541 		goto done;
1542 	}
1543 	vp = nd.nl_open_vp;
1544 
1545 	if (vp->v_usecount > 1) {
1546 		error = EBUSY;
1547 		goto done;
1548 	}
1549 
1550 	if (!vn_isdisk(vp, &error))
1551 		goto done;
1552 
1553 #ifdef DEBUG
1554 	if (ccddebug & CCDB_VNODE)
1555 		vprint("ccdlookup: vnode info", vp);
1556 #endif
1557 
1558 	VOP_UNLOCK(vp, 0, td);
1559 	nd.nl_open_vp = NULL;
1560 	nlookup_done(&nd);
1561 	*vpp = vp;				/* leave ref intact  */
1562 	return (0);
1563 done:
1564 	nlookup_done(&nd);
1565 	return (error);
1566 }
1567 
1568 /*
1569  * Read the disklabel from the ccd.  If one is not present, fake one
1570  * up.
1571  */
1572 static void
1573 ccdgetdisklabel(dev_t dev)
1574 {
1575 	int unit = ccdunit(dev);
1576 	struct ccd_softc *cs = &ccd_softc[unit];
1577 	char *errstring;
1578 	struct disklabel *lp = &cs->sc_label;
1579 	struct ccdgeom *ccg = &cs->sc_geom;
1580 	dev_t cdev;
1581 
1582 	bzero(lp, sizeof(*lp));
1583 
1584 	lp->d_secperunit = cs->sc_size;
1585 	lp->d_secsize = ccg->ccg_secsize;
1586 	lp->d_nsectors = ccg->ccg_nsectors;
1587 	lp->d_ntracks = ccg->ccg_ntracks;
1588 	lp->d_ncylinders = ccg->ccg_ncylinders;
1589 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1590 
1591 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1592 	lp->d_type = DTYPE_CCD;
1593 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1594 	lp->d_rpm = 3600;
1595 	lp->d_interleave = 1;
1596 	lp->d_flags = 0;
1597 
1598 	lp->d_partitions[RAW_PART].p_offset = 0;
1599 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1600 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1601 	lp->d_npartitions = RAW_PART + 1;
1602 
1603 	lp->d_bbsize = BBSIZE;				/* XXX */
1604 	lp->d_sbsize = SBSIZE;				/* XXX */
1605 
1606 	lp->d_magic = DISKMAGIC;
1607 	lp->d_magic2 = DISKMAGIC;
1608 	lp->d_checksum = dkcksum(&cs->sc_label);
1609 
1610 	/*
1611 	 * Call the generic disklabel extraction routine.
1612 	 */
1613 	cdev = CCDLABELDEV(dev);
1614 	errstring = readdisklabel(cdev, &cs->sc_label);
1615 	if (errstring != NULL)
1616 		ccdmakedisklabel(cs);
1617 
1618 #ifdef DEBUG
1619 	/* It's actually extremely common to have unlabeled ccds. */
1620 	if (ccddebug & CCDB_LABEL)
1621 		if (errstring != NULL)
1622 			printf("ccd%d: %s\n", unit, errstring);
1623 #endif
1624 }
1625 
1626 /*
1627  * Take care of things one might want to take care of in the event
1628  * that a disklabel isn't present.
1629  */
1630 static void
1631 ccdmakedisklabel(struct ccd_softc *cs)
1632 {
1633 	struct disklabel *lp = &cs->sc_label;
1634 
1635 	/*
1636 	 * For historical reasons, if there's no disklabel present
1637 	 * the raw partition must be marked FS_BSDFFS.
1638 	 */
1639 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1640 
1641 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1642 }
1643 
1644 /*
1645  * Wait interruptibly for an exclusive lock.
1646  *
1647  * XXX
1648  * Several drivers do this; it should be abstracted and made MP-safe.
1649  */
1650 static int
1651 ccdlock(struct ccd_softc *cs)
1652 {
1653 	int error;
1654 
1655 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1656 		cs->sc_flags |= CCDF_WANTED;
1657 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1658 			return (error);
1659 	}
1660 	cs->sc_flags |= CCDF_LOCKED;
1661 	return (0);
1662 }
1663 
1664 /*
1665  * Unlock and wake up any waiters.
1666  */
1667 static void
1668 ccdunlock(struct ccd_softc *cs)
1669 {
1670 
1671 	cs->sc_flags &= ~CCDF_LOCKED;
1672 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1673 		cs->sc_flags &= ~CCDF_WANTED;
1674 		wakeup(cs);
1675 	}
1676 }
1677 
#ifdef DEBUG
/*
 * Debug helper: print every entry of an interleave table, one line per
 * entry, stopping at the first entry whose ii_ndisk is zero.
 */
static void
printiinfo(struct ccdiinfo *ii)
{
	struct ccdiinfo *entry;
	int slot, d;

	for (entry = ii, slot = 0; entry->ii_ndisk != 0; ++entry, ++slot) {
		printf(" itab[%d]: #dk %d sblk %d soff %d",
		       slot, entry->ii_ndisk, entry->ii_startblk,
		       entry->ii_startoff);

		/* Followed by the component indices of this entry. */
		d = 0;
		while (d < entry->ii_ndisk) {
			printf(" %d", entry->ii_index[d]);
			d++;
		}
		printf("\n");
	}
}
#endif
1693 
1694 
1695 /* Local Variables: */
1696 /* c-argdecl-indent: 8 */
1697 /* c-continued-statement-offset: 8 */
1698 /* c-indent-level: 8 */
1699 /* End: */
1700