xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision 375d1659)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.23 2006/03/08 17:14:11 dillon Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <vfs/ufs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
109 #include <sys/buf2.h>
110 
111 #include <sys/ccdvar.h>
112 
113 #include <sys/thread2.h>
114 
115 #include <vm/vm_zone.h>
116 
/*
 * Building with CCDDEBUG turns on DEBUG for the definitions below.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

/*
 * Debug trace categories.  ccddebug is a bitmask of CCDB_* values (all
 * categories enabled by default) and is registered read/write under the
 * _debug sysctl node.
 *
 * NOTE: DEBUG is undefined again at the end of this section.  Unless it
 * is redefined further down, every later "#ifdef DEBUG" region in this
 * file is therefore compiled out, even when the build set DEBUG or
 * CCDDEBUG.
 */
#ifdef DEBUG
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#undef DEBUG
#endif
132 
/*
 * Unit and partition of a ccd dev_t, obtained through the generic
 * dkunit()/dkpart() helpers.
 */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
135 
136 /*
137    This is how mirroring works (only writes are special):
138 
139    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
140    linked together by the cb_mirror field.  "cb_pflags &
141    CCDPF_MIRROR_DONE" is set to 0 on both of them.
142 
143    When a component returns to ccdiodone(), it checks if "cb_pflags &
144    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
145    flag and returns.  If it is, it means its partner has already
146    returned, so it will go to the regular cleanup.
147 
148  */
149 
/*
 * Per-component I/O request.  ccdbuffer() builds one of these for each
 * piece of an original request (two when mirroring); they are obtained
 * from getccdbuf() and recycled through the ccdfreebufs list by
 * putccdbuf().
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};

/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/*
 * Sub-device of dev that addresses RAW_PART of the same ccd unit
 * (the middle dkmakeminor() argument is presumably the slice; it is
 * always 0 here).
 */
#define CCDLABELDEV(dev)	\
	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
165 
/* Device entry points implemented by this driver. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;
static d_psize_t ccdsize;

/*
 * Upper bound on the number of ccdbufs kept on the ccdfreebufs list;
 * putccdbuf() frees buffers once the list has reached this size.
 */
#define NCCDFREEHIWAT	16

/* Major number placed in ccd_cdevsw below. */
#define CDEV_MAJOR 74

/*
 * Device switch for the ccd driver.  Raw read/write go through
 * physread/physwrite; poll and mmap use the generic nopoll/nommap
 * handlers.
 */
static struct cdevsw ccd_cdevsw = {
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* flags */	D_DISK,
	/* port */      NULL,
	/* clone */	NULL,

	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* dump */	ccddump,
	/* psize */	ccdsize
};
195 
/* called during module initialization */
static	void ccdattach (void);
static	int ccd_modevent (module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone (struct bio *bio);

/*
 * Request path: ccdstrategy() hands a checked request to ccdstart(),
 * which splits it into component buffers built by ccdbuffer().
 * ccdintr() finishes the original request with biodone().
 * ccdinit() configures a unit and calls ccdinterleave() to build its
 * interleave table.
 */
static	void ccdstart (struct ccd_softc *, struct bio *);
static	void ccdinterleave (struct ccd_softc *, int);
static	void ccdintr (struct ccd_softc *, struct bio *);
static	int ccdinit (struct ccddevice *, char **, struct thread *);
static	int ccdlookup (char *, struct thread *td, struct vnode **);
static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, daddr_t, caddr_t, long);
static	void ccdgetdisklabel (dev_t);
static	void ccdmakedisklabel (struct ccd_softc *);
static	int ccdlock (struct ccd_softc *);
static	void ccdunlock (struct ccd_softc *);

#ifdef DEBUG
static	void printiinfo (struct ccdiinfo *);
#endif

/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;		/* per-unit state, numccd entries */
struct	ccddevice *ccddevs;		/* per-unit config, numccd entries */
struct	ccdbuf *ccdfreebufs;		/* head of the ccdbuf free list */
static	int numccdfreebufs;		/* entries currently on ccdfreebufs */
static	int numccd = 0;			/* units allocated by ccdattach() */
225 
226 /*
227  * getccdbuf() -	Allocate and zero a ccd buffer.
228  *
229  *	This routine is called at splbio().
230  */
231 
232 static __inline
233 struct ccdbuf *
234 getccdbuf(void)
235 {
236 	struct ccdbuf *cbp;
237 
238 	/*
239 	 * Allocate from freelist or malloc as necessary
240 	 */
241 	if ((cbp = ccdfreebufs) != NULL) {
242 		ccdfreebufs = cbp->cb_freenext;
243 		--numccdfreebufs;
244 		reinitbufbio(&cbp->cb_buf);
245 	} else {
246 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
247 		initbufbio(&cbp->cb_buf);
248 	}
249 
250 	/*
251 	 * independant struct buf initialization
252 	 */
253 	LIST_INIT(&cbp->cb_buf.b_dep);
254 	BUF_LOCKINIT(&cbp->cb_buf);
255 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
256 	BUF_KERNPROC(&cbp->cb_buf);
257 
258 	return(cbp);
259 }
260 
261 /*
262  * putccdbuf() -	Free a ccd buffer.
263  *
264  *	This routine is called at splbio().
265  */
266 
267 static __inline
268 void
269 putccdbuf(struct ccdbuf *cbp)
270 {
271 	BUF_UNLOCK(&cbp->cb_buf);
272 	BUF_LOCKFREE(&cbp->cb_buf);
273 
274 	if (numccdfreebufs < NCCDFREEHIWAT) {
275 		cbp->cb_freenext = ccdfreebufs;
276 		ccdfreebufs = cbp;
277 		++numccdfreebufs;
278 	} else {
279 		free((caddr_t)cbp, M_DEVBUF);
280 	}
281 }
282 
283 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This is to avoid violating its disklabel area when it
 * starts at the beginning of the slice.  May be overridden at build
 * time by defining CCD_OFFSET.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif
292 
293 /*
294  * Called by main() during pseudo-device attachment.  All we need
295  * to do is allocate enough space for devices to be configured later, and
296  * add devsw entries.
297  */
298 static void
299 ccdattach(void)
300 {
301 	int i;
302 	int num = NCCD;
303 
304 	if (num > 1)
305 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
306 	else
307 		printf("ccd0: Concatenated disk driver\n");
308 
309 	ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
310 			    M_WAITOK | M_ZERO);
311 	ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
312 			    M_WAITOK | M_ZERO);
313 	numccd = num;
314 
315 	cdevsw_add(&ccd_cdevsw, 0, 0);
316 	/* XXX: is this necessary? */
317 	for (i = 0; i < numccd; ++i)
318 		ccddevs[i].ccd_dk = -1;
319 }
320 
321 static int
322 ccd_modevent(module_t mod, int type, void *data)
323 {
324 	int error = 0;
325 
326 	switch (type) {
327 	case MOD_LOAD:
328 		ccdattach();
329 		break;
330 
331 	case MOD_UNLOAD:
332 		printf("ccd0: Unload not supported!\n");
333 		error = EOPNOTSUPP;
334 		break;
335 
336 	default:	/* MOD_SHUTDOWN etc */
337 		break;
338 	}
339 	return (error);
340 }
341 
342 DEV_MODULE(ccd, ccd_modevent, NULL);
343 
344 static int
345 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
346 {
347 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
348 	struct ccdcinfo *ci = NULL;	/* XXX */
349 	size_t size;
350 	int ix;
351 	struct vnode *vp;
352 	size_t minsize;
353 	int maxsecsize;
354 	struct partinfo dpart;
355 	struct ccdgeom *ccg = &cs->sc_geom;
356 	char tmppath[MAXPATHLEN];
357 	int error = 0;
358 	struct ucred *cred;
359 
360 	KKASSERT(td->td_proc);
361 	cred = td->td_proc->p_ucred;
362 
363 #ifdef DEBUG
364 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
365 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
366 #endif
367 
368 	cs->sc_size = 0;
369 	cs->sc_ileave = ccd->ccd_interleave;
370 	cs->sc_nccdisks = ccd->ccd_ndev;
371 
372 	/* Allocate space for the component info. */
373 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
374 	    M_DEVBUF, M_WAITOK);
375 
376 	/*
377 	 * Verify that each component piece exists and record
378 	 * relevant information about it.
379 	 */
380 	maxsecsize = 0;
381 	minsize = 0;
382 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
383 		vp = ccd->ccd_vpp[ix];
384 		ci = &cs->sc_cinfo[ix];
385 		ci->ci_vp = vp;
386 
387 		/*
388 		 * Copy in the pathname of the component.
389 		 */
390 		bzero(tmppath, sizeof(tmppath));	/* sanity */
391 		if ((error = copyinstr(cpaths[ix], tmppath,
392 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
393 #ifdef DEBUG
394 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
395 				printf("ccd%d: can't copy path, error = %d\n",
396 				    ccd->ccd_unit, error);
397 #endif
398 			goto fail;
399 		}
400 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
401 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
402 
403 		ci->ci_dev = vn_todev(vp);
404 
405 		/*
406 		 * Get partition information for the component.
407 		 */
408 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
409 		    FREAD, cred, td)) != 0) {
410 #ifdef DEBUG
411 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
412 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
413 				     ccd->ccd_unit, ci->ci_path, error);
414 #endif
415 			goto fail;
416 		}
417 		if (dpart.part->p_fstype == FS_BSDFFS) {
418 			maxsecsize =
419 			    ((dpart.disklab->d_secsize > maxsecsize) ?
420 			    dpart.disklab->d_secsize : maxsecsize);
421 			size = dpart.part->p_size - CCD_OFFSET;
422 		} else {
423 #ifdef DEBUG
424 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
425 				printf("ccd%d: %s: incorrect partition type\n",
426 				    ccd->ccd_unit, ci->ci_path);
427 #endif
428 			error = EFTYPE;
429 			goto fail;
430 		}
431 
432 		/*
433 		 * Calculate the size, truncating to an interleave
434 		 * boundary if necessary.
435 		 */
436 
437 		if (cs->sc_ileave > 1)
438 			size -= size % cs->sc_ileave;
439 
440 		if (size == 0) {
441 #ifdef DEBUG
442 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
443 				printf("ccd%d: %s: size == 0\n",
444 				    ccd->ccd_unit, ci->ci_path);
445 #endif
446 			error = ENODEV;
447 			goto fail;
448 		}
449 
450 		if (minsize == 0 || size < minsize)
451 			minsize = size;
452 		ci->ci_size = size;
453 		cs->sc_size += size;
454 	}
455 
456 	/*
457 	 * Don't allow the interleave to be smaller than
458 	 * the biggest component sector.
459 	 */
460 	if ((cs->sc_ileave > 0) &&
461 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
462 #ifdef DEBUG
463 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
464 			printf("ccd%d: interleave must be at least %d\n",
465 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
466 #endif
467 		error = EINVAL;
468 		goto fail;
469 	}
470 
471 	/*
472 	 * If uniform interleave is desired set all sizes to that of
473 	 * the smallest component.  This will guarentee that a single
474 	 * interleave table is generated.
475 	 *
476 	 * Lost space must be taken into account when calculating the
477 	 * overall size.  Half the space is lost when CCDF_MIRROR is
478 	 * specified.  One disk is lost when CCDF_PARITY is specified.
479 	 */
480 	if (ccd->ccd_flags & CCDF_UNIFORM) {
481 		for (ci = cs->sc_cinfo;
482 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
483 			ci->ci_size = minsize;
484 		}
485 		if (ccd->ccd_flags & CCDF_MIRROR) {
486 			/*
487 			 * Check to see if an even number of components
488 			 * have been specified.  The interleave must also
489 			 * be non-zero in order for us to be able to
490 			 * guarentee the topology.
491 			 */
492 			if (cs->sc_nccdisks % 2) {
493 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
494 				error = EINVAL;
495 				goto fail;
496 			}
497 			if (cs->sc_ileave == 0) {
498 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
499 				error = EINVAL;
500 				goto fail;
501 			}
502 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
503 		} else if (ccd->ccd_flags & CCDF_PARITY) {
504 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
505 		} else {
506 			if (cs->sc_ileave == 0) {
507 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
508 				error = EINVAL;
509 				goto fail;
510 			}
511 			cs->sc_size = cs->sc_nccdisks * minsize;
512 		}
513 	}
514 
515 	/*
516 	 * Construct the interleave table.
517 	 */
518 	ccdinterleave(cs, ccd->ccd_unit);
519 
520 	/*
521 	 * Create pseudo-geometry based on 1MB cylinders.  It's
522 	 * pretty close.
523 	 */
524 	ccg->ccg_secsize = maxsecsize;
525 	ccg->ccg_ntracks = 1;
526 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
527 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
528 
529 	/*
530 	 * Add an devstat entry for this device.
531 	 */
532 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
533 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
534 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
535 			  DEVSTAT_PRIORITY_ARRAY);
536 
537 	cs->sc_flags |= CCDF_INITED;
538 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
539 	cs->sc_unit = ccd->ccd_unit;
540 	return (0);
541 fail:
542 	while (ci > cs->sc_cinfo) {
543 		ci--;
544 		free(ci->ci_path, M_DEVBUF);
545 	}
546 	free(cs->sc_cinfo, M_DEVBUF);
547 	return (error);
548 }
549 
550 static void
551 ccdinterleave(struct ccd_softc *cs, int unit)
552 {
553 	struct ccdcinfo *ci, *smallci;
554 	struct ccdiinfo *ii;
555 	daddr_t bn, lbn;
556 	int ix;
557 	u_long size;
558 
559 #ifdef DEBUG
560 	if (ccddebug & CCDB_INIT)
561 		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
562 #endif
563 
564 	/*
565 	 * Allocate an interleave table.  The worst case occurs when each
566 	 * of N disks is of a different size, resulting in N interleave
567 	 * tables.
568 	 *
569 	 * Chances are this is too big, but we don't care.
570 	 */
571 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
572 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
573 	bzero((caddr_t)cs->sc_itable, size);
574 
575 	/*
576 	 * Trivial case: no interleave (actually interleave of disk size).
577 	 * Each table entry represents a single component in its entirety.
578 	 *
579 	 * An interleave of 0 may not be used with a mirror or parity setup.
580 	 */
581 	if (cs->sc_ileave == 0) {
582 		bn = 0;
583 		ii = cs->sc_itable;
584 
585 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
586 			/* Allocate space for ii_index. */
587 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
588 			ii->ii_ndisk = 1;
589 			ii->ii_startblk = bn;
590 			ii->ii_startoff = 0;
591 			ii->ii_index[0] = ix;
592 			bn += cs->sc_cinfo[ix].ci_size;
593 			ii++;
594 		}
595 		ii->ii_ndisk = 0;
596 #ifdef DEBUG
597 		if (ccddebug & CCDB_INIT)
598 			printiinfo(cs->sc_itable);
599 #endif
600 		return;
601 	}
602 
603 	/*
604 	 * The following isn't fast or pretty; it doesn't have to be.
605 	 */
606 	size = 0;
607 	bn = lbn = 0;
608 	for (ii = cs->sc_itable; ; ii++) {
609 		/*
610 		 * Allocate space for ii_index.  We might allocate more then
611 		 * we use.
612 		 */
613 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
614 		    M_DEVBUF, M_WAITOK);
615 
616 		/*
617 		 * Locate the smallest of the remaining components
618 		 */
619 		smallci = NULL;
620 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
621 		    ci++) {
622 			if (ci->ci_size > size &&
623 			    (smallci == NULL ||
624 			     ci->ci_size < smallci->ci_size)) {
625 				smallci = ci;
626 			}
627 		}
628 
629 		/*
630 		 * Nobody left, all done
631 		 */
632 		if (smallci == NULL) {
633 			ii->ii_ndisk = 0;
634 			break;
635 		}
636 
637 		/*
638 		 * Record starting logical block using an sc_ileave blocksize.
639 		 */
640 		ii->ii_startblk = bn / cs->sc_ileave;
641 
642 		/*
643 		 * Record starting comopnent block using an sc_ileave
644 		 * blocksize.  This value is relative to the beginning of
645 		 * a component disk.
646 		 */
647 		ii->ii_startoff = lbn;
648 
649 		/*
650 		 * Determine how many disks take part in this interleave
651 		 * and record their indices.
652 		 */
653 		ix = 0;
654 		for (ci = cs->sc_cinfo;
655 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
656 			if (ci->ci_size >= smallci->ci_size) {
657 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
658 			}
659 		}
660 		ii->ii_ndisk = ix;
661 		bn += ix * (smallci->ci_size - size);
662 		lbn = smallci->ci_size / cs->sc_ileave;
663 		size = smallci->ci_size;
664 	}
665 #ifdef DEBUG
666 	if (ccddebug & CCDB_INIT)
667 		printiinfo(cs->sc_itable);
668 #endif
669 }
670 
671 /* ARGSUSED */
672 static int
673 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
674 {
675 	int unit = ccdunit(dev);
676 	struct ccd_softc *cs;
677 	struct disklabel *lp;
678 	int error = 0, part, pmask;
679 
680 #ifdef DEBUG
681 	if (ccddebug & CCDB_FOLLOW)
682 		printf("ccdopen(%x, %x)\n", dev, flags);
683 #endif
684 	if (unit >= numccd)
685 		return (ENXIO);
686 	cs = &ccd_softc[unit];
687 
688 	if ((error = ccdlock(cs)) != 0)
689 		return (error);
690 
691 	lp = &cs->sc_label;
692 
693 	part = ccdpart(dev);
694 	pmask = (1 << part);
695 
696 	/*
697 	 * If we're initialized, check to see if there are any other
698 	 * open partitions.  If not, then it's safe to update
699 	 * the in-core disklabel.
700 	 */
701 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
702 		ccdgetdisklabel(dev);
703 
704 	/* Check that the partition exists. */
705 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
706 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
707 		error = ENXIO;
708 		goto done;
709 	}
710 
711 	cs->sc_openmask |= pmask;
712  done:
713 	ccdunlock(cs);
714 	return (0);
715 }
716 
717 /* ARGSUSED */
718 static int
719 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
720 {
721 	int unit = ccdunit(dev);
722 	struct ccd_softc *cs;
723 	int error = 0, part;
724 
725 #ifdef DEBUG
726 	if (ccddebug & CCDB_FOLLOW)
727 		printf("ccdclose(%x, %x)\n", dev, flags);
728 #endif
729 
730 	if (unit >= numccd)
731 		return (ENXIO);
732 	cs = &ccd_softc[unit];
733 
734 	if ((error = ccdlock(cs)) != 0)
735 		return (error);
736 
737 	part = ccdpart(dev);
738 
739 	/* ...that much closer to allowing unconfiguration... */
740 	cs->sc_openmask &= ~(1 << part);
741 	ccdunlock(cs);
742 	return (0);
743 }
744 
745 static void
746 ccdstrategy(dev_t dev, struct bio *bio)
747 {
748 	int unit = ccdunit(dev);
749 	struct bio *nbio;
750 	struct buf *bp = bio->bio_buf;
751 	struct ccd_softc *cs = &ccd_softc[unit];
752 	int wlabel;
753 	struct disklabel *lp;
754 
755 #ifdef DEBUG
756 	if (ccddebug & CCDB_FOLLOW)
757 		printf("ccdstrategy(%x): unit %d\n", bp, unit);
758 #endif
759 	if ((cs->sc_flags & CCDF_INITED) == 0) {
760 		bp->b_error = ENXIO;
761 		bp->b_flags |= B_ERROR;
762 		goto done;
763 	}
764 
765 	/* If it's a nil transfer, wake up the top half now. */
766 	if (bp->b_bcount == 0)
767 		goto done;
768 
769 	lp = &cs->sc_label;
770 
771 	/*
772 	 * Do bounds checking and adjust transfer.  If there's an
773 	 * error, the bounds check will flag that for us.
774 	 */
775 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
776 	if (ccdpart(dev) != RAW_PART) {
777 		nbio = bounds_check_with_label(dev, bio, lp, wlabel);
778 		if (nbio == NULL)
779 			goto done;
780 	} else {
781 		int pbn;        /* in sc_secsize chunks */
782 		long sz;        /* in sc_secsize chunks */
783 
784 		pbn = bio->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
785 		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
786 
787 		/*
788 		 * If out of bounds return an error. If at the EOF point,
789 		 * simply read or write less.
790 		 */
791 
792 		if (pbn < 0 || pbn >= cs->sc_size) {
793 			bp->b_resid = bp->b_bcount;
794 			if (pbn != cs->sc_size) {
795 				bp->b_error = EINVAL;
796 				bp->b_flags |= B_ERROR | B_INVAL;
797 			}
798 			goto done;
799 		}
800 
801 		/*
802 		 * If the request crosses EOF, truncate the request.
803 		 */
804 		if (pbn + sz > cs->sc_size) {
805 			bp->b_bcount = (cs->sc_size - pbn) *
806 			    cs->sc_geom.ccg_secsize;
807 		}
808 		nbio = bio;
809 	}
810 
811 	bp->b_resid = bp->b_bcount;
812 	nbio->bio_driver_info = dev;
813 
814 	/*
815 	 * "Start" the unit.
816 	 */
817 	crit_enter();
818 	ccdstart(cs, nbio);
819 	crit_exit();
820 	return;
821 
822 	/*
823 	 * note: bio, not nbio, is valid at the done label.
824 	 */
825 done:
826 	biodone(bio);
827 }
828 
829 static void
830 ccdstart(struct ccd_softc *cs, struct bio *bio)
831 {
832 	long bcount, rcount;
833 	struct ccdbuf *cbp[4];
834 	struct buf *bp = bio->bio_buf;
835 	dev_t dev = bio->bio_driver_info;
836 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
837 	caddr_t addr;
838 	daddr_t bn;
839 	struct partition *pp;
840 
841 #ifdef DEBUG
842 	if (ccddebug & CCDB_FOLLOW)
843 		printf("ccdstart(%x, %x)\n", cs, bp);
844 #endif
845 
846 	/* Record the transaction start  */
847 	devstat_start_transaction(&cs->device_stats);
848 
849 	/*
850 	 * Translate the partition-relative block number to an absolute.
851 	 */
852 	bn = bio->bio_blkno;
853 	if (ccdpart(dev) != RAW_PART) {
854 		pp = &cs->sc_label.d_partitions[ccdpart(dev)];
855 		bn += pp->p_offset;
856 	}
857 
858 	/*
859 	 * Allocate component buffers and fire off the requests
860 	 */
861 	addr = bp->b_data;
862 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
863 		ccdbuffer(cbp, cs, bio, bn, addr, bcount);
864 		rcount = cbp[0]->cb_buf.b_bcount;
865 
866 		if (cs->sc_cflags & CCDF_MIRROR) {
867 			/*
868 			 * Mirroring.  Writes go to both disks, reads are
869 			 * taken from whichever disk seems most appropriate.
870 			 *
871 			 * We attempt to localize reads to the disk whos arm
872 			 * is nearest the read request.  We ignore seeks due
873 			 * to writes when making this determination and we
874 			 * also try to avoid hogging.
875 			 */
876 			if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
877 				vn_strategy(cbp[0]->cb_buf.b_vp,
878 				    &cbp[0]->cb_buf.b_bio1);
879 				vn_strategy(cbp[1]->cb_buf.b_vp,
880 				    &cbp[1]->cb_buf.b_bio1);
881 			} else {
882 				int pick = cs->sc_pick;
883 				daddr_t range = cs->sc_size / 16;
884 
885 				if (bn < cs->sc_blk[pick] - range ||
886 				    bn > cs->sc_blk[pick] + range
887 				) {
888 					cs->sc_pick = pick = 1 - pick;
889 				}
890 				cs->sc_blk[pick] = bn + btodb(rcount);
891 				vn_strategy(cbp[pick]->cb_buf.b_vp,
892 				    &cbp[pick]->cb_buf.b_bio1);
893 			}
894 		} else {
895 			/*
896 			 * Not mirroring
897 			 */
898 			vn_strategy(cbp[0]->cb_buf.b_vp,
899 				     &cbp[0]->cb_buf.b_bio1);
900 		}
901 		bn += btodb(rcount);
902 		addr += rcount;
903 	}
904 }
905 
/*
 * ccdbuffer() -	Build the component buffer header(s) for one piece
 *			of a request.
 *
 *	cb	output array; cb[0] is always filled in, and cb[1] as well
 *		when the unit is a mirror (CCDF_MIRROR)
 *	cs	softc of the ccd unit
 *	bio	original request; recorded in cb_obio and the source of
 *		the buf flags copied into each component buf
 *	bn	starting block of this piece within the ccd (the comments
 *		below treat it as DEV_BSIZE granular)
 *	addr	data address for this piece
 *	bcount	bytes of the request still to be covered
 *
 *	Each buffer covers at most the rest of the current component
 *	(no interleave) or the rest of the current interleave stripe, and
 *	never more than bcount.  The caller reads the size actually
 *	covered from cb[0]->cb_buf.b_bcount.  Component block numbers are
 *	shifted by CCD_OFFSET.
 */
static void
ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio, daddr_t bn,
	  caddr_t addr, long bcount)
{
	/* ci2 is only assigned in the mirror branch of the mapping below */
	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
	struct ccdbuf *cbp;
	daddr_t cbn, cboff;
	off_t cbc;

	/*
	 * NOTE(review): this debug block refers to "bp", which is not
	 * declared in this function; it only builds because DEBUG is
	 * undefined near the top of the file.
	 */
#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
		       cs, bp, bn, addr, bcount);
#endif
	/*
	 * Determine which component bn falls in.
	 */
	cbn = bn;
	cboff = 0;

	if (cs->sc_ileave == 0) {
		/*
		 * Serially concatenated and neither a mirror nor a parity
		 * config.  This is a special case.
		 *
		 * Walk the components, accumulating their sizes, until the
		 * one containing cbn is found.  NOTE(review): the walk has
		 * no upper bound of its own and relies on bn lying inside
		 * the ccd.
		 */
		daddr_t sblk;

		sblk = 0;
		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
			sblk += ci->ci_size;
		cbn -= sblk;
	} else {
		struct ccdiinfo *ii;
		int ccdisk, off;

		/*
		 * Calculate cbn, the logical superblock (sc_ileave chunks),
		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
		 * to cbn.
		 */
		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */

		/*
		 * Figure out which interleave table to use: stop at the
		 * first entry that starts beyond cbn (or at the terminator)
		 * and step back one.
		 */
		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
			if (ii->ii_startblk > cbn)
				break;
		}
		ii--;

		/*
		 * off is the logical superblock relative to the beginning
		 * of this interleave block.
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 *
			 * NOTE(review): ci2 stays NULL on this path, so the
			 * mirror setup at the end of this function must
			 * never be reached with a single-disk entry.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_cflags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else if (cs->sc_cflags & CCDF_PARITY) {
				/*
				 * XXX not implemented yet
				 */
				int ndisk2 = ii->ii_ndisk - 1;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				if (cbn % ii->ii_ndisk <= ccdisk)
					ccdisk++;
			} else {
				/* Plain striping across all disks of the entry. */
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = getccdbuf();
	cbp->cb_buf.b_flags = bio->bio_buf->b_flags;
	cbp->cb_buf.b_data = addr;
	cbp->cb_buf.b_vp = ci->ci_vp;
	/* cbc: bytes left in this component, or in this interleave stripe */
	if (cs->sc_ileave == 0)
              cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
              cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
 	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;

	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
	/* skip the CCD_OFFSET blocks left untouched on each component */
	cbp->cb_buf.b_bio1.bio_blkno = cbn + cboff + CCD_OFFSET;
	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);

	/*
	 * context for ccdiodone
	 */
	cbp->cb_obio = bio;
	cbp->cb_unit = cs - ccd_softc;
	cbp->cb_comp = ci - cs->sc_cinfo;

#ifdef DEBUG
	if (ccddebug & CCDB_IO)
		printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
		       cbp->cb_buf.b_bio1.bio_blkno,
		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
#endif
	cb[0] = cbp;

	/*
	 * Note: both I/O's setup when reading from mirror, but only one
	 * will be executed.
	 */
	if (cs->sc_cflags & CCDF_MIRROR) {
		/*
		 * mirror, setup second I/O: identical to the first except
		 * that it targets the mirror component ci2 (the size limit
		 * is still computed from ci).
		 */
		cbp = getccdbuf();

		cbp->cb_buf.b_flags = bio->bio_buf->b_flags;
		cbp->cb_buf.b_data = addr;
		cbp->cb_buf.b_vp = ci2->ci_vp;
		if (cs->sc_ileave == 0)
		      cbc = dbtob((off_t)(ci->ci_size - cbn));
		else
		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;

		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
		cbp->cb_buf.b_bio1.bio_blkno = cbn + cboff + CCD_OFFSET;
		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);

		/*
		 * context for ccdiodone
		 */
		cbp->cb_obio = bio;
		cbp->cb_unit = cs - ccd_softc;
		cbp->cb_comp = ci2 - cs->sc_cinfo;
		cb[1] = cbp;
		/* link together the ccdbuf's and clear "mirror done" flag */
		cb[0]->cb_mirror = cb[1];
		cb[1]->cb_mirror = cb[0];
		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
	}
}
1099 
1100 static void
1101 ccdintr(struct ccd_softc *cs, struct bio *bio)
1102 {
1103 	struct buf *bp = bio->bio_buf;
1104 
1105 #ifdef DEBUG
1106 	if (ccddebug & CCDB_FOLLOW)
1107 		printf("ccdintr(%x, %x)\n", cs, bp);
1108 #endif
1109 	/*
1110 	 * Request is done for better or worse, wakeup the top half.
1111 	 */
1112 	if (bp->b_flags & B_ERROR)
1113 		bp->b_resid = bp->b_bcount;
1114 	devstat_end_transaction_buf(&cs->device_stats, bp);
1115 	biodone(bio);
1116 }
1117 
1118 /*
1119  * Called at interrupt time.
1120  * Mark the component as done and if all components are done,
1121  * take a ccd interrupt.
1122  */
1123 static void
1124 ccdiodone(struct bio *bio)
1125 {
1126 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1127 	struct bio *obio = cbp->cb_obio;
1128 	struct buf *obp = obio->bio_buf;
1129 	int unit = cbp->cb_unit;
1130 	int count;
1131 
1132 	/*
1133 	 * Since we do not have exclusive access to underlying devices,
1134 	 * we can't keep cache translations around.
1135 	 */
1136 	clearbiocache(bio->bio_next);
1137 
1138 	crit_enter();
1139 #ifdef DEBUG
1140 	if (ccddebug & CCDB_FOLLOW)
1141 		printf("ccdiodone(%x)\n", cbp);
1142 	if (ccddebug & CCDB_IO) {
1143 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1144 		       obp, obp->b_bcount, obp->b_resid);
1145 		printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1146 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1147 		       cbp->cb_buf.b_lblkno, cbp->cb_buf.b_data,
1148 		       cbp->cb_buf.b_bcount);
1149 	}
1150 #endif
1151 	/*
1152 	 * If an error occured, report it.  If this is a mirrored
1153 	 * configuration and the first of two possible reads, do not
1154 	 * set the error in the bp yet because the second read may
1155 	 * succeed.
1156 	 */
1157 	if (cbp->cb_buf.b_flags & B_ERROR) {
1158 		const char *msg = "";
1159 
1160 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1161 		    (cbp->cb_buf.b_flags & B_READ) &&
1162 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1163 			/*
1164 			 * We will try our read on the other disk down
1165 			 * below, also reverse the default pick so if we
1166 			 * are doing a scan we do not keep hitting the
1167 			 * bad disk first.
1168 			 */
1169 			struct ccd_softc *cs = &ccd_softc[unit];
1170 
1171 			msg = ", trying other disk";
1172 			cs->sc_pick = 1 - cs->sc_pick;
1173 			cs->sc_blk[cs->sc_pick] = obio->bio_blkno;
1174 		} else {
1175 			obp->b_flags |= B_ERROR;
1176 			obp->b_error = cbp->cb_buf.b_error ?
1177 			    cbp->cb_buf.b_error : EIO;
1178 		}
1179 		printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1180 		       unit, obp->b_error, cbp->cb_comp,
1181 		       (int)cbp->cb_buf.b_bio2.bio_blkno,
1182 		       obio->bio_blkno, msg);
1183 	}
1184 
1185 	/*
1186 	 * Process mirror.  If we are writing, I/O has been initiated on both
1187 	 * buffers and we fall through only after both are finished.
1188 	 *
1189 	 * If we are reading only one I/O is initiated at a time.  If an
1190 	 * error occurs we initiate the second I/O and return, otherwise
1191 	 * we free the second I/O without initiating it.
1192 	 */
1193 
1194 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1195 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1196 			/*
1197 			 * When writing, handshake with the second buffer
1198 			 * to determine when both are done.  If both are not
1199 			 * done, return here.
1200 			 */
1201 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1203 				putccdbuf(cbp);
1204 				crit_exit();
1205 				return;
1206 			}
1207 		} else {
1208 			/*
1209 			 * When reading, either dispose of the second buffer
1210 			 * or initiate I/O on the second buffer if an error
1211 			 * occured with this one.
1212 			 */
1213 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1214 				if (cbp->cb_buf.b_flags & B_ERROR) {
1215 					cbp->cb_mirror->cb_pflags |=
1216 					    CCDPF_MIRROR_DONE;
1217 					vn_strategy(
1218 					    cbp->cb_mirror->cb_buf.b_vp,
1219 					    &cbp->cb_mirror->cb_buf.b_bio1
1220 					);
1221 					putccdbuf(cbp);
1222 					crit_exit();
1223 					return;
1224 				} else {
1225 					putccdbuf(cbp->cb_mirror);
1226 					/* fall through */
1227 				}
1228 			}
1229 		}
1230 	}
1231 
1232 	/*
1233 	 * use b_bufsize to determine how big the original request was rather
1234 	 * then b_bcount, because b_bcount may have been truncated for EOF.
1235 	 *
1236 	 * XXX We check for an error, but we do not test the resid for an
1237 	 * aligned EOF condition.  This may result in character & block
1238 	 * device access not recognizing EOF properly when read or written
1239 	 * sequentially, but will not effect filesystems.
1240 	 */
1241 	count = cbp->cb_buf.b_bufsize;
1242 	putccdbuf(cbp);
1243 
1244 	/*
1245 	 * If all done, "interrupt".
1246 	 */
1247 	obp->b_resid -= count;
1248 	if (obp->b_resid < 0)
1249 		panic("ccdiodone: count");
1250 	if (obp->b_resid == 0)
1251 		ccdintr(&ccd_softc[unit], obio);
1252 	crit_exit();
1253 }
1254 
1255 static int
1256 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1257 {
1258 	int unit = ccdunit(dev);
1259 	int i, j, lookedup = 0, error = 0;
1260 	int part, pmask;
1261 	struct ccd_softc *cs;
1262 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1263 	struct ccddevice ccd;
1264 	char **cpp;
1265 	struct vnode **vpp;
1266 	struct ucred *cred;
1267 
1268 	KKASSERT(td->td_proc != NULL);
1269 	cred = td->td_proc->p_ucred;
1270 
1271 	if (unit >= numccd)
1272 		return (ENXIO);
1273 	cs = &ccd_softc[unit];
1274 
1275 	bzero(&ccd, sizeof(ccd));
1276 
1277 	switch (cmd) {
1278 	case CCDIOCSET:
1279 		if (cs->sc_flags & CCDF_INITED)
1280 			return (EBUSY);
1281 
1282 		if ((flag & FWRITE) == 0)
1283 			return (EBADF);
1284 
1285 		if ((error = ccdlock(cs)) != 0)
1286 			return (error);
1287 
1288 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1289 			return (EINVAL);
1290 
1291 		/* Fill in some important bits. */
1292 		ccd.ccd_unit = unit;
1293 		ccd.ccd_interleave = ccio->ccio_ileave;
1294 		if (ccd.ccd_interleave == 0 &&
1295 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1296 		     (ccio->ccio_flags & CCDF_PARITY))) {
1297 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1298 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1299 		}
1300 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1301 		    (ccio->ccio_flags & CCDF_PARITY)) {
1302 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1303 			ccio->ccio_flags &= ~CCDF_PARITY;
1304 		}
1305 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1306 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1307 			printf("ccd%d: mirror/parity forces uniform flag\n",
1308 			       unit);
1309 			ccio->ccio_flags |= CCDF_UNIFORM;
1310 		}
1311 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1312 
1313 		/*
1314 		 * Allocate space for and copy in the array of
1315 		 * componet pathnames and device numbers.
1316 		 */
1317 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1318 		    M_DEVBUF, M_WAITOK);
1319 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1320 		    M_DEVBUF, M_WAITOK);
1321 
1322 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1323 		    ccio->ccio_ndisks * sizeof(char **));
1324 		if (error) {
1325 			free(vpp, M_DEVBUF);
1326 			free(cpp, M_DEVBUF);
1327 			ccdunlock(cs);
1328 			return (error);
1329 		}
1330 
1331 #ifdef DEBUG
1332 		if (ccddebug & CCDB_INIT)
1333 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1334 				printf("ccdioctl: component %d: 0x%x\n",
1335 				    i, cpp[i]);
1336 #endif
1337 
1338 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1339 #ifdef DEBUG
1340 			if (ccddebug & CCDB_INIT)
1341 				printf("ccdioctl: lookedup = %d\n", lookedup);
1342 #endif
1343 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1344 				for (j = 0; j < lookedup; ++j)
1345 					(void)vn_close(vpp[j], FREAD|FWRITE, td);
1346 				free(vpp, M_DEVBUF);
1347 				free(cpp, M_DEVBUF);
1348 				ccdunlock(cs);
1349 				return (error);
1350 			}
1351 			++lookedup;
1352 		}
1353 		ccd.ccd_cpp = cpp;
1354 		ccd.ccd_vpp = vpp;
1355 		ccd.ccd_ndev = ccio->ccio_ndisks;
1356 
1357 		/*
1358 		 * Initialize the ccd.  Fills in the softc for us.
1359 		 */
1360 		if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1361 			for (j = 0; j < lookedup; ++j)
1362 				(void)vn_close(vpp[j], FREAD|FWRITE, td);
1363 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1364 			free(vpp, M_DEVBUF);
1365 			free(cpp, M_DEVBUF);
1366 			ccdunlock(cs);
1367 			return (error);
1368 		}
1369 
1370 		/*
1371 		 * The ccd has been successfully initialized, so
1372 		 * we can place it into the array and read the disklabel.
1373 		 */
1374 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1375 		ccio->ccio_unit = unit;
1376 		ccio->ccio_size = cs->sc_size;
1377 		ccdgetdisklabel(dev);
1378 
1379 		ccdunlock(cs);
1380 
1381 		break;
1382 
1383 	case CCDIOCCLR:
1384 		if ((cs->sc_flags & CCDF_INITED) == 0)
1385 			return (ENXIO);
1386 
1387 		if ((flag & FWRITE) == 0)
1388 			return (EBADF);
1389 
1390 		if ((error = ccdlock(cs)) != 0)
1391 			return (error);
1392 
1393 		/* Don't unconfigure if any other partitions are open */
1394 		part = ccdpart(dev);
1395 		pmask = (1 << part);
1396 		if ((cs->sc_openmask & ~pmask)) {
1397 			ccdunlock(cs);
1398 			return (EBUSY);
1399 		}
1400 
1401 		/*
1402 		 * Free ccd_softc information and clear entry.
1403 		 */
1404 
1405 		/* Close the components and free their pathnames. */
1406 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1407 			/*
1408 			 * XXX: this close could potentially fail and
1409 			 * cause Bad Things.  Maybe we need to force
1410 			 * the close to happen?
1411 			 */
1412 #ifdef DEBUG
1413 			if (ccddebug & CCDB_VNODE)
1414 				vprint("CCDIOCCLR: vnode info",
1415 				    cs->sc_cinfo[i].ci_vp);
1416 #endif
1417 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE, td);
1418 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1419 		}
1420 
1421 		/* Free interleave index. */
1422 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1423 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1424 
1425 		/* Free component info and interleave table. */
1426 		free(cs->sc_cinfo, M_DEVBUF);
1427 		free(cs->sc_itable, M_DEVBUF);
1428 		cs->sc_flags &= ~CCDF_INITED;
1429 
1430 		/*
1431 		 * Free ccddevice information and clear entry.
1432 		 */
1433 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1434 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1435 		ccd.ccd_dk = -1;
1436 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1437 
1438 		/*
1439 		 * And remove the devstat entry.
1440 		 */
1441 		devstat_remove_entry(&cs->device_stats);
1442 
1443 		/* This must be atomic. */
1444 		crit_enter();
1445 		ccdunlock(cs);
1446 		bzero(cs, sizeof(struct ccd_softc));
1447 		crit_exit();
1448 
1449 		break;
1450 
1451 	case DIOCGDINFO:
1452 		if ((cs->sc_flags & CCDF_INITED) == 0)
1453 			return (ENXIO);
1454 
1455 		*(struct disklabel *)data = cs->sc_label;
1456 		break;
1457 
1458 	case DIOCGPART:
1459 		if ((cs->sc_flags & CCDF_INITED) == 0)
1460 			return (ENXIO);
1461 
1462 		((struct partinfo *)data)->disklab = &cs->sc_label;
1463 		((struct partinfo *)data)->part =
1464 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1465 		break;
1466 
1467 	case DIOCWDINFO:
1468 	case DIOCSDINFO:
1469 		if ((cs->sc_flags & CCDF_INITED) == 0)
1470 			return (ENXIO);
1471 
1472 		if ((flag & FWRITE) == 0)
1473 			return (EBADF);
1474 
1475 		if ((error = ccdlock(cs)) != 0)
1476 			return (error);
1477 
1478 		cs->sc_flags |= CCDF_LABELLING;
1479 
1480 		error = setdisklabel(&cs->sc_label,
1481 		    (struct disklabel *)data, 0);
1482 		if (error == 0) {
1483 			if (cmd == DIOCWDINFO) {
1484 				dev_t cdev = CCDLABELDEV(dev);
1485 				error = writedisklabel(cdev, &cs->sc_label);
1486 			}
1487 		}
1488 
1489 		cs->sc_flags &= ~CCDF_LABELLING;
1490 
1491 		ccdunlock(cs);
1492 
1493 		if (error)
1494 			return (error);
1495 		break;
1496 
1497 	case DIOCWLABEL:
1498 		if ((cs->sc_flags & CCDF_INITED) == 0)
1499 			return (ENXIO);
1500 
1501 		if ((flag & FWRITE) == 0)
1502 			return (EBADF);
1503 		if (*(int *)data != 0)
1504 			cs->sc_flags |= CCDF_WLABEL;
1505 		else
1506 			cs->sc_flags &= ~CCDF_WLABEL;
1507 		break;
1508 
1509 	default:
1510 		return (ENOTTY);
1511 	}
1512 
1513 	return (0);
1514 }
1515 
1516 static int
1517 ccdsize(dev_t dev)
1518 {
1519 	struct ccd_softc *cs;
1520 	int part, size;
1521 
1522 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1523 		return (-1);
1524 
1525 	cs = &ccd_softc[ccdunit(dev)];
1526 	part = ccdpart(dev);
1527 
1528 	if ((cs->sc_flags & CCDF_INITED) == 0)
1529 		return (-1);
1530 
1531 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1532 		size = -1;
1533 	else
1534 		size = cs->sc_label.d_partitions[part].p_size;
1535 
1536 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1537 		return (-1);
1538 
1539 	return (size);
1540 }
1541 
1542 static int
1543 ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
1544 {
1545 	/* Not implemented. */
1546 	return ENXIO;
1547 }
1548 
1549 /*
1550  * Lookup the provided name in the filesystem.  If the file exists,
1551  * is a valid block device, and isn't being used by anyone else,
1552  * set *vpp to the file's vnode.
1553  */
1554 static int
1555 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1556 {
1557 	struct nlookupdata nd;
1558 	struct ucred *cred;
1559 	struct vnode *vp;
1560 	int error;
1561 
1562 	KKASSERT(td->td_proc);
1563 	cred = td->td_proc->p_ucred;
1564 	*vpp = NULL;
1565 
1566 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1567 	if (error)
1568 		return (error);
1569 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1570 #ifdef DEBUG
1571 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1572 			printf("ccdlookup: vn_open error = %d\n", error);
1573 #endif
1574 		goto done;
1575 	}
1576 	vp = nd.nl_open_vp;
1577 
1578 	if (vp->v_usecount > 1) {
1579 		error = EBUSY;
1580 		goto done;
1581 	}
1582 
1583 	if (!vn_isdisk(vp, &error))
1584 		goto done;
1585 
1586 #ifdef DEBUG
1587 	if (ccddebug & CCDB_VNODE)
1588 		vprint("ccdlookup: vnode info", vp);
1589 #endif
1590 
1591 	VOP_UNLOCK(vp, 0, td);
1592 	nd.nl_open_vp = NULL;
1593 	nlookup_done(&nd);
1594 	*vpp = vp;				/* leave ref intact  */
1595 	return (0);
1596 done:
1597 	nlookup_done(&nd);
1598 	return (error);
1599 }
1600 
1601 /*
1602  * Read the disklabel from the ccd.  If one is not present, fake one
1603  * up.
1604  */
1605 static void
1606 ccdgetdisklabel(dev_t dev)
1607 {
1608 	int unit = ccdunit(dev);
1609 	struct ccd_softc *cs = &ccd_softc[unit];
1610 	char *errstring;
1611 	struct disklabel *lp = &cs->sc_label;
1612 	struct ccdgeom *ccg = &cs->sc_geom;
1613 	dev_t cdev;
1614 
1615 	bzero(lp, sizeof(*lp));
1616 
1617 	lp->d_secperunit = cs->sc_size;
1618 	lp->d_secsize = ccg->ccg_secsize;
1619 	lp->d_nsectors = ccg->ccg_nsectors;
1620 	lp->d_ntracks = ccg->ccg_ntracks;
1621 	lp->d_ncylinders = ccg->ccg_ncylinders;
1622 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1623 
1624 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1625 	lp->d_type = DTYPE_CCD;
1626 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1627 	lp->d_rpm = 3600;
1628 	lp->d_interleave = 1;
1629 	lp->d_flags = 0;
1630 
1631 	lp->d_partitions[RAW_PART].p_offset = 0;
1632 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1633 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1634 	lp->d_npartitions = RAW_PART + 1;
1635 
1636 	lp->d_bbsize = BBSIZE;				/* XXX */
1637 	lp->d_sbsize = SBSIZE;				/* XXX */
1638 
1639 	lp->d_magic = DISKMAGIC;
1640 	lp->d_magic2 = DISKMAGIC;
1641 	lp->d_checksum = dkcksum(&cs->sc_label);
1642 
1643 	/*
1644 	 * Call the generic disklabel extraction routine.
1645 	 */
1646 	cdev = CCDLABELDEV(dev);
1647 	errstring = readdisklabel(cdev, &cs->sc_label);
1648 	if (errstring != NULL)
1649 		ccdmakedisklabel(cs);
1650 
1651 #ifdef DEBUG
1652 	/* It's actually extremely common to have unlabeled ccds. */
1653 	if (ccddebug & CCDB_LABEL)
1654 		if (errstring != NULL)
1655 			printf("ccd%d: %s\n", unit, errstring);
1656 #endif
1657 }
1658 
1659 /*
1660  * Take care of things one might want to take care of in the event
1661  * that a disklabel isn't present.
1662  */
1663 static void
1664 ccdmakedisklabel(struct ccd_softc *cs)
1665 {
1666 	struct disklabel *lp = &cs->sc_label;
1667 
1668 	/*
1669 	 * For historical reasons, if there's no disklabel present
1670 	 * the raw partition must be marked FS_BSDFFS.
1671 	 */
1672 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1673 
1674 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1675 }
1676 
1677 /*
1678  * Wait interruptibly for an exclusive lock.
1679  *
1680  * XXX
1681  * Several drivers do this; it should be abstracted and made MP-safe.
1682  */
1683 static int
1684 ccdlock(struct ccd_softc *cs)
1685 {
1686 	int error;
1687 
1688 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689 		cs->sc_flags |= CCDF_WANTED;
1690 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1691 			return (error);
1692 	}
1693 	cs->sc_flags |= CCDF_LOCKED;
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Unlock and wake up any waiters.
1699  */
1700 static void
1701 ccdunlock(struct ccd_softc *cs)
1702 {
1703 
1704 	cs->sc_flags &= ~CCDF_LOCKED;
1705 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1706 		cs->sc_flags &= ~CCDF_WANTED;
1707 		wakeup(cs);
1708 	}
1709 }
1710 
1711 #ifdef DEBUG
1712 static void
1713 printiinfo(struct ccdiinfo *ii)
1714 {
1715 	int ix, i;
1716 
1717 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1718 		printf(" itab[%d]: #dk %d sblk %d soff %d",
1719 		       ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1720 		for (i = 0; i < ii->ii_ndisk; i++)
1721 			printf(" %d", ii->ii_index[i]);
1722 		printf("\n");
1723 	}
1724 }
1725 #endif
1726 
1727 
1728 /* Local Variables: */
1729 /* c-argdecl-indent: 8 */
1730 /* c-continued-statement-offset: 8 */
1731 /* c-indent-level: 8 */
1732 /* End: */
1733