xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision fcce2b94)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.32 2006/05/06 02:43:02 dillon Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
108 #include <sys/buf2.h>
109 #include <sys/ccdvar.h>
110 
111 #include <vm/vm_zone.h>
112 
113 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
114 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
115 
116 #include <sys/thread2.h>
117 
/*
 * Building with CCDDEBUG turns on DEBUG for the definitions below.
 */
#if defined(CCDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

/*
 * Debug trace categories and the run-time mask selecting them.  The mask
 * is exported read/write as the debug.ccddebug sysctl and starts with
 * every category enabled.
 *
 * NOTE: DEBUG is #undef'd at the end of this section, so (unless DEBUG is
 * defined again further down) the "#ifdef DEBUG" sections later in this
 * file are compiled out even when the mask itself is present.
 */
#ifdef DEBUG
#define CCDB_FOLLOW	0x01
#define CCDB_INIT	0x02
#define CCDB_IO		0x04
#define CCDB_LABEL	0x08
#define CCDB_VNODE	0x10
static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
    CCDB_VNODE;
SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
#undef DEBUG
#endif
133 
/*
 * Unit and partition of a ccd device, obtained with the generic
 * dkunit()/dkpart() helpers.
 */
#define	ccdunit(x)	dkunit(x)
#define ccdpart(x)	dkpart(x)
136 
137 /*
138    This is how mirroring works (only writes are special):
139 
140    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
141    linked together by the cb_mirror field.  "cb_pflags &
142    CCDPF_MIRROR_DONE" is set to 0 on both of them.
143 
144    When a component returns to ccdiodone(), it checks if "cb_pflags &
145    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
146    flag and returns.  If it is, it means its partner has already
147    returned, so it will go to the regular cleanup.
148 
149  */
150 
/*
 * Per-component I/O request.  ccdbuffer() fills one of these in for each
 * piece of an original request (two, linked through cb_mirror, when the
 * unit is mirrored).  Idle structures are chained through cb_freenext on
 * the ccdfreebufs list.
 */
struct ccdbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct vnode	*cb_vp;		/* related vnode */
	struct bio	*cb_obio;	/* ptr. to original I/O buf */
	struct ccdbuf	*cb_freenext;	/* free list link */
	int		cb_unit;	/* target unit */
	int		cb_comp;	/* target component */
	int		cb_pflags;	/* mirror/parity status flag */
	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
};
161 
/* bits in cb_pflags */
#define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */

/*
 * Device for the raw partition (RAW_PART) of the unit that 'dev'
 * belongs to, built with make_sub_dev()/dkmakeminor().
 */
#define CCDLABELDEV(dev)	\
	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
167 
/* Device switch entry points implemented by this driver. */
static d_open_t ccdopen;
static d_close_t ccdclose;
static d_strategy_t ccdstrategy;
static d_ioctl_t ccdioctl;
static d_dump_t ccddump;
static d_psize_t ccdsize;

/* Upper bound on the number of idle ccdbufs kept on the free list. */
#define NCCDFREEHIWAT	16

/* Major number the driver registers under. */
#define CDEV_MAJOR 74

/*
 * Device switch for ccd.  Character read/write go through the generic
 * physread/physwrite routines; poll and mmap use the no-op stubs.
 */
static struct cdevsw ccd_cdevsw = {
	/* name */	"ccd",
	/* maj */	CDEV_MAJOR,
	/* flags */	D_DISK,
	/* port */      NULL,
	/* clone */	NULL,

	/* open */	ccdopen,
	/* close */	ccdclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	ccdioctl,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	ccdstrategy,
	/* dump */	ccddump,
	/* psize */	ccdsize
};
197 
/* called during module initialization */
static	void ccdattach (void);
static	int ccd_modevent (module_t, int, void *);

/* called by biodone() at interrupt time */
static	void ccdiodone (struct bio *bio);

/* internal helpers */
static	void ccdstart (struct ccd_softc *, struct bio *);
static	void ccdinterleave (struct ccd_softc *, int);
static	void ccdintr (struct ccd_softc *, struct bio *);
static	int ccdinit (struct ccddevice *, char **, struct thread *);
static	int ccdlookup (char *, struct thread *td, struct vnode **);
static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
		struct bio *, off_t, caddr_t, long);
static	void ccdgetdisklabel (dev_t);
static	void ccdmakedisklabel (struct ccd_softc *);
static	int ccdlock (struct ccd_softc *);
static	void ccdunlock (struct ccd_softc *);

#ifdef DEBUG
static	void printiinfo (struct ccdiinfo *);
#endif

/*
 * Driver-wide state.  ccd_softc and ccddevs are arrays of numccd entries
 * allocated by ccdattach(); ccdfreebufs heads the list of idle ccdbufs
 * and numccdfreebufs counts the entries on it.
 */
/* Non-private for the benefit of libkvm. */
struct	ccd_softc *ccd_softc;
struct	ccddevice *ccddevs;
struct	ccdbuf *ccdfreebufs;
static	int numccdfreebufs;
static	int numccd = 0;
227 
228 /*
229  * getccdbuf() -	Allocate and zero a ccd buffer.
230  *
231  *	This routine is called at splbio().
232  */
233 
234 static __inline
235 struct ccdbuf *
236 getccdbuf(void)
237 {
238 	struct ccdbuf *cbp;
239 
240 	/*
241 	 * Allocate from freelist or malloc as necessary
242 	 */
243 	if ((cbp = ccdfreebufs) != NULL) {
244 		ccdfreebufs = cbp->cb_freenext;
245 		--numccdfreebufs;
246 		reinitbufbio(&cbp->cb_buf);
247 	} else {
248 		cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
249 		initbufbio(&cbp->cb_buf);
250 	}
251 
252 	/*
253 	 * independant struct buf initialization
254 	 */
255 	LIST_INIT(&cbp->cb_buf.b_dep);
256 	BUF_LOCKINIT(&cbp->cb_buf);
257 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
258 	BUF_KERNPROC(&cbp->cb_buf);
259 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
260 
261 	return(cbp);
262 }
263 
264 /*
265  * putccdbuf() -	Free a ccd buffer.
266  *
267  *	This routine is called at splbio().
268  */
269 
270 static __inline
271 void
272 putccdbuf(struct ccdbuf *cbp)
273 {
274 	BUF_UNLOCK(&cbp->cb_buf);
275 	BUF_LOCKFREE(&cbp->cb_buf);
276 
277 	if (numccdfreebufs < NCCDFREEHIWAT) {
278 		cbp->cb_freenext = ccdfreebufs;
279 		ccdfreebufs = cbp;
280 		++numccdfreebufs;
281 	} else {
282 		free((caddr_t)cbp, M_DEVBUF);
283 	}
284 }
285 
286 
/*
 * Number of blocks to leave untouched at the front of a component
 * partition.  This is to avoid violating its disklabel area when it
 * starts at the beginning of the slice.  May be overridden at build time.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif
295 
296 /*
297  * Called by main() during pseudo-device attachment.  All we need
298  * to do is allocate enough space for devices to be configured later, and
299  * add devsw entries.
300  */
301 static void
302 ccdattach(void)
303 {
304 	int i;
305 	int num = NCCD;
306 
307 	if (num > 1)
308 		printf("ccd0-%d: Concatenated disk drivers\n", num-1);
309 	else
310 		printf("ccd0: Concatenated disk driver\n");
311 
312 	ccd_softc = malloc(num * sizeof(struct ccd_softc), M_DEVBUF,
313 			    M_WAITOK | M_ZERO);
314 	ccddevs = malloc(num * sizeof(struct ccddevice), M_DEVBUF,
315 			    M_WAITOK | M_ZERO);
316 	numccd = num;
317 
318 	cdevsw_add(&ccd_cdevsw, 0, 0);
319 	/* XXX: is this necessary? */
320 	for (i = 0; i < numccd; ++i)
321 		ccddevs[i].ccd_dk = -1;
322 }
323 
324 static int
325 ccd_modevent(module_t mod, int type, void *data)
326 {
327 	int error = 0;
328 
329 	switch (type) {
330 	case MOD_LOAD:
331 		ccdattach();
332 		break;
333 
334 	case MOD_UNLOAD:
335 		printf("ccd0: Unload not supported!\n");
336 		error = EOPNOTSUPP;
337 		break;
338 
339 	default:	/* MOD_SHUTDOWN etc */
340 		break;
341 	}
342 	return (error);
343 }
344 
345 DEV_MODULE(ccd, ccd_modevent, NULL);
346 
347 static int
348 ccdinit(struct ccddevice *ccd, char **cpaths, struct thread *td)
349 {
350 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
351 	struct ccdcinfo *ci = NULL;	/* XXX */
352 	size_t size;
353 	int ix;
354 	struct vnode *vp;
355 	size_t minsize;
356 	int maxsecsize;
357 	struct partinfo dpart;
358 	struct ccdgeom *ccg = &cs->sc_geom;
359 	char tmppath[MAXPATHLEN];
360 	int error = 0;
361 	struct ucred *cred;
362 
363 	KKASSERT(td->td_proc);
364 	cred = td->td_proc->p_ucred;
365 
366 #ifdef DEBUG
367 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
368 		printf("ccdinit: unit %d\n", ccd->ccd_unit);
369 #endif
370 
371 	cs->sc_size = 0;
372 	cs->sc_ileave = ccd->ccd_interleave;
373 	cs->sc_nccdisks = ccd->ccd_ndev;
374 
375 	/* Allocate space for the component info. */
376 	cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
377 	    M_DEVBUF, M_WAITOK);
378 
379 	/*
380 	 * Verify that each component piece exists and record
381 	 * relevant information about it.
382 	 */
383 	maxsecsize = 0;
384 	minsize = 0;
385 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
386 		vp = ccd->ccd_vpp[ix];
387 		ci = &cs->sc_cinfo[ix];
388 		ci->ci_vp = vp;
389 
390 		/*
391 		 * Copy in the pathname of the component.
392 		 */
393 		bzero(tmppath, sizeof(tmppath));	/* sanity */
394 		if ((error = copyinstr(cpaths[ix], tmppath,
395 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
396 #ifdef DEBUG
397 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
398 				printf("ccd%d: can't copy path, error = %d\n",
399 				    ccd->ccd_unit, error);
400 #endif
401 			goto fail;
402 		}
403 		ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
404 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
405 
406 		ci->ci_dev = vn_todev(vp);
407 
408 		/*
409 		 * Get partition information for the component.
410 		 */
411 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
412 				       FREAD, cred)) != 0) {
413 #ifdef DEBUG
414 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
415 				 printf("ccd%d: %s: ioctl failed, error = %d\n",
416 				     ccd->ccd_unit, ci->ci_path, error);
417 #endif
418 			goto fail;
419 		}
420 		if (dpart.part->p_fstype == FS_BSDFFS) {
421 			maxsecsize =
422 			    ((dpart.disklab->d_secsize > maxsecsize) ?
423 			    dpart.disklab->d_secsize : maxsecsize);
424 			size = dpart.part->p_size - CCD_OFFSET;
425 		} else {
426 #ifdef DEBUG
427 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
428 				printf("ccd%d: %s: incorrect partition type\n",
429 				    ccd->ccd_unit, ci->ci_path);
430 #endif
431 			error = EFTYPE;
432 			goto fail;
433 		}
434 
435 		/*
436 		 * Calculate the size, truncating to an interleave
437 		 * boundary if necessary.
438 		 */
439 
440 		if (cs->sc_ileave > 1)
441 			size -= size % cs->sc_ileave;
442 
443 		if (size == 0) {
444 #ifdef DEBUG
445 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
446 				printf("ccd%d: %s: size == 0\n",
447 				    ccd->ccd_unit, ci->ci_path);
448 #endif
449 			error = ENODEV;
450 			goto fail;
451 		}
452 
453 		if (minsize == 0 || size < minsize)
454 			minsize = size;
455 		ci->ci_size = size;
456 		cs->sc_size += size;
457 	}
458 
459 	/*
460 	 * Don't allow the interleave to be smaller than
461 	 * the biggest component sector.
462 	 */
463 	if ((cs->sc_ileave > 0) &&
464 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
465 #ifdef DEBUG
466 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
467 			printf("ccd%d: interleave must be at least %d\n",
468 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
469 #endif
470 		error = EINVAL;
471 		goto fail;
472 	}
473 
474 	/*
475 	 * If uniform interleave is desired set all sizes to that of
476 	 * the smallest component.  This will guarentee that a single
477 	 * interleave table is generated.
478 	 *
479 	 * Lost space must be taken into account when calculating the
480 	 * overall size.  Half the space is lost when CCDF_MIRROR is
481 	 * specified.  One disk is lost when CCDF_PARITY is specified.
482 	 */
483 	if (ccd->ccd_flags & CCDF_UNIFORM) {
484 		for (ci = cs->sc_cinfo;
485 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
486 			ci->ci_size = minsize;
487 		}
488 		if (ccd->ccd_flags & CCDF_MIRROR) {
489 			/*
490 			 * Check to see if an even number of components
491 			 * have been specified.  The interleave must also
492 			 * be non-zero in order for us to be able to
493 			 * guarentee the topology.
494 			 */
495 			if (cs->sc_nccdisks % 2) {
496 				printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
497 				error = EINVAL;
498 				goto fail;
499 			}
500 			if (cs->sc_ileave == 0) {
501 				printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
502 				error = EINVAL;
503 				goto fail;
504 			}
505 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
506 		} else if (ccd->ccd_flags & CCDF_PARITY) {
507 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
508 		} else {
509 			if (cs->sc_ileave == 0) {
510 				printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
511 				error = EINVAL;
512 				goto fail;
513 			}
514 			cs->sc_size = cs->sc_nccdisks * minsize;
515 		}
516 	}
517 
518 	/*
519 	 * Construct the interleave table.
520 	 */
521 	ccdinterleave(cs, ccd->ccd_unit);
522 
523 	/*
524 	 * Create pseudo-geometry based on 1MB cylinders.  It's
525 	 * pretty close.
526 	 */
527 	ccg->ccg_secsize = maxsecsize;
528 	ccg->ccg_ntracks = 1;
529 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
530 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
531 
532 	/*
533 	 * Add an devstat entry for this device.
534 	 */
535 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
536 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
537 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
538 			  DEVSTAT_PRIORITY_ARRAY);
539 
540 	cs->sc_flags |= CCDF_INITED;
541 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
542 	cs->sc_unit = ccd->ccd_unit;
543 	return (0);
544 fail:
545 	while (ci > cs->sc_cinfo) {
546 		ci--;
547 		free(ci->ci_path, M_DEVBUF);
548 	}
549 	free(cs->sc_cinfo, M_DEVBUF);
550 	return (error);
551 }
552 
553 static void
554 ccdinterleave(struct ccd_softc *cs, int unit)
555 {
556 	struct ccdcinfo *ci, *smallci;
557 	struct ccdiinfo *ii;
558 	daddr_t bn, lbn;
559 	int ix;
560 	u_long size;
561 
562 #ifdef DEBUG
563 	if (ccddebug & CCDB_INIT)
564 		printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
565 #endif
566 
567 	/*
568 	 * Allocate an interleave table.  The worst case occurs when each
569 	 * of N disks is of a different size, resulting in N interleave
570 	 * tables.
571 	 *
572 	 * Chances are this is too big, but we don't care.
573 	 */
574 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
575 	cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
576 	bzero((caddr_t)cs->sc_itable, size);
577 
578 	/*
579 	 * Trivial case: no interleave (actually interleave of disk size).
580 	 * Each table entry represents a single component in its entirety.
581 	 *
582 	 * An interleave of 0 may not be used with a mirror or parity setup.
583 	 */
584 	if (cs->sc_ileave == 0) {
585 		bn = 0;
586 		ii = cs->sc_itable;
587 
588 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
589 			/* Allocate space for ii_index. */
590 			ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
591 			ii->ii_ndisk = 1;
592 			ii->ii_startblk = bn;
593 			ii->ii_startoff = 0;
594 			ii->ii_index[0] = ix;
595 			bn += cs->sc_cinfo[ix].ci_size;
596 			ii++;
597 		}
598 		ii->ii_ndisk = 0;
599 #ifdef DEBUG
600 		if (ccddebug & CCDB_INIT)
601 			printiinfo(cs->sc_itable);
602 #endif
603 		return;
604 	}
605 
606 	/*
607 	 * The following isn't fast or pretty; it doesn't have to be.
608 	 */
609 	size = 0;
610 	bn = lbn = 0;
611 	for (ii = cs->sc_itable; ; ii++) {
612 		/*
613 		 * Allocate space for ii_index.  We might allocate more then
614 		 * we use.
615 		 */
616 		ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
617 		    M_DEVBUF, M_WAITOK);
618 
619 		/*
620 		 * Locate the smallest of the remaining components
621 		 */
622 		smallci = NULL;
623 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
624 		    ci++) {
625 			if (ci->ci_size > size &&
626 			    (smallci == NULL ||
627 			     ci->ci_size < smallci->ci_size)) {
628 				smallci = ci;
629 			}
630 		}
631 
632 		/*
633 		 * Nobody left, all done
634 		 */
635 		if (smallci == NULL) {
636 			ii->ii_ndisk = 0;
637 			break;
638 		}
639 
640 		/*
641 		 * Record starting logical block using an sc_ileave blocksize.
642 		 */
643 		ii->ii_startblk = bn / cs->sc_ileave;
644 
645 		/*
646 		 * Record starting comopnent block using an sc_ileave
647 		 * blocksize.  This value is relative to the beginning of
648 		 * a component disk.
649 		 */
650 		ii->ii_startoff = lbn;
651 
652 		/*
653 		 * Determine how many disks take part in this interleave
654 		 * and record their indices.
655 		 */
656 		ix = 0;
657 		for (ci = cs->sc_cinfo;
658 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
659 			if (ci->ci_size >= smallci->ci_size) {
660 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
661 			}
662 		}
663 		ii->ii_ndisk = ix;
664 		bn += ix * (smallci->ci_size - size);
665 		lbn = smallci->ci_size / cs->sc_ileave;
666 		size = smallci->ci_size;
667 	}
668 #ifdef DEBUG
669 	if (ccddebug & CCDB_INIT)
670 		printiinfo(cs->sc_itable);
671 #endif
672 }
673 
674 /* ARGSUSED */
675 static int
676 ccdopen(dev_t dev, int flags, int fmt, d_thread_t *td)
677 {
678 	int unit = ccdunit(dev);
679 	struct ccd_softc *cs;
680 	struct disklabel *lp;
681 	int error = 0, part, pmask;
682 
683 #ifdef DEBUG
684 	if (ccddebug & CCDB_FOLLOW)
685 		printf("ccdopen(%x, %x)\n", dev, flags);
686 #endif
687 	if (unit >= numccd)
688 		return (ENXIO);
689 	cs = &ccd_softc[unit];
690 
691 	if ((error = ccdlock(cs)) != 0)
692 		return (error);
693 
694 	lp = &cs->sc_label;
695 
696 	part = ccdpart(dev);
697 	pmask = (1 << part);
698 
699 	/*
700 	 * If we're initialized, check to see if there are any other
701 	 * open partitions.  If not, then it's safe to update
702 	 * the in-core disklabel.
703 	 */
704 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
705 		ccdgetdisklabel(dev);
706 
707 	/* Check that the partition exists. */
708 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
709 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
710 		error = ENXIO;
711 		goto done;
712 	}
713 
714 	cs->sc_openmask |= pmask;
715  done:
716 	ccdunlock(cs);
717 	return (0);
718 }
719 
720 /* ARGSUSED */
721 static int
722 ccdclose(dev_t dev, int flags, int fmt, d_thread_t *td)
723 {
724 	int unit = ccdunit(dev);
725 	struct ccd_softc *cs;
726 	int error = 0, part;
727 
728 #ifdef DEBUG
729 	if (ccddebug & CCDB_FOLLOW)
730 		printf("ccdclose(%x, %x)\n", dev, flags);
731 #endif
732 
733 	if (unit >= numccd)
734 		return (ENXIO);
735 	cs = &ccd_softc[unit];
736 
737 	if ((error = ccdlock(cs)) != 0)
738 		return (error);
739 
740 	part = ccdpart(dev);
741 
742 	/* ...that much closer to allowing unconfiguration... */
743 	cs->sc_openmask &= ~(1 << part);
744 	ccdunlock(cs);
745 	return (0);
746 }
747 
748 static void
749 ccdstrategy(dev_t dev, struct bio *bio)
750 {
751 	int unit = ccdunit(dev);
752 	struct bio *nbio;
753 	struct buf *bp = bio->bio_buf;
754 	struct ccd_softc *cs = &ccd_softc[unit];
755 	int wlabel;
756 	struct disklabel *lp;
757 
758 #ifdef DEBUG
759 	if (ccddebug & CCDB_FOLLOW)
760 		printf("ccdstrategy(%x): unit %d\n", bp, unit);
761 #endif
762 	if ((cs->sc_flags & CCDF_INITED) == 0) {
763 		bp->b_error = ENXIO;
764 		goto error;
765 	}
766 
767 	/* If it's a nil transfer, wake up the top half now. */
768 	if (bp->b_bcount == 0) {
769 		bp->b_resid = 0;
770 		goto done;
771 	}
772 
773 	lp = &cs->sc_label;
774 
775 	/*
776 	 * Do bounds checking and adjust transfer.  If there's an
777 	 * error, the bounds check will flag that for us.
778 	 */
779 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
780 	if (ccdpart(dev) != RAW_PART) {
781 		nbio = bounds_check_with_label(dev, bio, lp, wlabel);
782 		if (nbio == NULL)
783 			goto done;
784 	} else {
785 		int pbn;        /* in sc_secsize chunks */
786 		long sz;        /* in sc_secsize chunks */
787 
788 		pbn = (int)(bio->bio_offset / cs->sc_geom.ccg_secsize);
789 		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
790 
791 		/*
792 		 * If out of bounds return an error.  If the request goes
793 		 * past EOF, clip the request as appropriate.  If exactly
794 		 * at EOF, return success (don't clip), but with 0 bytes
795 		 * of I/O.
796 		 *
797 		 * Mark EOF B_INVAL (just like bad), indicating that the
798 		 * contents of the buffer, if any, is invalid.
799 		 */
800 		if (pbn < 0)
801 			goto bad;
802 		if (pbn + sz > cs->sc_size) {
803 			if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
804 				goto bad;
805 			if (pbn == cs->sc_size) {
806 				bp->b_resid = bp->b_bcount;
807 				bp->b_flags |= B_INVAL;
808 				goto done;
809 			}
810 			sz = cs->sc_size - pbn;
811 			bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
812 		}
813 		nbio = bio;
814 	}
815 
816 	bp->b_resid = bp->b_bcount;
817 	nbio->bio_driver_info = dev;
818 
819 	/*
820 	 * "Start" the unit.
821 	 */
822 	crit_enter();
823 	ccdstart(cs, nbio);
824 	crit_exit();
825 	return;
826 
827 	/*
828 	 * note: bio, not nbio, is valid at the done label.
829 	 */
830 bad:
831 	bp->b_error = EINVAL;
832 error:
833 	bp->b_resid = bp->b_bcount;
834 	bp->b_flags |= B_ERROR | B_INVAL;
835 done:
836 	biodone(bio);
837 }
838 
839 static void
840 ccdstart(struct ccd_softc *cs, struct bio *bio)
841 {
842 	long bcount, rcount;
843 	struct ccdbuf *cbp[4];
844 	struct buf *bp = bio->bio_buf;
845 	dev_t dev = bio->bio_driver_info;
846 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
847 	caddr_t addr;
848 	off_t doffset;
849 	struct partition *pp;
850 
851 #ifdef DEBUG
852 	if (ccddebug & CCDB_FOLLOW)
853 		printf("ccdstart(%x, %x)\n", cs, bp);
854 #endif
855 
856 	/* Record the transaction start  */
857 	devstat_start_transaction(&cs->device_stats);
858 
859 	/*
860 	 * Translate the partition-relative block number to an absolute.
861 	 */
862 	doffset = bio->bio_offset;
863 	if (ccdpart(dev) != RAW_PART) {
864 		pp = &cs->sc_label.d_partitions[ccdpart(dev)];
865 		doffset += pp->p_offset * cs->sc_label.d_secsize;
866 	}
867 
868 	/*
869 	 * Allocate component buffers and fire off the requests
870 	 */
871 	addr = bp->b_data;
872 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
873 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
874 		rcount = cbp[0]->cb_buf.b_bcount;
875 
876 		if (cs->sc_cflags & CCDF_MIRROR) {
877 			/*
878 			 * Mirroring.  Writes go to both disks, reads are
879 			 * taken from whichever disk seems most appropriate.
880 			 *
881 			 * We attempt to localize reads to the disk whos arm
882 			 * is nearest the read request.  We ignore seeks due
883 			 * to writes when making this determination and we
884 			 * also try to avoid hogging.
885 			 */
886 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
887 				vn_strategy(cbp[0]->cb_vp,
888 					    &cbp[0]->cb_buf.b_bio1);
889 				vn_strategy(cbp[1]->cb_vp,
890 					    &cbp[1]->cb_buf.b_bio1);
891 			} else {
892 				int pick = cs->sc_pick;
893 				daddr_t range = cs->sc_size / 16 * cs->sc_label.d_secsize;
894 
895 				if (doffset < cs->sc_blk[pick] - range ||
896 				    doffset > cs->sc_blk[pick] + range
897 				) {
898 					cs->sc_pick = pick = 1 - pick;
899 				}
900 				cs->sc_blk[pick] = doffset + rcount;
901 				vn_strategy(cbp[pick]->cb_vp,
902 					    &cbp[pick]->cb_buf.b_bio1);
903 			}
904 		} else {
905 			/*
906 			 * Not mirroring
907 			 */
908 			vn_strategy(cbp[0]->cb_vp,
909 				     &cbp[0]->cb_buf.b_bio1);
910 		}
911 		doffset += rcount;
912 		addr += rcount;
913 	}
914 }
915 
916 /*
917  * Build a component buffer header.
918  */
919 static void
920 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
921 	  off_t doffset, caddr_t addr, long bcount)
922 {
923 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
924 	struct ccdbuf *cbp;
925 	daddr_t bn, cbn, cboff;
926 	off_t cbc;
927 
928 #ifdef DEBUG
929 	if (ccddebug & CCDB_IO)
930 		printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
931 		       cs, bp, bn, addr, bcount);
932 #endif
933 	/*
934 	 * Determine which component bn falls in.
935 	 */
936 	bn = (daddr_t)(doffset / cs->sc_geom.ccg_secsize);
937 	cbn = bn;
938 	cboff = 0;
939 
940 	if (cs->sc_ileave == 0) {
941 		/*
942 		 * Serially concatenated and neither a mirror nor a parity
943 		 * config.  This is a special case.
944 		 */
945 		daddr_t sblk;
946 
947 		sblk = 0;
948 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
949 			sblk += ci->ci_size;
950 		cbn -= sblk;
951 	} else {
952 		struct ccdiinfo *ii;
953 		int ccdisk, off;
954 
955 		/*
956 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
957 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
958 		 * to cbn.
959 		 */
960 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
961 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
962 
963 		/*
964 		 * Figure out which interleave table to use.
965 		 */
966 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
967 			if (ii->ii_startblk > cbn)
968 				break;
969 		}
970 		ii--;
971 
972 		/*
973 		 * off is the logical superblock relative to the beginning
974 		 * of this interleave block.
975 		 */
976 		off = cbn - ii->ii_startblk;
977 
978 		/*
979 		 * We must calculate which disk component to use (ccdisk),
980 		 * and recalculate cbn to be the superblock relative to
981 		 * the beginning of the component.  This is typically done by
982 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
983 		 * must typically be divided by the number of components in
984 		 * this interleave array to be properly convert it from a
985 		 * CCD-relative logical superblock number to a
986 		 * component-relative superblock number.
987 		 */
988 		if (ii->ii_ndisk == 1) {
989 			/*
990 			 * When we have just one disk, it can't be a mirror
991 			 * or a parity config.
992 			 */
993 			ccdisk = ii->ii_index[0];
994 			cbn = ii->ii_startoff + off;
995 		} else {
996 			if (cs->sc_cflags & CCDF_MIRROR) {
997 				/*
998 				 * We have forced a uniform mapping, resulting
999 				 * in a single interleave array.  We double
1000 				 * up on the first half of the available
1001 				 * components and our mirror is in the second
1002 				 * half.  This only works with a single
1003 				 * interleave array because doubling up
1004 				 * doubles the number of sectors, so there
1005 				 * cannot be another interleave array because
1006 				 * the next interleave array's calculations
1007 				 * would be off.
1008 				 */
1009 				int ndisk2 = ii->ii_ndisk / 2;
1010 				ccdisk = ii->ii_index[off % ndisk2];
1011 				cbn = ii->ii_startoff + off / ndisk2;
1012 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1013 			} else if (cs->sc_cflags & CCDF_PARITY) {
1014 				/*
1015 				 * XXX not implemented yet
1016 				 */
1017 				int ndisk2 = ii->ii_ndisk - 1;
1018 				ccdisk = ii->ii_index[off % ndisk2];
1019 				cbn = ii->ii_startoff + off / ndisk2;
1020 				if (cbn % ii->ii_ndisk <= ccdisk)
1021 					ccdisk++;
1022 			} else {
1023 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1024 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1025 			}
1026 		}
1027 
1028 		ci = &cs->sc_cinfo[ccdisk];
1029 
1030 		/*
1031 		 * Convert cbn from a superblock to a normal block so it
1032 		 * can be used to calculate (along with cboff) the normal
1033 		 * block index into this particular disk.
1034 		 */
1035 		cbn *= cs->sc_ileave;
1036 	}
1037 
1038 	/*
1039 	 * Fill in the component buf structure.
1040 	 *
1041 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1042 	 * will be truncated on device EOF so we use b_bufsize to detect
1043 	 * the case.
1044 	 */
1045 	cbp = getccdbuf();
1046 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1047 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1048 	cbp->cb_buf.b_data = addr;
1049 	cbp->cb_vp = ci->ci_vp;
1050 	if (cs->sc_ileave == 0)
1051               cbc = dbtob((off_t)(ci->ci_size - cbn));
1052 	else
1053               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1054 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1055  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1056 
1057 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1058 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1059 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1060 
1061 	/*
1062 	 * context for ccdiodone
1063 	 */
1064 	cbp->cb_obio = bio;
1065 	cbp->cb_unit = cs - ccd_softc;
1066 	cbp->cb_comp = ci - cs->sc_cinfo;
1067 
1068 #ifdef DEBUG
1069 	if (ccddebug & CCDB_IO)
1070 		printf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1071 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1072 		       cbp->cb_buf.b_bio1.bio_offset,
1073 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1074 #endif
1075 	cb[0] = cbp;
1076 
1077 	/*
1078 	 * Note: both I/O's setup when reading from mirror, but only one
1079 	 * will be executed.
1080 	 */
1081 	if (cs->sc_cflags & CCDF_MIRROR) {
1082 		/* mirror, setup second I/O */
1083 		cbp = getccdbuf();
1084 
1085 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1086 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1087 		cbp->cb_buf.b_data = addr;
1088 		cbp->cb_vp = ci2->ci_vp;
1089 		if (cs->sc_ileave == 0)
1090 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1091 		else
1092 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1093 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1094 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1095 
1096 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1097 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1098 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1099 
1100 		/*
1101 		 * context for ccdiodone
1102 		 */
1103 		cbp->cb_obio = bio;
1104 		cbp->cb_unit = cs - ccd_softc;
1105 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1106 		cb[1] = cbp;
1107 		/* link together the ccdbuf's and clear "mirror done" flag */
1108 		cb[0]->cb_mirror = cb[1];
1109 		cb[1]->cb_mirror = cb[0];
1110 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1111 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1112 	}
1113 }
1114 
1115 static void
1116 ccdintr(struct ccd_softc *cs, struct bio *bio)
1117 {
1118 	struct buf *bp = bio->bio_buf;
1119 
1120 #ifdef DEBUG
1121 	if (ccddebug & CCDB_FOLLOW)
1122 		printf("ccdintr(%x, %x)\n", cs, bp);
1123 #endif
1124 	/*
1125 	 * Request is done for better or worse, wakeup the top half.
1126 	 */
1127 	if (bp->b_flags & B_ERROR)
1128 		bp->b_resid = bp->b_bcount;
1129 	devstat_end_transaction_buf(&cs->device_stats, bp);
1130 	biodone(bio);
1131 }
1132 
1133 /*
1134  * Called at interrupt time.
1135  * Mark the component as done and if all components are done,
1136  * take a ccd interrupt.
1137  */
1138 static void
1139 ccdiodone(struct bio *bio)
1140 {
1141 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1142 	struct bio *obio = cbp->cb_obio;
1143 	struct buf *obp = obio->bio_buf;
1144 	int unit = cbp->cb_unit;
1145 	int count;
1146 
1147 	/*
1148 	 * Since we do not have exclusive access to underlying devices,
1149 	 * we can't keep cache translations around.
1150 	 */
1151 	clearbiocache(bio->bio_next);
1152 
1153 	crit_enter();
1154 #ifdef DEBUG
1155 	if (ccddebug & CCDB_FOLLOW)
1156 		printf("ccdiodone(%x)\n", cbp);
1157 	if (ccddebug & CCDB_IO) {
1158 		printf("ccdiodone: bp %x bcount %d resid %d\n",
1159 		       obp, obp->b_bcount, obp->b_resid);
1160 		printf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1161 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1162 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1163 		       cbp->cb_buf.b_bcount);
1164 	}
1165 #endif
1166 
1167 	/*
1168 	 * If an error occured, report it.  If this is a mirrored
1169 	 * configuration and the first of two possible reads, do not
1170 	 * set the error in the bp yet because the second read may
1171 	 * succeed.
1172 	 */
1173 	if (cbp->cb_buf.b_flags & B_ERROR) {
1174 		const char *msg = "";
1175 
1176 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1177 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1178 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1179 			/*
1180 			 * We will try our read on the other disk down
1181 			 * below, also reverse the default pick so if we
1182 			 * are doing a scan we do not keep hitting the
1183 			 * bad disk first.
1184 			 */
1185 			struct ccd_softc *cs = &ccd_softc[unit];
1186 
1187 			msg = ", trying other disk";
1188 			cs->sc_pick = 1 - cs->sc_pick;
1189 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1190 		} else {
1191 			obp->b_flags |= B_ERROR;
1192 			obp->b_error = cbp->cb_buf.b_error ?
1193 			    cbp->cb_buf.b_error : EIO;
1194 		}
1195 		printf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1196 		       unit, obp->b_error, cbp->cb_comp,
1197 		       cbp->cb_buf.b_bio2.bio_offset,
1198 		       obio->bio_offset, msg);
1199 	}
1200 
1201 	/*
1202 	 * Process mirror.  If we are writing, I/O has been initiated on both
1203 	 * buffers and we fall through only after both are finished.
1204 	 *
1205 	 * If we are reading only one I/O is initiated at a time.  If an
1206 	 * error occurs we initiate the second I/O and return, otherwise
1207 	 * we free the second I/O without initiating it.
1208 	 */
1209 
1210 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1211 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1212 			/*
1213 			 * When writing, handshake with the second buffer
1214 			 * to determine when both are done.  If both are not
1215 			 * done, return here.
1216 			 */
1217 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1218 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1219 				putccdbuf(cbp);
1220 				crit_exit();
1221 				return;
1222 			}
1223 		} else {
1224 			/*
1225 			 * When reading, either dispose of the second buffer
1226 			 * or initiate I/O on the second buffer if an error
1227 			 * occured with this one.
1228 			 */
1229 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1230 				if (cbp->cb_buf.b_flags & B_ERROR) {
1231 					cbp->cb_mirror->cb_pflags |=
1232 					    CCDPF_MIRROR_DONE;
1233 					vn_strategy(
1234 					    cbp->cb_mirror->cb_vp,
1235 					    &cbp->cb_mirror->cb_buf.b_bio1
1236 					);
1237 					putccdbuf(cbp);
1238 					crit_exit();
1239 					return;
1240 				} else {
1241 					putccdbuf(cbp->cb_mirror);
1242 					/* fall through */
1243 				}
1244 			}
1245 		}
1246 	}
1247 
1248 	/*
1249 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1250 	 */
1251 	count = cbp->cb_buf.b_bufsize;
1252 	putccdbuf(cbp);
1253 
1254 	/*
1255 	 * If all done, "interrupt".
1256 	 */
1257 	obp->b_resid -= count;
1258 	if (obp->b_resid < 0)
1259 		panic("ccdiodone: count");
1260 	if (obp->b_resid == 0)
1261 		ccdintr(&ccd_softc[unit], obio);
1262 	crit_exit();
1263 }
1264 
1265 static int
1266 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, d_thread_t *td)
1267 {
1268 	int unit = ccdunit(dev);
1269 	int i, j, lookedup = 0, error = 0;
1270 	int part, pmask;
1271 	struct ccd_softc *cs;
1272 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1273 	struct ccddevice ccd;
1274 	char **cpp;
1275 	struct vnode **vpp;
1276 	struct ucred *cred;
1277 
1278 	KKASSERT(td->td_proc != NULL);
1279 	cred = td->td_proc->p_ucred;
1280 
1281 	if (unit >= numccd)
1282 		return (ENXIO);
1283 	cs = &ccd_softc[unit];
1284 
1285 	bzero(&ccd, sizeof(ccd));
1286 
1287 	switch (cmd) {
1288 	case CCDIOCSET:
1289 		if (cs->sc_flags & CCDF_INITED)
1290 			return (EBUSY);
1291 
1292 		if ((flag & FWRITE) == 0)
1293 			return (EBADF);
1294 
1295 		if ((error = ccdlock(cs)) != 0)
1296 			return (error);
1297 
1298 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1299 			return (EINVAL);
1300 
1301 		/* Fill in some important bits. */
1302 		ccd.ccd_unit = unit;
1303 		ccd.ccd_interleave = ccio->ccio_ileave;
1304 		if (ccd.ccd_interleave == 0 &&
1305 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1306 		     (ccio->ccio_flags & CCDF_PARITY))) {
1307 			printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1308 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1309 		}
1310 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1311 		    (ccio->ccio_flags & CCDF_PARITY)) {
1312 			printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1313 			ccio->ccio_flags &= ~CCDF_PARITY;
1314 		}
1315 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1316 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1317 			printf("ccd%d: mirror/parity forces uniform flag\n",
1318 			       unit);
1319 			ccio->ccio_flags |= CCDF_UNIFORM;
1320 		}
1321 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1322 
1323 		/*
1324 		 * Allocate space for and copy in the array of
1325 		 * componet pathnames and device numbers.
1326 		 */
1327 		cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1328 		    M_DEVBUF, M_WAITOK);
1329 		vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1330 		    M_DEVBUF, M_WAITOK);
1331 
1332 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1333 		    ccio->ccio_ndisks * sizeof(char **));
1334 		if (error) {
1335 			free(vpp, M_DEVBUF);
1336 			free(cpp, M_DEVBUF);
1337 			ccdunlock(cs);
1338 			return (error);
1339 		}
1340 
1341 #ifdef DEBUG
1342 		if (ccddebug & CCDB_INIT)
1343 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1344 				printf("ccdioctl: component %d: 0x%x\n",
1345 				    i, cpp[i]);
1346 #endif
1347 
1348 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1349 #ifdef DEBUG
1350 			if (ccddebug & CCDB_INIT)
1351 				printf("ccdioctl: lookedup = %d\n", lookedup);
1352 #endif
1353 			if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1354 				for (j = 0; j < lookedup; ++j)
1355 					(void)vn_close(vpp[j], FREAD|FWRITE);
1356 				free(vpp, M_DEVBUF);
1357 				free(cpp, M_DEVBUF);
1358 				ccdunlock(cs);
1359 				return (error);
1360 			}
1361 			++lookedup;
1362 		}
1363 		ccd.ccd_cpp = cpp;
1364 		ccd.ccd_vpp = vpp;
1365 		ccd.ccd_ndev = ccio->ccio_ndisks;
1366 
1367 		/*
1368 		 * Initialize the ccd.  Fills in the softc for us.
1369 		 */
1370 		if ((error = ccdinit(&ccd, cpp, td)) != 0) {
1371 			for (j = 0; j < lookedup; ++j)
1372 				(void)vn_close(vpp[j], FREAD|FWRITE);
1373 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1374 			free(vpp, M_DEVBUF);
1375 			free(cpp, M_DEVBUF);
1376 			ccdunlock(cs);
1377 			return (error);
1378 		}
1379 
1380 		/*
1381 		 * The ccd has been successfully initialized, so
1382 		 * we can place it into the array and read the disklabel.
1383 		 */
1384 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1385 		ccio->ccio_unit = unit;
1386 		ccio->ccio_size = cs->sc_size;
1387 		ccdgetdisklabel(dev);
1388 
1389 		ccdunlock(cs);
1390 
1391 		break;
1392 
1393 	case CCDIOCCLR:
1394 		if ((cs->sc_flags & CCDF_INITED) == 0)
1395 			return (ENXIO);
1396 
1397 		if ((flag & FWRITE) == 0)
1398 			return (EBADF);
1399 
1400 		if ((error = ccdlock(cs)) != 0)
1401 			return (error);
1402 
1403 		/* Don't unconfigure if any other partitions are open */
1404 		part = ccdpart(dev);
1405 		pmask = (1 << part);
1406 		if ((cs->sc_openmask & ~pmask)) {
1407 			ccdunlock(cs);
1408 			return (EBUSY);
1409 		}
1410 
1411 		/*
1412 		 * Free ccd_softc information and clear entry.
1413 		 */
1414 
1415 		/* Close the components and free their pathnames. */
1416 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1417 			/*
1418 			 * XXX: this close could potentially fail and
1419 			 * cause Bad Things.  Maybe we need to force
1420 			 * the close to happen?
1421 			 */
1422 #ifdef DEBUG
1423 			if (ccddebug & CCDB_VNODE)
1424 				vprint("CCDIOCCLR: vnode info",
1425 				    cs->sc_cinfo[i].ci_vp);
1426 #endif
1427 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1428 			free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1429 		}
1430 
1431 		/* Free interleave index. */
1432 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1433 			free(cs->sc_itable[i].ii_index, M_DEVBUF);
1434 
1435 		/* Free component info and interleave table. */
1436 		free(cs->sc_cinfo, M_DEVBUF);
1437 		free(cs->sc_itable, M_DEVBUF);
1438 		cs->sc_flags &= ~CCDF_INITED;
1439 
1440 		/*
1441 		 * Free ccddevice information and clear entry.
1442 		 */
1443 		free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1444 		free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1445 		ccd.ccd_dk = -1;
1446 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1447 
1448 		/*
1449 		 * And remove the devstat entry.
1450 		 */
1451 		devstat_remove_entry(&cs->device_stats);
1452 
1453 		/* This must be atomic. */
1454 		crit_enter();
1455 		ccdunlock(cs);
1456 		bzero(cs, sizeof(struct ccd_softc));
1457 		crit_exit();
1458 
1459 		break;
1460 
1461 	case DIOCGDINFO:
1462 		if ((cs->sc_flags & CCDF_INITED) == 0)
1463 			return (ENXIO);
1464 
1465 		*(struct disklabel *)data = cs->sc_label;
1466 		break;
1467 
1468 	case DIOCGPART:
1469 		if ((cs->sc_flags & CCDF_INITED) == 0)
1470 			return (ENXIO);
1471 
1472 		((struct partinfo *)data)->disklab = &cs->sc_label;
1473 		((struct partinfo *)data)->part =
1474 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1475 		break;
1476 
1477 	case DIOCWDINFO:
1478 	case DIOCSDINFO:
1479 		if ((cs->sc_flags & CCDF_INITED) == 0)
1480 			return (ENXIO);
1481 
1482 		if ((flag & FWRITE) == 0)
1483 			return (EBADF);
1484 
1485 		if ((error = ccdlock(cs)) != 0)
1486 			return (error);
1487 
1488 		cs->sc_flags |= CCDF_LABELLING;
1489 
1490 		error = setdisklabel(&cs->sc_label,
1491 		    (struct disklabel *)data, 0);
1492 		if (error == 0) {
1493 			if (cmd == DIOCWDINFO) {
1494 				dev_t cdev = CCDLABELDEV(dev);
1495 				error = writedisklabel(cdev, &cs->sc_label);
1496 			}
1497 		}
1498 
1499 		cs->sc_flags &= ~CCDF_LABELLING;
1500 
1501 		ccdunlock(cs);
1502 
1503 		if (error)
1504 			return (error);
1505 		break;
1506 
1507 	case DIOCWLABEL:
1508 		if ((cs->sc_flags & CCDF_INITED) == 0)
1509 			return (ENXIO);
1510 
1511 		if ((flag & FWRITE) == 0)
1512 			return (EBADF);
1513 		if (*(int *)data != 0)
1514 			cs->sc_flags |= CCDF_WLABEL;
1515 		else
1516 			cs->sc_flags &= ~CCDF_WLABEL;
1517 		break;
1518 
1519 	default:
1520 		return (ENOTTY);
1521 	}
1522 
1523 	return (0);
1524 }
1525 
1526 static int
1527 ccdsize(dev_t dev)
1528 {
1529 	struct ccd_softc *cs;
1530 	int part, size;
1531 
1532 	if (ccdopen(dev, 0, S_IFCHR, curthread))
1533 		return (-1);
1534 
1535 	cs = &ccd_softc[ccdunit(dev)];
1536 	part = ccdpart(dev);
1537 
1538 	if ((cs->sc_flags & CCDF_INITED) == 0)
1539 		return (-1);
1540 
1541 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1542 		size = -1;
1543 	else
1544 		size = cs->sc_label.d_partitions[part].p_size;
1545 
1546 	if (ccdclose(dev, 0, S_IFCHR, curthread))
1547 		return (-1);
1548 
1549 	return (size);
1550 }
1551 
1552 static int
1553 ccddump(dev_t dev, u_int count, u_int blkno, u_int secsize)
1554 {
1555 	/* Not implemented. */
1556 	return ENXIO;
1557 }
1558 
1559 /*
1560  * Lookup the provided name in the filesystem.  If the file exists,
1561  * is a valid block device, and isn't being used by anyone else,
1562  * set *vpp to the file's vnode.
1563  */
1564 static int
1565 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1566 {
1567 	struct nlookupdata nd;
1568 	struct ucred *cred;
1569 	struct vnode *vp;
1570 	int error;
1571 
1572 	KKASSERT(td->td_proc);
1573 	cred = td->td_proc->p_ucred;
1574 	*vpp = NULL;
1575 
1576 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1577 	if (error)
1578 		return (error);
1579 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1580 #ifdef DEBUG
1581 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1582 			printf("ccdlookup: vn_open error = %d\n", error);
1583 #endif
1584 		goto done;
1585 	}
1586 	vp = nd.nl_open_vp;
1587 
1588 	if (vp->v_usecount > 1) {
1589 		error = EBUSY;
1590 		goto done;
1591 	}
1592 
1593 	if (!vn_isdisk(vp, &error))
1594 		goto done;
1595 
1596 #ifdef DEBUG
1597 	if (ccddebug & CCDB_VNODE)
1598 		vprint("ccdlookup: vnode info", vp);
1599 #endif
1600 
1601 	VOP_UNLOCK(vp, 0);
1602 	nd.nl_open_vp = NULL;
1603 	nlookup_done(&nd);
1604 	*vpp = vp;				/* leave ref intact  */
1605 	return (0);
1606 done:
1607 	nlookup_done(&nd);
1608 	return (error);
1609 }
1610 
1611 /*
1612  * Read the disklabel from the ccd.  If one is not present, fake one
1613  * up.
1614  */
1615 static void
1616 ccdgetdisklabel(dev_t dev)
1617 {
1618 	int unit = ccdunit(dev);
1619 	struct ccd_softc *cs = &ccd_softc[unit];
1620 	char *errstring;
1621 	struct disklabel *lp = &cs->sc_label;
1622 	struct ccdgeom *ccg = &cs->sc_geom;
1623 	dev_t cdev;
1624 
1625 	bzero(lp, sizeof(*lp));
1626 
1627 	lp->d_secperunit = cs->sc_size;
1628 	lp->d_secsize = ccg->ccg_secsize;
1629 	lp->d_nsectors = ccg->ccg_nsectors;
1630 	lp->d_ntracks = ccg->ccg_ntracks;
1631 	lp->d_ncylinders = ccg->ccg_ncylinders;
1632 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1633 
1634 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1635 	lp->d_type = DTYPE_CCD;
1636 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1637 	lp->d_rpm = 3600;
1638 	lp->d_interleave = 1;
1639 	lp->d_flags = 0;
1640 
1641 	lp->d_partitions[RAW_PART].p_offset = 0;
1642 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1643 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1644 	lp->d_npartitions = RAW_PART + 1;
1645 
1646 	lp->d_bbsize = BBSIZE;				/* XXX */
1647 	lp->d_sbsize = SBSIZE;				/* XXX */
1648 
1649 	lp->d_magic = DISKMAGIC;
1650 	lp->d_magic2 = DISKMAGIC;
1651 	lp->d_checksum = dkcksum(&cs->sc_label);
1652 
1653 	/*
1654 	 * Call the generic disklabel extraction routine.
1655 	 */
1656 	cdev = CCDLABELDEV(dev);
1657 	errstring = readdisklabel(cdev, &cs->sc_label);
1658 	if (errstring != NULL)
1659 		ccdmakedisklabel(cs);
1660 
1661 #ifdef DEBUG
1662 	/* It's actually extremely common to have unlabeled ccds. */
1663 	if (ccddebug & CCDB_LABEL)
1664 		if (errstring != NULL)
1665 			printf("ccd%d: %s\n", unit, errstring);
1666 #endif
1667 }
1668 
1669 /*
1670  * Take care of things one might want to take care of in the event
1671  * that a disklabel isn't present.
1672  */
1673 static void
1674 ccdmakedisklabel(struct ccd_softc *cs)
1675 {
1676 	struct disklabel *lp = &cs->sc_label;
1677 
1678 	/*
1679 	 * For historical reasons, if there's no disklabel present
1680 	 * the raw partition must be marked FS_BSDFFS.
1681 	 */
1682 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1683 
1684 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1685 }
1686 
1687 /*
1688  * Wait interruptibly for an exclusive lock.
1689  *
1690  * XXX
1691  * Several drivers do this; it should be abstracted and made MP-safe.
1692  */
1693 static int
1694 ccdlock(struct ccd_softc *cs)
1695 {
1696 	int error;
1697 
1698 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1699 		cs->sc_flags |= CCDF_WANTED;
1700 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1701 			return (error);
1702 	}
1703 	cs->sc_flags |= CCDF_LOCKED;
1704 	return (0);
1705 }
1706 
1707 /*
1708  * Unlock and wake up any waiters.
1709  */
1710 static void
1711 ccdunlock(struct ccd_softc *cs)
1712 {
1713 
1714 	cs->sc_flags &= ~CCDF_LOCKED;
1715 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1716 		cs->sc_flags &= ~CCDF_WANTED;
1717 		wakeup(cs);
1718 	}
1719 }
1720 
1721 #ifdef DEBUG
1722 static void
1723 printiinfo(struct ccdiinfo *ii)
1724 {
1725 	int ix, i;
1726 
1727 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1728 		printf(" itab[%d]: #dk %d sblk %d soff %d",
1729 		       ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1730 		for (i = 0; i < ii->ii_ndisk; i++)
1731 			printf(" %d", ii->ii_index[i]);
1732 		printf("\n");
1733 	}
1734 }
1735 #endif
1736 
1737 
1738 /* Local Variables: */
1739 /* c-argdecl-indent: 8 */
1740 /* c-continued-statement-offset: 8 */
1741 /* c-indent-level: 8 */
1742 /* End: */
1743