xref: /dragonfly/sys/dev/disk/ccd/ccd.c (revision f02303f9)
1 /* $FreeBSD: src/sys/dev/ccd/ccd.c,v 1.73.2.1 2001/09/11 09:49:52 kris Exp $ */
2 /* $DragonFly: src/sys/dev/disk/ccd/ccd.c,v 1.38 2006/12/22 23:26:16 swildner Exp $ */
3 
4 /*	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $	*/
5 
6 /*
7  * Copyright (c) 1995 Jason R. Thorpe.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project
21  *	by Jason R. Thorpe.
22  * 4. The name of the author may not be used to endorse or promote products
23  *    derived from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
26  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
27  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
28  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1988 University of Utah.
40  * Copyright (c) 1990, 1993
41  *	The Regents of the University of California.  All rights reserved.
42  *
43  * This code is derived from software contributed to Berkeley by
44  * the Systems Programming Group of the University of Utah Computer
45  * Science Department.
46  *
47  * Redistribution and use in source and binary forms, with or without
48  * modification, are permitted provided that the following conditions
49  * are met:
50  * 1. Redistributions of source code must retain the above copyright
51  *    notice, this list of conditions and the following disclaimer.
52  * 2. Redistributions in binary form must reproduce the above copyright
53  *    notice, this list of conditions and the following disclaimer in the
54  *    documentation and/or other materials provided with the distribution.
55  * 3. All advertising materials mentioning features or use of this software
56  *    must display the following acknowledgement:
57  *	This product includes software developed by the University of
58  *	California, Berkeley and its contributors.
59  * 4. Neither the name of the University nor the names of its contributors
60  *    may be used to endorse or promote products derived from this software
61  *    without specific prior written permission.
62  *
63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73  * SUCH DAMAGE.
74  *
75  * from: Utah $Hdr: cd.c 1.6 90/11/28$
76  *
77  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
78  */
79 
80 /*
81  * "Concatenated" disk driver.
82  *
83  * Dynamic configuration and disklabel support by:
84  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
85  *	Numerical Aerodynamic Simulation Facility
86  *	Mail Stop 258-6
87  *	NASA Ames Research Center
88  *	Moffett Field, CA 94035
89  */
90 
91 #include "use_ccd.h"
92 
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
97 #include <sys/proc.h>
98 #include <sys/buf.h>
99 #include <sys/malloc.h>
100 #include <sys/nlookup.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
108 #include <sys/buf2.h>
109 #include <sys/ccdvar.h>
110 
111 #include <vm/vm_zone.h>
112 
113 #include <vfs/ufs/dinode.h> 	/* XXX Used only for fs.h */
114 #include <vfs/ufs/fs.h> 	/* XXX used only to get BBSIZE and SBSIZE */
115 
116 #include <sys/thread2.h>
117 
/*
 * Debug tracing support.
 *
 * Building with CCDDEBUG turns on DEBUG long enough to define the
 * CCDB_* trace-category bits and the ccddebug mask, which is exported
 * read/write through SYSCTL_INT under the _debug node.
 *
 * NOTE(review): DEBUG is #undef'd again at the end of this section, so
 * the "#ifdef DEBUG" trace blocks further down this file are not
 * compiled as written -- confirm whether that is intentional.
 */
118 #if defined(CCDDEBUG) && !defined(DEBUG)
119 #define DEBUG
120 #endif
121 
122 #ifdef DEBUG
123 #define CCDB_FOLLOW	0x01
124 #define CCDB_INIT	0x02
125 #define CCDB_IO		0x04
126 #define CCDB_LABEL	0x08
127 #define CCDB_VNODE	0x10
128 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
129     CCDB_VNODE;
130 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
131 #undef DEBUG
132 #endif
133 
/* Unit and partition of a ccd device are taken via dkunit()/dkpart(). */
134 #define	ccdunit(x)	dkunit(x)
135 #define ccdpart(x)	dkpart(x)
136 
137 /*
138    This is how mirroring works (only writes are special):
139 
140    When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
141    linked together by the cb_mirror field.  "cb_pflags &
142    CCDPF_MIRROR_DONE" is set to 0 on both of them.
143 
144    When a component returns to ccdiodone(), it checks if "cb_pflags &
145    CCDPF_MIRROR_DONE" is set or not.  If not, it sets the partner's
146    flag and returns.  If it is, it means its partner has already
147    returned, so it will go to the regular cleanup.
148 
149  */
150 
/*
 * Per-component I/O descriptor.  ccdbuffer() fills one of these in for
 * every component transfer carved out of an original request (two for a
 * mirrored configuration, linked through cb_mirror).  Instances are
 * obtained from getccdbuf() and released with putccdbuf(), which keeps
 * a small free list chained through cb_freenext.
 */
151 struct ccdbuf {
152 	struct buf	cb_buf;		/* new I/O buf */
153 	struct vnode	*cb_vp;		/* related vnode */
154 	struct bio	*cb_obio;	/* ptr. to original I/O buf */
155 	struct ccdbuf	*cb_freenext;	/* free list link */
156 	int		cb_unit;	/* target unit */
157 	int		cb_comp;	/* target component */
158 	int		cb_pflags;	/* mirror/parity status flag */
159 	struct ccdbuf	*cb_mirror;	/* mirror counterpart */
160 };
161 
162 /* bits in cb_pflags */
163 #define CCDPF_MIRROR_DONE 1	/* if set, mirror counterpart is done */
164 
/*
 * Device used for label access: the RAW_PART minor of the same ccd
 * unit as 'dev', built with make_sub_dev()/dkmakeminor().
 */
165 #define CCDLABELDEV(dev)	\
166 	(make_sub_dev(dev, dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
167 
/* Device entry points implemented by this driver. */
168 static d_open_t ccdopen;
169 static d_close_t ccdclose;
170 static d_strategy_t ccdstrategy;
171 static d_ioctl_t ccdioctl;
172 static d_dump_t ccddump;
173 static d_psize_t ccdsize;
174 
/* Upper bound on the number of ccdbufs putccdbuf() keeps on the free list. */
175 #define NCCDFREEHIWAT	16
176 
/* Major number placed in the ccd_ops header below. */
177 #define CDEV_MAJOR 74
178 
/*
 * Device operations vector, registered by ccdattach() through
 * dev_ops_add().  Reads and writes go through physread/physwrite.
 */
179 static struct dev_ops ccd_ops = {
180 	{ "ccd", CDEV_MAJOR, D_DISK },
181 	.d_open =	ccdopen,
182 	.d_close =	ccdclose,
183 	.d_read =	physread,
184 	.d_write =	physwrite,
185 	.d_ioctl =	ccdioctl,
186 	.d_strategy =	ccdstrategy,
187 	.d_dump =	ccddump,
188 	.d_psize =	ccdsize
189 };
190 
191 /* called during module initialization */
192 static	void ccdattach (void);
193 static	int ccd_modevent (module_t, int, void *);
194 
195 /* called by biodone() at interrupt time */
196 static	void ccdiodone (struct bio *bio);
197 
/* internal helpers: I/O dispatch, configuration, labels and unit locking */
198 static	void ccdstart (struct ccd_softc *, struct bio *);
199 static	void ccdinterleave (struct ccd_softc *, int);
200 static	void ccdintr (struct ccd_softc *, struct bio *);
201 static	int ccdinit (struct ccddevice *, char **, struct ucred *);
202 static	int ccdlookup (char *, struct vnode **);
203 static	void ccdbuffer (struct ccdbuf **ret, struct ccd_softc *,
204 		struct bio *, off_t, caddr_t, long);
205 static	void ccdgetdisklabel (cdev_t);
206 static	void ccdmakedisklabel (struct ccd_softc *);
207 static	int ccdlock (struct ccd_softc *);
208 static	void ccdunlock (struct ccd_softc *);
209 
210 #ifdef DEBUG
211 static	void printiinfo (struct ccdiinfo *);
212 #endif
213 
/*
 * Driver-wide state.  ccd_softc and ccddevs are arrays of numccd
 * entries allocated by ccdattach().  ccdfreebufs heads the ccdbuf free
 * list maintained by getccdbuf()/putccdbuf(), and numccdfreebufs counts
 * the entries currently on it.
 */
214 /* Non-private for the benefit of libkvm. */
215 struct	ccd_softc *ccd_softc;
216 struct	ccddevice *ccddevs;
217 struct	ccdbuf *ccdfreebufs;
218 static	int numccdfreebufs;
219 static	int numccd = 0;
220 
221 /*
222  * getccdbuf() -	Allocate and zero a ccd buffer.
223  *
224  *	This routine is called at splbio().
225  */
226 
227 static __inline
228 struct ccdbuf *
229 getccdbuf(void)
230 {
231 	struct ccdbuf *cbp;
232 
233 	/*
234 	 * Allocate from freelist or malloc as necessary
235 	 */
236 	if ((cbp = ccdfreebufs) != NULL) {
237 		ccdfreebufs = cbp->cb_freenext;
238 		--numccdfreebufs;
239 		reinitbufbio(&cbp->cb_buf);
240 	} else {
241 		cbp = kmalloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK|M_ZERO);
242 		initbufbio(&cbp->cb_buf);
243 	}
244 
245 	/*
246 	 * independant struct buf initialization
247 	 */
248 	LIST_INIT(&cbp->cb_buf.b_dep);
249 	BUF_LOCKINIT(&cbp->cb_buf);
250 	BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
251 	BUF_KERNPROC(&cbp->cb_buf);
252 	cbp->cb_buf.b_flags = B_PAGING | B_BNOCLIP;
253 
254 	return(cbp);
255 }
256 
257 /*
258  * putccdbuf() -	Free a ccd buffer.
259  *
260  *	This routine is called at splbio().
261  */
262 
263 static __inline
264 void
265 putccdbuf(struct ccdbuf *cbp)
266 {
267 	BUF_UNLOCK(&cbp->cb_buf);
268 	BUF_LOCKFREE(&cbp->cb_buf);
269 
270 	if (numccdfreebufs < NCCDFREEHIWAT) {
271 		cbp->cb_freenext = ccdfreebufs;
272 		ccdfreebufs = cbp;
273 		++numccdfreebufs;
274 	} else {
275 		kfree((caddr_t)cbp, M_DEVBUF);
276 	}
277 }
278 
279 
280 /*
281  * Number of blocks to leave untouched in front of a component partition.
282  * This is to avoid violating its disklabel area when it starts at the
283  * beginning of the slice.
 *
 * ccdinit() subtracts this amount from each component's partition size
 * and ccdbuffer() adds it to every component block offset.  The default
 * below only applies when CCD_OFFSET has not already been defined.
284  */
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
287 #endif
288 
289 /*
290  * Called by main() during pseudo-device attachment.  All we need
291  * to do is allocate enough space for devices to be configured later, and
292  * add devsw entries.
293  */
294 static void
295 ccdattach(void)
296 {
297 	int i;
298 	int num = NCCD;
299 
300 	if (num > 1)
301 		kprintf("ccd0-%d: Concatenated disk drivers\n", num-1);
302 	else
303 		kprintf("ccd0: Concatenated disk driver\n");
304 
305 	ccd_softc = kmalloc(num * sizeof(struct ccd_softc), M_DEVBUF,
306 			    M_WAITOK | M_ZERO);
307 	ccddevs = kmalloc(num * sizeof(struct ccddevice), M_DEVBUF,
308 			    M_WAITOK | M_ZERO);
309 	numccd = num;
310 
311 	dev_ops_add(&ccd_ops, 0, 0);
312 	/* XXX: is this necessary? */
313 	for (i = 0; i < numccd; ++i)
314 		ccddevs[i].ccd_dk = -1;
315 }
316 
317 static int
318 ccd_modevent(module_t mod, int type, void *data)
319 {
320 	int error = 0;
321 
322 	switch (type) {
323 	case MOD_LOAD:
324 		ccdattach();
325 		break;
326 
327 	case MOD_UNLOAD:
328 		kprintf("ccd0: Unload not supported!\n");
329 		error = EOPNOTSUPP;
330 		break;
331 
332 	default:	/* MOD_SHUTDOWN etc */
333 		break;
334 	}
335 	return (error);
336 }
337 
338 DEV_MODULE(ccd, ccd_modevent, NULL);
339 
340 static int
341 ccdinit(struct ccddevice *ccd, char **cpaths, struct ucred *cred)
342 {
343 	struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
344 	struct ccdcinfo *ci = NULL;	/* XXX */
345 	size_t size;
346 	int ix;
347 	struct vnode *vp;
348 	size_t minsize;
349 	int maxsecsize;
350 	struct partinfo dpart;
351 	struct ccdgeom *ccg = &cs->sc_geom;
352 	char tmppath[MAXPATHLEN];
353 	int error = 0;
354 
355 #ifdef DEBUG
356 	if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
357 		kprintf("ccdinit: unit %d\n", ccd->ccd_unit);
358 #endif
359 
360 	cs->sc_size = 0;
361 	cs->sc_ileave = ccd->ccd_interleave;
362 	cs->sc_nccdisks = ccd->ccd_ndev;
363 
364 	/* Allocate space for the component info. */
365 	cs->sc_cinfo = kmalloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
366 	    M_DEVBUF, M_WAITOK);
367 
368 	/*
369 	 * Verify that each component piece exists and record
370 	 * relevant information about it.
371 	 */
372 	maxsecsize = 0;
373 	minsize = 0;
374 	for (ix = 0; ix < cs->sc_nccdisks; ix++) {
375 		vp = ccd->ccd_vpp[ix];
376 		ci = &cs->sc_cinfo[ix];
377 		ci->ci_vp = vp;
378 
379 		/*
380 		 * Copy in the pathname of the component.
381 		 */
382 		bzero(tmppath, sizeof(tmppath));	/* sanity */
383 		if ((error = copyinstr(cpaths[ix], tmppath,
384 		    MAXPATHLEN, &ci->ci_pathlen)) != 0) {
385 #ifdef DEBUG
386 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
387 				kprintf("ccd%d: can't copy path, error = %d\n",
388 				    ccd->ccd_unit, error);
389 #endif
390 			goto fail;
391 		}
392 		ci->ci_path = kmalloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
393 		bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
394 
395 		ci->ci_dev = vn_todev(vp);
396 
397 		/*
398 		 * Get partition information for the component.
399 		 */
400 		if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
401 				       FREAD, cred)) != 0) {
402 #ifdef DEBUG
403 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
404 				 kprintf("ccd%d: %s: ioctl failed, error = %d\n",
405 				     ccd->ccd_unit, ci->ci_path, error);
406 #endif
407 			goto fail;
408 		}
409 		if (dpart.part->p_fstype == FS_BSDFFS) {
410 			maxsecsize =
411 			    ((dpart.disklab->d_secsize > maxsecsize) ?
412 			    dpart.disklab->d_secsize : maxsecsize);
413 			size = dpart.part->p_size - CCD_OFFSET;
414 		} else {
415 #ifdef DEBUG
416 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
417 				kprintf("ccd%d: %s: incorrect partition type\n",
418 				    ccd->ccd_unit, ci->ci_path);
419 #endif
420 			error = EFTYPE;
421 			goto fail;
422 		}
423 
424 		/*
425 		 * Calculate the size, truncating to an interleave
426 		 * boundary if necessary.
427 		 */
428 
429 		if (cs->sc_ileave > 1)
430 			size -= size % cs->sc_ileave;
431 
432 		if (size == 0) {
433 #ifdef DEBUG
434 			if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
435 				kprintf("ccd%d: %s: size == 0\n",
436 				    ccd->ccd_unit, ci->ci_path);
437 #endif
438 			error = ENODEV;
439 			goto fail;
440 		}
441 
442 		if (minsize == 0 || size < minsize)
443 			minsize = size;
444 		ci->ci_size = size;
445 		cs->sc_size += size;
446 	}
447 
448 	/*
449 	 * Don't allow the interleave to be smaller than
450 	 * the biggest component sector.
451 	 */
452 	if ((cs->sc_ileave > 0) &&
453 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
454 #ifdef DEBUG
455 		if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
456 			kprintf("ccd%d: interleave must be at least %d\n",
457 			    ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
458 #endif
459 		error = EINVAL;
460 		goto fail;
461 	}
462 
463 	/*
464 	 * If uniform interleave is desired set all sizes to that of
465 	 * the smallest component.  This will guarentee that a single
466 	 * interleave table is generated.
467 	 *
468 	 * Lost space must be taken into account when calculating the
469 	 * overall size.  Half the space is lost when CCDF_MIRROR is
470 	 * specified.  One disk is lost when CCDF_PARITY is specified.
471 	 */
472 	if (ccd->ccd_flags & CCDF_UNIFORM) {
473 		for (ci = cs->sc_cinfo;
474 		     ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
475 			ci->ci_size = minsize;
476 		}
477 		if (ccd->ccd_flags & CCDF_MIRROR) {
478 			/*
479 			 * Check to see if an even number of components
480 			 * have been specified.  The interleave must also
481 			 * be non-zero in order for us to be able to
482 			 * guarentee the topology.
483 			 */
484 			if (cs->sc_nccdisks % 2) {
485 				kprintf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
486 				error = EINVAL;
487 				goto fail;
488 			}
489 			if (cs->sc_ileave == 0) {
490 				kprintf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
491 				error = EINVAL;
492 				goto fail;
493 			}
494 			cs->sc_size = (cs->sc_nccdisks/2) * minsize;
495 		} else if (ccd->ccd_flags & CCDF_PARITY) {
496 			cs->sc_size = (cs->sc_nccdisks-1) * minsize;
497 		} else {
498 			if (cs->sc_ileave == 0) {
499 				kprintf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
500 				error = EINVAL;
501 				goto fail;
502 			}
503 			cs->sc_size = cs->sc_nccdisks * minsize;
504 		}
505 	}
506 
507 	/*
508 	 * Construct the interleave table.
509 	 */
510 	ccdinterleave(cs, ccd->ccd_unit);
511 
512 	/*
513 	 * Create pseudo-geometry based on 1MB cylinders.  It's
514 	 * pretty close.
515 	 */
516 	ccg->ccg_secsize = maxsecsize;
517 	ccg->ccg_ntracks = 1;
518 	ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
519 	ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
520 
521 	/*
522 	 * Add an devstat entry for this device.
523 	 */
524 	devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
525 			  ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
526 			  DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
527 			  DEVSTAT_PRIORITY_ARRAY);
528 
529 	cs->sc_flags |= CCDF_INITED;
530 	cs->sc_cflags = ccd->ccd_flags;	/* So we can find out later... */
531 	cs->sc_unit = ccd->ccd_unit;
532 	return (0);
533 fail:
534 	while (ci > cs->sc_cinfo) {
535 		ci--;
536 		kfree(ci->ci_path, M_DEVBUF);
537 	}
538 	kfree(cs->sc_cinfo, M_DEVBUF);
539 	return (error);
540 }
541 
542 static void
543 ccdinterleave(struct ccd_softc *cs, int unit)
544 {
545 	struct ccdcinfo *ci, *smallci;
546 	struct ccdiinfo *ii;
547 	daddr_t bn, lbn;
548 	int ix;
549 	u_long size;
550 
551 #ifdef DEBUG
552 	if (ccddebug & CCDB_INIT)
553 		kprintf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
554 #endif
555 
556 	/*
557 	 * Allocate an interleave table.  The worst case occurs when each
558 	 * of N disks is of a different size, resulting in N interleave
559 	 * tables.
560 	 *
561 	 * Chances are this is too big, but we don't care.
562 	 */
563 	size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
564 	cs->sc_itable = (struct ccdiinfo *)kmalloc(size, M_DEVBUF, M_WAITOK);
565 	bzero((caddr_t)cs->sc_itable, size);
566 
567 	/*
568 	 * Trivial case: no interleave (actually interleave of disk size).
569 	 * Each table entry represents a single component in its entirety.
570 	 *
571 	 * An interleave of 0 may not be used with a mirror or parity setup.
572 	 */
573 	if (cs->sc_ileave == 0) {
574 		bn = 0;
575 		ii = cs->sc_itable;
576 
577 		for (ix = 0; ix < cs->sc_nccdisks; ix++) {
578 			/* Allocate space for ii_index. */
579 			ii->ii_index = kmalloc(sizeof(int), M_DEVBUF, M_WAITOK);
580 			ii->ii_ndisk = 1;
581 			ii->ii_startblk = bn;
582 			ii->ii_startoff = 0;
583 			ii->ii_index[0] = ix;
584 			bn += cs->sc_cinfo[ix].ci_size;
585 			ii++;
586 		}
587 		ii->ii_ndisk = 0;
588 #ifdef DEBUG
589 		if (ccddebug & CCDB_INIT)
590 			printiinfo(cs->sc_itable);
591 #endif
592 		return;
593 	}
594 
595 	/*
596 	 * The following isn't fast or pretty; it doesn't have to be.
597 	 */
598 	size = 0;
599 	bn = lbn = 0;
600 	for (ii = cs->sc_itable; ; ii++) {
601 		/*
602 		 * Allocate space for ii_index.  We might allocate more then
603 		 * we use.
604 		 */
605 		ii->ii_index = kmalloc((sizeof(int) * cs->sc_nccdisks),
606 		    M_DEVBUF, M_WAITOK);
607 
608 		/*
609 		 * Locate the smallest of the remaining components
610 		 */
611 		smallci = NULL;
612 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
613 		    ci++) {
614 			if (ci->ci_size > size &&
615 			    (smallci == NULL ||
616 			     ci->ci_size < smallci->ci_size)) {
617 				smallci = ci;
618 			}
619 		}
620 
621 		/*
622 		 * Nobody left, all done
623 		 */
624 		if (smallci == NULL) {
625 			ii->ii_ndisk = 0;
626 			break;
627 		}
628 
629 		/*
630 		 * Record starting logical block using an sc_ileave blocksize.
631 		 */
632 		ii->ii_startblk = bn / cs->sc_ileave;
633 
634 		/*
635 		 * Record starting comopnent block using an sc_ileave
636 		 * blocksize.  This value is relative to the beginning of
637 		 * a component disk.
638 		 */
639 		ii->ii_startoff = lbn;
640 
641 		/*
642 		 * Determine how many disks take part in this interleave
643 		 * and record their indices.
644 		 */
645 		ix = 0;
646 		for (ci = cs->sc_cinfo;
647 		    ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
648 			if (ci->ci_size >= smallci->ci_size) {
649 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
650 			}
651 		}
652 		ii->ii_ndisk = ix;
653 		bn += ix * (smallci->ci_size - size);
654 		lbn = smallci->ci_size / cs->sc_ileave;
655 		size = smallci->ci_size;
656 	}
657 #ifdef DEBUG
658 	if (ccddebug & CCDB_INIT)
659 		printiinfo(cs->sc_itable);
660 #endif
661 }
662 
663 /* ARGSUSED */
664 static int
665 ccdopen(struct dev_open_args *ap)
666 {
667 	cdev_t dev = ap->a_head.a_dev;
668 	int unit = ccdunit(dev);
669 	struct ccd_softc *cs;
670 	struct disklabel *lp;
671 	int error = 0, part, pmask;
672 
673 #ifdef DEBUG
674 	if (ccddebug & CCDB_FOLLOW)
675 		kprintf("ccdopen(%x, %x)\n", dev, flags);
676 #endif
677 	if (unit >= numccd)
678 		return (ENXIO);
679 	cs = &ccd_softc[unit];
680 
681 	if ((error = ccdlock(cs)) != 0)
682 		return (error);
683 
684 	lp = &cs->sc_label;
685 
686 	part = ccdpart(dev);
687 	pmask = (1 << part);
688 
689 	/*
690 	 * If we're initialized, check to see if there are any other
691 	 * open partitions.  If not, then it's safe to update
692 	 * the in-core disklabel.
693 	 */
694 	if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
695 		ccdgetdisklabel(dev);
696 
697 	/* Check that the partition exists. */
698 	if (part != RAW_PART && ((part >= lp->d_npartitions) ||
699 	    (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
700 		error = ENXIO;
701 		goto done;
702 	}
703 
704 	cs->sc_openmask |= pmask;
705  done:
706 	ccdunlock(cs);
707 	return (0);
708 }
709 
710 /* ARGSUSED */
711 static int
712 ccdclose(struct dev_close_args *ap)
713 {
714 	cdev_t dev = ap->a_head.a_dev;
715 	int unit = ccdunit(dev);
716 	struct ccd_softc *cs;
717 	int error = 0, part;
718 
719 #ifdef DEBUG
720 	if (ccddebug & CCDB_FOLLOW)
721 		kprintf("ccdclose(%x, %x)\n", dev, flags);
722 #endif
723 
724 	if (unit >= numccd)
725 		return (ENXIO);
726 	cs = &ccd_softc[unit];
727 
728 	if ((error = ccdlock(cs)) != 0)
729 		return (error);
730 
731 	part = ccdpart(dev);
732 
733 	/* ...that much closer to allowing unconfiguration... */
734 	cs->sc_openmask &= ~(1 << part);
735 	ccdunlock(cs);
736 	return (0);
737 }
738 
739 static int
740 ccdstrategy(struct dev_strategy_args *ap)
741 {
742 	cdev_t dev = ap->a_head.a_dev;
743 	struct bio *bio = ap->a_bio;
744 	int unit = ccdunit(dev);
745 	struct bio *nbio;
746 	struct buf *bp = bio->bio_buf;
747 	struct ccd_softc *cs = &ccd_softc[unit];
748 	int wlabel;
749 	struct disklabel *lp;
750 
751 #ifdef DEBUG
752 	if (ccddebug & CCDB_FOLLOW)
753 		kprintf("ccdstrategy(%x): unit %d\n", bp, unit);
754 #endif
755 	if ((cs->sc_flags & CCDF_INITED) == 0) {
756 		bp->b_error = ENXIO;
757 		goto error;
758 	}
759 
760 	/* If it's a nil transfer, wake up the top half now. */
761 	if (bp->b_bcount == 0) {
762 		bp->b_resid = 0;
763 		goto done;
764 	}
765 
766 	lp = &cs->sc_label;
767 
768 	/*
769 	 * Do bounds checking and adjust transfer.  If there's an
770 	 * error, the bounds check will flag that for us.
771 	 */
772 	wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
773 	if (ccdpart(dev) != RAW_PART) {
774 		nbio = bounds_check_with_label(dev, bio, lp, wlabel);
775 		if (nbio == NULL)
776 			goto done;
777 	} else {
778 		int pbn;        /* in sc_secsize chunks */
779 		long sz;        /* in sc_secsize chunks */
780 
781 		pbn = (int)(bio->bio_offset / cs->sc_geom.ccg_secsize);
782 		sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
783 
784 		/*
785 		 * If out of bounds return an error.  If the request goes
786 		 * past EOF, clip the request as appropriate.  If exactly
787 		 * at EOF, return success (don't clip), but with 0 bytes
788 		 * of I/O.
789 		 *
790 		 * Mark EOF B_INVAL (just like bad), indicating that the
791 		 * contents of the buffer, if any, is invalid.
792 		 */
793 		if (pbn < 0)
794 			goto bad;
795 		if (pbn + sz > cs->sc_size) {
796 			if (pbn > cs->sc_size || (bp->b_flags & B_BNOCLIP))
797 				goto bad;
798 			if (pbn == cs->sc_size) {
799 				bp->b_resid = bp->b_bcount;
800 				bp->b_flags |= B_INVAL;
801 				goto done;
802 			}
803 			sz = cs->sc_size - pbn;
804 			bp->b_bcount = sz * cs->sc_geom.ccg_secsize;
805 		}
806 		nbio = bio;
807 	}
808 
809 	bp->b_resid = bp->b_bcount;
810 	nbio->bio_driver_info = dev;
811 
812 	/*
813 	 * "Start" the unit.
814 	 */
815 	crit_enter();
816 	ccdstart(cs, nbio);
817 	crit_exit();
818 	return(0);
819 
820 	/*
821 	 * note: bio, not nbio, is valid at the done label.
822 	 */
823 bad:
824 	bp->b_error = EINVAL;
825 error:
826 	bp->b_resid = bp->b_bcount;
827 	bp->b_flags |= B_ERROR | B_INVAL;
828 done:
829 	biodone(bio);
830 	return(0);
831 }
832 
833 static void
834 ccdstart(struct ccd_softc *cs, struct bio *bio)
835 {
836 	long bcount, rcount;
837 	struct ccdbuf *cbp[4];
838 	struct buf *bp = bio->bio_buf;
839 	cdev_t dev = bio->bio_driver_info;
840 	/* XXX! : 2 reads and 2 writes for RAID 4/5 */
841 	caddr_t addr;
842 	off_t doffset;
843 	struct partition *pp;
844 
845 #ifdef DEBUG
846 	if (ccddebug & CCDB_FOLLOW)
847 		kprintf("ccdstart(%x, %x)\n", cs, bp);
848 #endif
849 
850 	/* Record the transaction start  */
851 	devstat_start_transaction(&cs->device_stats);
852 
853 	/*
854 	 * Translate the partition-relative block number to an absolute.
855 	 */
856 	doffset = bio->bio_offset;
857 	if (ccdpart(dev) != RAW_PART) {
858 		pp = &cs->sc_label.d_partitions[ccdpart(dev)];
859 		doffset += pp->p_offset * cs->sc_label.d_secsize;
860 	}
861 
862 	/*
863 	 * Allocate component buffers and fire off the requests
864 	 */
865 	addr = bp->b_data;
866 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
867 		ccdbuffer(cbp, cs, bio, doffset, addr, bcount);
868 		rcount = cbp[0]->cb_buf.b_bcount;
869 
870 		if (cs->sc_cflags & CCDF_MIRROR) {
871 			/*
872 			 * Mirroring.  Writes go to both disks, reads are
873 			 * taken from whichever disk seems most appropriate.
874 			 *
875 			 * We attempt to localize reads to the disk whos arm
876 			 * is nearest the read request.  We ignore seeks due
877 			 * to writes when making this determination and we
878 			 * also try to avoid hogging.
879 			 */
880 			if (cbp[0]->cb_buf.b_cmd != BUF_CMD_READ) {
881 				vn_strategy(cbp[0]->cb_vp,
882 					    &cbp[0]->cb_buf.b_bio1);
883 				vn_strategy(cbp[1]->cb_vp,
884 					    &cbp[1]->cb_buf.b_bio1);
885 			} else {
886 				int pick = cs->sc_pick;
887 				daddr_t range = cs->sc_size / 16 * cs->sc_label.d_secsize;
888 
889 				if (doffset < cs->sc_blk[pick] - range ||
890 				    doffset > cs->sc_blk[pick] + range
891 				) {
892 					cs->sc_pick = pick = 1 - pick;
893 				}
894 				cs->sc_blk[pick] = doffset + rcount;
895 				vn_strategy(cbp[pick]->cb_vp,
896 					    &cbp[pick]->cb_buf.b_bio1);
897 			}
898 		} else {
899 			/*
900 			 * Not mirroring
901 			 */
902 			vn_strategy(cbp[0]->cb_vp,
903 				     &cbp[0]->cb_buf.b_bio1);
904 		}
905 		doffset += rcount;
906 		addr += rcount;
907 	}
908 }
909 
910 /*
911  * Build a component buffer header.
912  */
913 static void
914 ccdbuffer(struct ccdbuf **cb, struct ccd_softc *cs, struct bio *bio,
915 	  off_t doffset, caddr_t addr, long bcount)
916 {
917 	struct ccdcinfo *ci, *ci2 = NULL;	/* XXX */
918 	struct ccdbuf *cbp;
919 	daddr_t bn, cbn, cboff;
920 	off_t cbc;
921 
922 #ifdef DEBUG
923 	if (ccddebug & CCDB_IO)
924 		kprintf("ccdbuffer(%x, %x, %d, %x, %d)\n",
925 		       cs, bp, bn, addr, bcount);
926 #endif
927 	/*
928 	 * Determine which component bn falls in.
929 	 */
930 	bn = (daddr_t)(doffset / cs->sc_geom.ccg_secsize);
931 	cbn = bn;
932 	cboff = 0;
933 
934 	if (cs->sc_ileave == 0) {
935 		/*
936 		 * Serially concatenated and neither a mirror nor a parity
937 		 * config.  This is a special case.
938 		 */
939 		daddr_t sblk;
940 
941 		sblk = 0;
942 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
943 			sblk += ci->ci_size;
944 		cbn -= sblk;
945 	} else {
946 		struct ccdiinfo *ii;
947 		int ccdisk, off;
948 
949 		/*
950 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
951 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
952 		 * to cbn.
953 		 */
954 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
955 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
956 
957 		/*
958 		 * Figure out which interleave table to use.
959 		 */
960 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
961 			if (ii->ii_startblk > cbn)
962 				break;
963 		}
964 		ii--;
965 
966 		/*
967 		 * off is the logical superblock relative to the beginning
968 		 * of this interleave block.
969 		 */
970 		off = cbn - ii->ii_startblk;
971 
972 		/*
973 		 * We must calculate which disk component to use (ccdisk),
974 		 * and recalculate cbn to be the superblock relative to
975 		 * the beginning of the component.  This is typically done by
976 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
977 		 * must typically be divided by the number of components in
978 		 * this interleave array to be properly convert it from a
979 		 * CCD-relative logical superblock number to a
980 		 * component-relative superblock number.
981 		 */
982 		if (ii->ii_ndisk == 1) {
983 			/*
984 			 * When we have just one disk, it can't be a mirror
985 			 * or a parity config.
986 			 */
987 			ccdisk = ii->ii_index[0];
988 			cbn = ii->ii_startoff + off;
989 		} else {
990 			if (cs->sc_cflags & CCDF_MIRROR) {
991 				/*
992 				 * We have forced a uniform mapping, resulting
993 				 * in a single interleave array.  We double
994 				 * up on the first half of the available
995 				 * components and our mirror is in the second
996 				 * half.  This only works with a single
997 				 * interleave array because doubling up
998 				 * doubles the number of sectors, so there
999 				 * cannot be another interleave array because
1000 				 * the next interleave array's calculations
1001 				 * would be off.
1002 				 */
1003 				int ndisk2 = ii->ii_ndisk / 2;
1004 				ccdisk = ii->ii_index[off % ndisk2];
1005 				cbn = ii->ii_startoff + off / ndisk2;
1006 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1007 			} else if (cs->sc_cflags & CCDF_PARITY) {
1008 				/*
1009 				 * XXX not implemented yet
1010 				 */
1011 				int ndisk2 = ii->ii_ndisk - 1;
1012 				ccdisk = ii->ii_index[off % ndisk2];
1013 				cbn = ii->ii_startoff + off / ndisk2;
1014 				if (cbn % ii->ii_ndisk <= ccdisk)
1015 					ccdisk++;
1016 			} else {
1017 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
1018 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
1019 			}
1020 		}
1021 
1022 		ci = &cs->sc_cinfo[ccdisk];
1023 
1024 		/*
1025 		 * Convert cbn from a superblock to a normal block so it
1026 		 * can be used to calculate (along with cboff) the normal
1027 		 * block index into this particular disk.
1028 		 */
1029 		cbn *= cs->sc_ileave;
1030 	}
1031 
1032 	/*
1033 	 * Fill in the component buf structure.
1034 	 *
1035 	 * NOTE: devices do not use b_bufsize, only b_bcount, but b_bcount
1036 	 * will be truncated on device EOF so we use b_bufsize to detect
1037 	 * the case.
1038 	 */
1039 	cbp = getccdbuf();
1040 	cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1041 	cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1042 	cbp->cb_buf.b_data = addr;
1043 	cbp->cb_vp = ci->ci_vp;
1044 	if (cs->sc_ileave == 0)
1045               cbc = dbtob((off_t)(ci->ci_size - cbn));
1046 	else
1047               cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1048 	cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1049  	cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1050 
1051 	cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1052 	cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1053 	cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1054 
1055 	/*
1056 	 * context for ccdiodone
1057 	 */
1058 	cbp->cb_obio = bio;
1059 	cbp->cb_unit = cs - ccd_softc;
1060 	cbp->cb_comp = ci - cs->sc_cinfo;
1061 
1062 #ifdef DEBUG
1063 	if (ccddebug & CCDB_IO)
1064 		kprintf(" dev %x(u%d): cbp %x off %lld addr %x bcnt %d\n",
1065 		       ci->ci_dev, ci-cs->sc_cinfo, cbp,
1066 		       cbp->cb_buf.b_bio1.bio_offset,
1067 		       cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1068 #endif
1069 	cb[0] = cbp;
1070 
1071 	/*
1072 	 * Note: both I/O's setup when reading from mirror, but only one
1073 	 * will be executed.
1074 	 */
1075 	if (cs->sc_cflags & CCDF_MIRROR) {
1076 		/* mirror, setup second I/O */
1077 		cbp = getccdbuf();
1078 
1079 		cbp->cb_buf.b_cmd = bio->bio_buf->b_cmd;
1080 		cbp->cb_buf.b_flags |= bio->bio_buf->b_flags;
1081 		cbp->cb_buf.b_data = addr;
1082 		cbp->cb_vp = ci2->ci_vp;
1083 		if (cs->sc_ileave == 0)
1084 		      cbc = dbtob((off_t)(ci->ci_size - cbn));
1085 		else
1086 		      cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1087 		cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1088 		cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1089 
1090 		cbp->cb_buf.b_bio1.bio_done = ccdiodone;
1091 		cbp->cb_buf.b_bio1.bio_caller_info1.ptr = cbp;
1092 		cbp->cb_buf.b_bio1.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1093 
1094 		/*
1095 		 * context for ccdiodone
1096 		 */
1097 		cbp->cb_obio = bio;
1098 		cbp->cb_unit = cs - ccd_softc;
1099 		cbp->cb_comp = ci2 - cs->sc_cinfo;
1100 		cb[1] = cbp;
1101 		/* link together the ccdbuf's and clear "mirror done" flag */
1102 		cb[0]->cb_mirror = cb[1];
1103 		cb[1]->cb_mirror = cb[0];
1104 		cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1105 		cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1106 	}
1107 }
1108 
1109 static void
1110 ccdintr(struct ccd_softc *cs, struct bio *bio)
1111 {
1112 	struct buf *bp = bio->bio_buf;
1113 
1114 #ifdef DEBUG
1115 	if (ccddebug & CCDB_FOLLOW)
1116 		kprintf("ccdintr(%x, %x)\n", cs, bp);
1117 #endif
1118 	/*
1119 	 * Request is done for better or worse, wakeup the top half.
1120 	 */
1121 	if (bp->b_flags & B_ERROR)
1122 		bp->b_resid = bp->b_bcount;
1123 	devstat_end_transaction_buf(&cs->device_stats, bp);
1124 	biodone(bio);
1125 }
1126 
1127 /*
1128  * Called at interrupt time.
1129  * Mark the component as done and if all components are done,
1130  * take a ccd interrupt.
1131  */
1132 static void
1133 ccdiodone(struct bio *bio)
1134 {
1135 	struct ccdbuf *cbp = bio->bio_caller_info1.ptr;
1136 	struct bio *obio = cbp->cb_obio;
1137 	struct buf *obp = obio->bio_buf;
1138 	int unit = cbp->cb_unit;
1139 	int count;
1140 
1141 	/*
1142 	 * Since we do not have exclusive access to underlying devices,
1143 	 * we can't keep cache translations around.
1144 	 */
1145 	clearbiocache(bio->bio_next);
1146 
1147 	crit_enter();
1148 #ifdef DEBUG
1149 	if (ccddebug & CCDB_FOLLOW)
1150 		kprintf("ccdiodone(%x)\n", cbp);
1151 	if (ccddebug & CCDB_IO) {
1152 		kprintf("ccdiodone: bp %x bcount %d resid %d\n",
1153 		       obp, obp->b_bcount, obp->b_resid);
1154 		kprintf(" dev %x(u%d), cbp %x off %lld addr %x bcnt %d\n",
1155 		       cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1156 		       cbp->cb_buf.b_loffset, cbp->cb_buf.b_data,
1157 		       cbp->cb_buf.b_bcount);
1158 	}
1159 #endif
1160 
1161 	/*
1162 	 * If an error occured, report it.  If this is a mirrored
1163 	 * configuration and the first of two possible reads, do not
1164 	 * set the error in the bp yet because the second read may
1165 	 * succeed.
1166 	 */
1167 	if (cbp->cb_buf.b_flags & B_ERROR) {
1168 		const char *msg = "";
1169 
1170 		if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1171 		    (cbp->cb_buf.b_cmd == BUF_CMD_READ) &&
1172 		    (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1173 			/*
1174 			 * We will try our read on the other disk down
1175 			 * below, also reverse the default pick so if we
1176 			 * are doing a scan we do not keep hitting the
1177 			 * bad disk first.
1178 			 */
1179 			struct ccd_softc *cs = &ccd_softc[unit];
1180 
1181 			msg = ", trying other disk";
1182 			cs->sc_pick = 1 - cs->sc_pick;
1183 			cs->sc_blk[cs->sc_pick] = obio->bio_offset;
1184 		} else {
1185 			obp->b_flags |= B_ERROR;
1186 			obp->b_error = cbp->cb_buf.b_error ?
1187 			    cbp->cb_buf.b_error : EIO;
1188 		}
1189 		kprintf("ccd%d: error %d on component %d offset %lld (ccd offset %lld)%s\n",
1190 		       unit, obp->b_error, cbp->cb_comp,
1191 		       cbp->cb_buf.b_bio2.bio_offset,
1192 		       obio->bio_offset, msg);
1193 	}
1194 
1195 	/*
1196 	 * Process mirror.  If we are writing, I/O has been initiated on both
1197 	 * buffers and we fall through only after both are finished.
1198 	 *
1199 	 * If we are reading only one I/O is initiated at a time.  If an
1200 	 * error occurs we initiate the second I/O and return, otherwise
1201 	 * we free the second I/O without initiating it.
1202 	 */
1203 
1204 	if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1205 		if (cbp->cb_buf.b_cmd != BUF_CMD_READ) {
1206 			/*
1207 			 * When writing, handshake with the second buffer
1208 			 * to determine when both are done.  If both are not
1209 			 * done, return here.
1210 			 */
1211 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1212 				cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1213 				putccdbuf(cbp);
1214 				crit_exit();
1215 				return;
1216 			}
1217 		} else {
1218 			/*
1219 			 * When reading, either dispose of the second buffer
1220 			 * or initiate I/O on the second buffer if an error
1221 			 * occured with this one.
1222 			 */
1223 			if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1224 				if (cbp->cb_buf.b_flags & B_ERROR) {
1225 					cbp->cb_mirror->cb_pflags |=
1226 					    CCDPF_MIRROR_DONE;
1227 					vn_strategy(
1228 					    cbp->cb_mirror->cb_vp,
1229 					    &cbp->cb_mirror->cb_buf.b_bio1
1230 					);
1231 					putccdbuf(cbp);
1232 					crit_exit();
1233 					return;
1234 				} else {
1235 					putccdbuf(cbp->cb_mirror);
1236 					/* fall through */
1237 				}
1238 			}
1239 		}
1240 	}
1241 
1242 	/*
1243 	 * Use our saved b_bufsize to determine if an unexpected EOF occured.
1244 	 */
1245 	count = cbp->cb_buf.b_bufsize;
1246 	putccdbuf(cbp);
1247 
1248 	/*
1249 	 * If all done, "interrupt".
1250 	 */
1251 	obp->b_resid -= count;
1252 	if (obp->b_resid < 0)
1253 		panic("ccdiodone: count");
1254 	if (obp->b_resid == 0)
1255 		ccdintr(&ccd_softc[unit], obio);
1256 	crit_exit();
1257 }
1258 
1259 static int
1260 ccdioctl(struct dev_ioctl_args *ap)
1261 {
1262 	cdev_t dev = ap->a_head.a_dev;
1263 	int unit = ccdunit(dev);
1264 	int i, j, lookedup = 0, error = 0;
1265 	int part, pmask;
1266 	struct ccd_softc *cs;
1267 	struct ccd_ioctl *ccio = (struct ccd_ioctl *)ap->a_data;
1268 	struct ccddevice ccd;
1269 	char **cpp;
1270 	struct vnode **vpp;
1271 
1272 	if (unit >= numccd)
1273 		return (ENXIO);
1274 	cs = &ccd_softc[unit];
1275 
1276 	bzero(&ccd, sizeof(ccd));
1277 
1278 	switch (ap->a_cmd) {
1279 	case CCDIOCSET:
1280 		if (cs->sc_flags & CCDF_INITED)
1281 			return (EBUSY);
1282 
1283 		if ((ap->a_fflag & FWRITE) == 0)
1284 			return (EBADF);
1285 
1286 		if ((error = ccdlock(cs)) != 0)
1287 			return (error);
1288 
1289 		if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1290 			return (EINVAL);
1291 
1292 		/* Fill in some important bits. */
1293 		ccd.ccd_unit = unit;
1294 		ccd.ccd_interleave = ccio->ccio_ileave;
1295 		if (ccd.ccd_interleave == 0 &&
1296 		    ((ccio->ccio_flags & CCDF_MIRROR) ||
1297 		     (ccio->ccio_flags & CCDF_PARITY))) {
1298 			kprintf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1299 			ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1300 		}
1301 		if ((ccio->ccio_flags & CCDF_MIRROR) &&
1302 		    (ccio->ccio_flags & CCDF_PARITY)) {
1303 			kprintf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1304 			ccio->ccio_flags &= ~CCDF_PARITY;
1305 		}
1306 		if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1307 		    !(ccio->ccio_flags & CCDF_UNIFORM)) {
1308 			kprintf("ccd%d: mirror/parity forces uniform flag\n",
1309 			       unit);
1310 			ccio->ccio_flags |= CCDF_UNIFORM;
1311 		}
1312 		ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1313 
1314 		/*
1315 		 * Allocate space for and copy in the array of
1316 		 * componet pathnames and device numbers.
1317 		 */
1318 		cpp = kmalloc(ccio->ccio_ndisks * sizeof(char *),
1319 		    M_DEVBUF, M_WAITOK);
1320 		vpp = kmalloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1321 		    M_DEVBUF, M_WAITOK);
1322 
1323 		error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1324 		    ccio->ccio_ndisks * sizeof(char **));
1325 		if (error) {
1326 			kfree(vpp, M_DEVBUF);
1327 			kfree(cpp, M_DEVBUF);
1328 			ccdunlock(cs);
1329 			return (error);
1330 		}
1331 
1332 #ifdef DEBUG
1333 		if (ccddebug & CCDB_INIT)
1334 			for (i = 0; i < ccio->ccio_ndisks; ++i)
1335 				kprintf("ccdioctl: component %d: 0x%x\n",
1336 				    i, cpp[i]);
1337 #endif
1338 
1339 		for (i = 0; i < ccio->ccio_ndisks; ++i) {
1340 #ifdef DEBUG
1341 			if (ccddebug & CCDB_INIT)
1342 				kprintf("ccdioctl: lookedup = %d\n", lookedup);
1343 #endif
1344 			if ((error = ccdlookup(cpp[i], &vpp[i])) != 0) {
1345 				for (j = 0; j < lookedup; ++j)
1346 					(void)vn_close(vpp[j], FREAD|FWRITE);
1347 				kfree(vpp, M_DEVBUF);
1348 				kfree(cpp, M_DEVBUF);
1349 				ccdunlock(cs);
1350 				return (error);
1351 			}
1352 			++lookedup;
1353 		}
1354 		ccd.ccd_cpp = cpp;
1355 		ccd.ccd_vpp = vpp;
1356 		ccd.ccd_ndev = ccio->ccio_ndisks;
1357 
1358 		/*
1359 		 * Initialize the ccd.  Fills in the softc for us.
1360 		 */
1361 		if ((error = ccdinit(&ccd, cpp, ap->a_cred)) != 0) {
1362 			for (j = 0; j < lookedup; ++j)
1363 				(void)vn_close(vpp[j], FREAD|FWRITE);
1364 			bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1365 			kfree(vpp, M_DEVBUF);
1366 			kfree(cpp, M_DEVBUF);
1367 			ccdunlock(cs);
1368 			return (error);
1369 		}
1370 
1371 		/*
1372 		 * The ccd has been successfully initialized, so
1373 		 * we can place it into the array and read the disklabel.
1374 		 */
1375 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1376 		ccio->ccio_unit = unit;
1377 		ccio->ccio_size = cs->sc_size;
1378 		ccdgetdisklabel(dev);
1379 
1380 		ccdunlock(cs);
1381 
1382 		break;
1383 
1384 	case CCDIOCCLR:
1385 		if ((cs->sc_flags & CCDF_INITED) == 0)
1386 			return (ENXIO);
1387 
1388 		if ((ap->a_fflag & FWRITE) == 0)
1389 			return (EBADF);
1390 
1391 		if ((error = ccdlock(cs)) != 0)
1392 			return (error);
1393 
1394 		/* Don't unconfigure if any other partitions are open */
1395 		part = ccdpart(dev);
1396 		pmask = (1 << part);
1397 		if ((cs->sc_openmask & ~pmask)) {
1398 			ccdunlock(cs);
1399 			return (EBUSY);
1400 		}
1401 
1402 		/*
1403 		 * Free ccd_softc information and clear entry.
1404 		 */
1405 
1406 		/* Close the components and free their pathnames. */
1407 		for (i = 0; i < cs->sc_nccdisks; ++i) {
1408 			/*
1409 			 * XXX: this close could potentially fail and
1410 			 * cause Bad Things.  Maybe we need to force
1411 			 * the close to happen?
1412 			 */
1413 #ifdef DEBUG
1414 			if (ccddebug & CCDB_VNODE)
1415 				vprint("CCDIOCCLR: vnode info",
1416 				    cs->sc_cinfo[i].ci_vp);
1417 #endif
1418 			(void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE);
1419 			kfree(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1420 		}
1421 
1422 		/* Free interleave index. */
1423 		for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1424 			kfree(cs->sc_itable[i].ii_index, M_DEVBUF);
1425 
1426 		/* Free component info and interleave table. */
1427 		kfree(cs->sc_cinfo, M_DEVBUF);
1428 		kfree(cs->sc_itable, M_DEVBUF);
1429 		cs->sc_flags &= ~CCDF_INITED;
1430 
1431 		/*
1432 		 * Free ccddevice information and clear entry.
1433 		 */
1434 		kfree(ccddevs[unit].ccd_cpp, M_DEVBUF);
1435 		kfree(ccddevs[unit].ccd_vpp, M_DEVBUF);
1436 		ccd.ccd_dk = -1;
1437 		bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1438 
1439 		/*
1440 		 * And remove the devstat entry.
1441 		 */
1442 		devstat_remove_entry(&cs->device_stats);
1443 
1444 		/* This must be atomic. */
1445 		crit_enter();
1446 		ccdunlock(cs);
1447 		bzero(cs, sizeof(struct ccd_softc));
1448 		crit_exit();
1449 
1450 		break;
1451 
1452 	case DIOCGDINFO:
1453 		if ((cs->sc_flags & CCDF_INITED) == 0)
1454 			return (ENXIO);
1455 
1456 		*(struct disklabel *)ap->a_data = cs->sc_label;
1457 		break;
1458 
1459 	case DIOCGPART:
1460 		if ((cs->sc_flags & CCDF_INITED) == 0)
1461 			return (ENXIO);
1462 
1463 		((struct partinfo *)ap->a_data)->disklab = &cs->sc_label;
1464 		((struct partinfo *)ap->a_data)->part =
1465 		    &cs->sc_label.d_partitions[ccdpart(dev)];
1466 		break;
1467 
1468 	case DIOCWDINFO:
1469 	case DIOCSDINFO:
1470 		if ((cs->sc_flags & CCDF_INITED) == 0)
1471 			return (ENXIO);
1472 
1473 		if ((ap->a_fflag & FWRITE) == 0)
1474 			return (EBADF);
1475 
1476 		if ((error = ccdlock(cs)) != 0)
1477 			return (error);
1478 
1479 		cs->sc_flags |= CCDF_LABELLING;
1480 
1481 		error = setdisklabel(&cs->sc_label,
1482 		    (struct disklabel *)ap->a_data, 0);
1483 		if (error == 0) {
1484 			if (ap->a_cmd == DIOCWDINFO) {
1485 				cdev_t cdev = CCDLABELDEV(dev);
1486 				error = writedisklabel(cdev, &cs->sc_label);
1487 			}
1488 		}
1489 
1490 		cs->sc_flags &= ~CCDF_LABELLING;
1491 
1492 		ccdunlock(cs);
1493 
1494 		if (error)
1495 			return (error);
1496 		break;
1497 
1498 	case DIOCWLABEL:
1499 		if ((cs->sc_flags & CCDF_INITED) == 0)
1500 			return (ENXIO);
1501 
1502 		if ((ap->a_fflag & FWRITE) == 0)
1503 			return (EBADF);
1504 		if (*(int *)ap->a_data != 0)
1505 			cs->sc_flags |= CCDF_WLABEL;
1506 		else
1507 			cs->sc_flags &= ~CCDF_WLABEL;
1508 		break;
1509 
1510 	default:
1511 		return (ENOTTY);
1512 	}
1513 
1514 	return (0);
1515 }
1516 
1517 static int
1518 ccdsize(struct dev_psize_args *ap)
1519 {
1520 	cdev_t dev = ap->a_head.a_dev;
1521 	struct ccd_softc *cs;
1522 	int part, size;
1523 
1524 	if (dev_dopen(dev, 0, S_IFCHR, proc0.p_ucred))
1525 		return (-1);
1526 
1527 	cs = &ccd_softc[ccdunit(dev)];
1528 	part = ccdpart(dev);
1529 
1530 	if ((cs->sc_flags & CCDF_INITED) == 0)
1531 		return (-1);
1532 
1533 	if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1534 		size = -1;
1535 	else
1536 		size = cs->sc_label.d_partitions[part].p_size;
1537 
1538 	if (dev_dclose(dev, 0, S_IFCHR))
1539 		return (-1);
1540 
1541 	ap->a_result = size;
1542 	return(0);
1543 }
1544 
1545 static int
1546 ccddump(struct dev_dump_args *ap)
1547 {
1548 	/* Not implemented. */
1549 	return ENXIO;
1550 }
1551 
1552 /*
1553  * Lookup the provided name in the filesystem.  If the file exists,
1554  * is a valid block device, and isn't being used by anyone else,
1555  * set *vpp to the file's vnode.
1556  */
1557 static int
1558 ccdlookup(char *path, struct vnode **vpp)
1559 {
1560 	struct nlookupdata nd;
1561 	struct vnode *vp;
1562 	int error;
1563 
1564 	*vpp = NULL;
1565 
1566 	error = nlookup_init(&nd, path, UIO_USERSPACE, NLC_FOLLOW|NLC_LOCKVP);
1567 	if (error)
1568 		return (error);
1569 	if ((error = vn_open(&nd, NULL, FREAD|FWRITE, 0)) != 0) {
1570 #ifdef DEBUG
1571 		if (ccddebug & CCDB_FOLLOW|CCDB_INIT)
1572 			kprintf("ccdlookup: vn_open error = %d\n", error);
1573 #endif
1574 		goto done;
1575 	}
1576 	vp = nd.nl_open_vp;
1577 
1578 	if (vp->v_usecount > 1) {
1579 		error = EBUSY;
1580 		goto done;
1581 	}
1582 
1583 	if (!vn_isdisk(vp, &error))
1584 		goto done;
1585 
1586 #ifdef DEBUG
1587 	if (ccddebug & CCDB_VNODE)
1588 		vprint("ccdlookup: vnode info", vp);
1589 #endif
1590 
1591 	vn_unlock(vp);
1592 	nd.nl_open_vp = NULL;
1593 	nlookup_done(&nd);
1594 	*vpp = vp;				/* leave ref intact  */
1595 	return (0);
1596 done:
1597 	nlookup_done(&nd);
1598 	return (error);
1599 }
1600 
1601 /*
1602  * Read the disklabel from the ccd.  If one is not present, fake one
1603  * up.
1604  */
1605 static void
1606 ccdgetdisklabel(cdev_t dev)
1607 {
1608 	int unit = ccdunit(dev);
1609 	struct ccd_softc *cs = &ccd_softc[unit];
1610 	char *errstring;
1611 	struct disklabel *lp = &cs->sc_label;
1612 	struct ccdgeom *ccg = &cs->sc_geom;
1613 	cdev_t cdev;
1614 
1615 	bzero(lp, sizeof(*lp));
1616 
1617 	lp->d_secperunit = cs->sc_size;
1618 	lp->d_secsize = ccg->ccg_secsize;
1619 	lp->d_nsectors = ccg->ccg_nsectors;
1620 	lp->d_ntracks = ccg->ccg_ntracks;
1621 	lp->d_ncylinders = ccg->ccg_ncylinders;
1622 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1623 
1624 	strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1625 	lp->d_type = DTYPE_CCD;
1626 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1627 	lp->d_rpm = 3600;
1628 	lp->d_interleave = 1;
1629 	lp->d_flags = 0;
1630 
1631 	lp->d_partitions[RAW_PART].p_offset = 0;
1632 	lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1633 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1634 	lp->d_npartitions = RAW_PART + 1;
1635 
1636 	lp->d_bbsize = BBSIZE;				/* XXX */
1637 	lp->d_sbsize = SBSIZE;				/* XXX */
1638 
1639 	lp->d_magic = DISKMAGIC;
1640 	lp->d_magic2 = DISKMAGIC;
1641 	lp->d_checksum = dkcksum(&cs->sc_label);
1642 
1643 	/*
1644 	 * Call the generic disklabel extraction routine.
1645 	 */
1646 	cdev = CCDLABELDEV(dev);
1647 	errstring = readdisklabel(cdev, &cs->sc_label);
1648 	if (errstring != NULL)
1649 		ccdmakedisklabel(cs);
1650 
1651 #ifdef DEBUG
1652 	/* It's actually extremely common to have unlabeled ccds. */
1653 	if (ccddebug & CCDB_LABEL)
1654 		if (errstring != NULL)
1655 			kprintf("ccd%d: %s\n", unit, errstring);
1656 #endif
1657 }
1658 
1659 /*
1660  * Take care of things one might want to take care of in the event
1661  * that a disklabel isn't present.
1662  */
1663 static void
1664 ccdmakedisklabel(struct ccd_softc *cs)
1665 {
1666 	struct disklabel *lp = &cs->sc_label;
1667 
1668 	/*
1669 	 * For historical reasons, if there's no disklabel present
1670 	 * the raw partition must be marked FS_BSDFFS.
1671 	 */
1672 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1673 
1674 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1675 }
1676 
1677 /*
1678  * Wait interruptibly for an exclusive lock.
1679  *
1680  * XXX
1681  * Several drivers do this; it should be abstracted and made MP-safe.
1682  */
1683 static int
1684 ccdlock(struct ccd_softc *cs)
1685 {
1686 	int error;
1687 
1688 	while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1689 		cs->sc_flags |= CCDF_WANTED;
1690 		if ((error = tsleep(cs, PCATCH, "ccdlck", 0)) != 0)
1691 			return (error);
1692 	}
1693 	cs->sc_flags |= CCDF_LOCKED;
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Unlock and wake up any waiters.
1699  */
1700 static void
1701 ccdunlock(struct ccd_softc *cs)
1702 {
1703 
1704 	cs->sc_flags &= ~CCDF_LOCKED;
1705 	if ((cs->sc_flags & CCDF_WANTED) != 0) {
1706 		cs->sc_flags &= ~CCDF_WANTED;
1707 		wakeup(cs);
1708 	}
1709 }
1710 
1711 #ifdef DEBUG
1712 static void
1713 printiinfo(struct ccdiinfo *ii)
1714 {
1715 	int ix, i;
1716 
1717 	for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1718 		kprintf(" itab[%d]: #dk %d sblk %d soff %d",
1719 		       ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1720 		for (i = 0; i < ii->ii_ndisk; i++)
1721 			kprintf(" %d", ii->ii_index[i]);
1722 		kprintf("\n");
1723 	}
1724 }
1725 #endif
1726 
1727 
1728 /* Local Variables: */
1729 /* c-argdecl-indent: 8 */
1730 /* c-continued-statement-offset: 8 */
1731 /* c-indent-level: 8 */
1732 /* End: */
1733