/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elischer,
 *							All rights reserved.
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_device.c,v 1.27 2007/07/23 18:59:50 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/tree.h>
#include <sys/syslink_rpc.h>
#include <sys/proc.h>
#include <machine/stdarg.h>
#include <sys/devfs.h>
#include <sys/dsched.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

static int mpsafe_writes;
static int mplock_writes;
static int mpsafe_reads;
static int mplock_reads;
static int mpsafe_strategies;
static int mplock_strategies;

SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
	   0, "mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
	   0, "non-mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
	   0, "mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
	   0, "non-mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
	   0, "mpsafe strategies");
SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
	   0, "non-mpsafe strategies");
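
/*
 * The counters above are exported read-only via sysctl, so the MP-safe
 * vs. mplock I/O split can be inspected from userland, e.g. (from a
 * shell):
 *
 *	sysctl kern.mpsafe_reads kern.mplock_reads
 */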

/*
 * System link descriptors identify the command in the
 * arguments structure.
 */
#define DDESCNAME(name) __CONCAT(__CONCAT(dev_,name),_desc)

#define DEVOP_DESC_INIT(name)						\
	    struct syslink_desc DDESCNAME(name) = {			\
		__offsetof(struct dev_ops, __CONCAT(d_, name)),	\
	    #name }
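
/*
 * For illustration, DEVOP_DESC_INIT(open) expands to roughly:
 *
 *	struct syslink_desc dev_open_desc = {
 *		__offsetof(struct dev_ops, d_open),
 *		"open"
 *	};
 *
 * Each descriptor thus records the byte offset of its function pointer
 * within struct dev_ops; dev_doperate() below uses that offset to
 * dispatch forwarded requests.
 */
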
DEVOP_DESC_INIT(default);
DEVOP_DESC_INIT(open);
DEVOP_DESC_INIT(close);
DEVOP_DESC_INIT(read);
DEVOP_DESC_INIT(write);
DEVOP_DESC_INIT(ioctl);
DEVOP_DESC_INIT(dump);
DEVOP_DESC_INIT(psize);
DEVOP_DESC_INIT(mmap);
DEVOP_DESC_INIT(strategy);
DEVOP_DESC_INIT(kqfilter);
DEVOP_DESC_INIT(revoke);
DEVOP_DESC_INIT(clone);

/*
 * Misc default ops
 */
struct dev_ops dead_dev_ops;

struct dev_ops default_dev_ops = {
	{ "null" },
	.d_default = NULL,	/* must be NULL */
	.d_open = noopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_mmap = nommap,
	.d_strategy = nostrategy,
	.d_dump = nodump,
	.d_psize = nopsize,
	.d_kqfilter = nokqfilter,
	.d_revoke = norevoke,
	.d_clone = noclone
};

static __inline
int
dev_needmplock(cdev_t dev)
{
    return((dev->si_ops->head.flags & D_MPSAFE) == 0);
}
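
/*
 * Illustrative sketch (a hypothetical driver, not part of this file):
 * an ops structure that declares D_MPSAFE in its head makes
 * dev_needmplock() return 0, so the wrappers below skip the
 * get_mplock()/rel_mplock() bracketing:
 *
 *	static struct dev_ops exmpl_ops = {
 *		{ "exmpl", 0, D_MPSAFE },
 *		.d_open =	exmpl_open,
 *		.d_close =	exmpl_close,
 *		.d_read =	exmpl_read,
 *		.d_write =	exmpl_write
 *	};
 */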

/************************************************************************
 *			GENERAL DEVICE API FUNCTIONS			*
 ************************************************************************
 *
 * The MPSAFEness of these depends on dev->si_ops->head.flags
 */
int
dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred)
{
	struct dev_open_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_open_desc;
	ap.a_head.a_dev = dev;
	ap.a_oflags = oflags;
	ap.a_devtype = devtype;
	ap.a_cred = cred;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_open(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dclose(cdev_t dev, int fflag, int devtype)
{
	struct dev_close_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_close_desc;
	ap.a_head.a_dev = dev;
	ap.a_fflag = fflag;
	ap.a_devtype = devtype;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_close(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dread(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_read_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_read_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_reads;
	} else {
		++mpsafe_reads;
	}
	error = dev->si_ops->d_read(&ap);
	if (needmplock)
		rel_mplock();
	if (error == 0)
		dev->si_lastread = time_second;
	return (error);
}

int
dev_dwrite(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_write_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	dev->si_lastwrite = time_second;
	ap.a_head.a_desc = &dev_write_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_writes;
	} else {
		++mpsafe_writes;
	}
	error = dev->si_ops->d_write(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
	   struct sysmsg *msg)
{
	struct dev_ioctl_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_ioctl_desc;
	ap.a_head.a_dev = dev;
	ap.a_cmd = cmd;
	ap.a_data = data;
	ap.a_fflag = fflag;
	ap.a_cred = cred;
	ap.a_sysmsg = msg;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_ioctl(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot)
{
	struct dev_mmap_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_nprot = nprot;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(-1);
}

int
dev_dclone(cdev_t dev)
{
	struct dev_clone_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_clone_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_clone(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_drevoke(cdev_t dev)
{
	struct dev_revoke_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_revoke_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_revoke(&ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Core device strategy call, used to issue I/O on a device.  There are
 * two versions, a non-chained version and a chained version.  The chained
 * version reuses a BIO set up by vn_strategy().  The only difference is
 * that, for now, we do not push a new tracking structure when chaining
 * from vn_strategy.  XXX this will ultimately have to change.
 */
void
dev_dstrategy(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	struct bio_track *track;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track == NULL);
	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
	    track = &dev->si_track_read;
	else
	    track = &dev->si_track_write;
	bio_track_ref(track);
	bio->bio_track = track;

	if (dsched_is_clear_buf_priv(bio->bio_buf))
		dsched_new_buf(bio->bio_buf);

	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock) {
		get_mplock();
		++mplock_strategies;
	} else {
		++mpsafe_strategies;
	}
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

void
dev_dstrategy_chain(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track != NULL);
	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock)
		get_mplock();
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}
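
/*
 * Illustrative call patterns (a sketch; bp/nbio are hypothetical):
 *
 *	dev_dstrategy(dev, &bp->b_bio1);	fresh I/O; this code pushes
 *						the bio_track itself
 *	dev_dstrategy_chain(dev, nbio);		nbio was set up by
 *						vn_strategy(); tracking is
 *						already in place
 */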

/*
 * Note: the disk layer is expected to set count, blkno, and secsize before
 * forwarding the message.
 */
int
dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct dev_dump_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_dump_desc;
	ap.a_head.a_dev = dev;
	ap.a_count = 0;
	ap.a_blkno = 0;
	ap.a_secsize = 0;
	ap.a_virtual = virtual;
	ap.a_physical = physical;
	ap.a_offset = offset;
	ap.a_length = length;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_dump(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}
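
/*
 * Hypothetical forwarding sketch (media_blksize and rawdev are
 * assumptions, not names from this file): a disk-layer d_dump would
 * fill in the fields this function zeroes and then pass the message
 * down, e.g.:
 *
 *	ap->a_secsize = media_blksize;
 *	ap->a_blkno = ap->a_offset / ap->a_secsize;
 *	ap->a_count = ap->a_length / ap->a_secsize;
 *	ap->a_head.a_dev = rawdev;
 *	return (dev_doperate(&ap->a_head));
 */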

int64_t
dev_dpsize(cdev_t dev)
{
	struct dev_psize_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_psize_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_psize(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return (ap.a_result);
	return(-1);
}

/*
 * Pass-thru to the device kqfilter.
 *
 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions
 *	 which return 0 do not have to bother setting a_result.
 */
int
dev_dkqfilter(cdev_t dev, struct knote *kn)
{
	struct dev_kqfilter_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_kqfilter_desc;
	ap.a_head.a_dev = dev;
	ap.a_kn = kn;
	ap.a_result = 0;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_kqfilter(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(ENODEV);
}
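
/*
 * Hypothetical d_kqfilter sketch (exmpl_* names are assumptions): a
 * driver that accepts the filter can simply return 0 and rely on the
 * preset a_result; unsupported filters report through a_result instead:
 *
 *	static int
 *	exmpl_kqfilter(struct dev_kqfilter_args *ap)
 *	{
 *		switch (ap->a_kn->kn_filter) {
 *		case EVFILT_READ:
 *			ap->a_kn->kn_fop = &exmpl_rfiltops;
 *			break;
 *		default:
 *			ap->a_result = EOPNOTSUPP;
 *			break;
 *		}
 *		return (0);
 *	}
 */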

/************************************************************************
 *			DEVICE HELPER FUNCTIONS				*
 ************************************************************************/

/*
 * MPSAFE
 */
int
dev_drefs(cdev_t dev)
{
    return(dev->si_sysref.refcnt);
}

/*
 * MPSAFE
 */
const char *
dev_dname(cdev_t dev)
{
    return(dev->si_ops->head.name);
}

/*
 * MPSAFE
 */
int
dev_dflags(cdev_t dev)
{
    return(dev->si_ops->head.flags);
}

/*
 * MPSAFE
 */
int
dev_dmaj(cdev_t dev)
{
    return(dev->si_ops->head.maj);
}

/*
 * Used when forwarding a request through layers.  The caller adjusts
 * ap->a_head.a_dev and then calls this function.
 */
int
dev_doperate(struct dev_generic_args *ap)
{
    int (*func)(struct dev_generic_args *);
    int needmplock = dev_needmplock(ap->a_dev);
    int error;

    func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);

    if (needmplock)
	    get_mplock();
    error = func(ap);
    if (needmplock)
	    rel_mplock();

    return (error);
}
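
/*
 * Illustrative use (a sketch; lower_dev is hypothetical): a layer that
 * wraps another device retargets the message head and forwards it from
 * inside one of its own d_xxx functions:
 *
 *	ap->a_head.a_dev = lower_dev;
 *	return (dev_doperate(&ap->a_head));
 *
 * The a_desc pointer set by the original dev_dxxx() wrapper survives
 * the forward, so the same operation is invoked on the lower device.
 */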

/*
 * Used by the console intercept code only.  Issue an operation through
 * a foreign ops structure allowing the ops structure associated
 * with the device to remain intact.
 */
int
dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
{
    int (*func)(struct dev_generic_args *);
    int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
    int error;

    func = *(void **)((char *)ops + ap->a_desc->sd_offset);

    if (needmplock)
	    get_mplock();
    error = func(ap);
    if (needmplock)
	    rel_mplock();

    return (error);
}

/*
 * Convert a template dev_ops into the real thing by filling in
 * uninitialized fields.
 */
void
compile_dev_ops(struct dev_ops *ops)
{
	int offset;

	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
	     offset += sizeof(void *)
	) {
		void **func_p = (void **)((char *)ops + offset);
		void **def_p = (void **)((char *)&default_dev_ops + offset);
		if (*func_p == NULL) {
			if (ops->d_default)
				*func_p = ops->d_default;
			else
				*func_p = *def_p;
		}
	}
}
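
/*
 * For illustration: a template that only supplies d_open and d_read,
 *
 *	static struct dev_ops exmpl_ops = {
 *		{ "exmpl" },
 *		.d_open = exmpl_open,
 *		.d_read = exmpl_read
 *	};
 *
 * leaves every other member NULL; compile_dev_ops() fills those slots
 * from d_default if set, otherwise from default_dev_ops (noclose,
 * nowrite, etc).
 */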

/************************************************************************
 *			MAJOR/MINOR SPACE FUNCTIONS			*
 ************************************************************************/

/*
 * This section makes dev_ops entries visible to userland (e.g. /dev/<blah>).
 *
 * Disk devices typically register their major, e.g. 'ad0', and then call
 * into the disk label management code which overloads its own onto e.g.
 * 'ad0' to support all the various slice and partition combinations.
 *
 * Removal is handled per-ops via dev_ops_remove_all() or, for a single
 * minor number, dev_ops_remove_minor(); both forward to devfs.
 */
static
int
rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b)
{
    if (a->maj < b->maj)
	return(-1);
    else if (a->maj > b->maj)
	return(1);
    return(0);
}

RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj);

struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead);

int
dev_ops_remove_all(struct dev_ops *ops)
{
	return devfs_destroy_dev_by_ops(ops, -1);
}

int
dev_ops_remove_minor(struct dev_ops *ops, int minor)
{
	return devfs_destroy_dev_by_ops(ops, minor);
}

struct dev_ops *
dev_ops_intercept(cdev_t dev, struct dev_ops *iops)
{
	struct dev_ops *oops = dev->si_ops;

	compile_dev_ops(iops);
	iops->head.maj = oops->head.maj;
	iops->head.data = oops->head.data;
	iops->head.flags = oops->head.flags;
	dev->si_ops = iops;
	dev->si_flags |= SI_INTERCEPTED;

	return (oops);
}

void
dev_ops_restore(cdev_t dev, struct dev_ops *oops)
{
	struct dev_ops *iops = dev->si_ops;

	dev->si_ops = oops;
	dev->si_flags &= ~SI_INTERCEPTED;
	iops->head.maj = 0;
	iops->head.data = NULL;
	iops->head.flags = 0;
}
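
/*
 * Illustrative intercept sequence (a sketch; saved_ops and console_iops
 * are hypothetical): the console code swaps in its own ops and later
 * puts the original back:
 *
 *	saved_ops = dev_ops_intercept(dev, &console_iops);
 *	...				(device now runs console_iops)
 *	dev_ops_restore(dev, saved_ops);
 *
 * dev_ops_intercept() copies maj/data/flags into the intercept ops so
 * flag-sensitive paths (e.g. dev_needmplock()) keep working, and
 * dev_ops_restore() clears them again.
 */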

/************************************************************************
 *			DEFAULT DEV OPS FUNCTIONS			*
 ************************************************************************/

/*
 * Unsupported devswitch functions (e.g. for writing to a read-only device).
 * XXX may belong elsewhere.
 */
int
norevoke(struct dev_revoke_args *ap)
{
	/* take no action */
	return(0);
}

int
noclone(struct dev_clone_args *ap)
{
	/* take no action */
	return (0);	/* allow the clone */
}

int
noopen(struct dev_open_args *ap)
{
	return (ENODEV);
}

int
noclose(struct dev_close_args *ap)
{
	return (ENODEV);
}

int
noread(struct dev_read_args *ap)
{
	return (ENODEV);
}

int
nowrite(struct dev_write_args *ap)
{
	return (ENODEV);
}

int
noioctl(struct dev_ioctl_args *ap)
{
	return (ENODEV);
}

int
nokqfilter(struct dev_kqfilter_args *ap)
{
	return (ENODEV);
}

int
nommap(struct dev_mmap_args *ap)
{
	return (ENODEV);
}

int
nostrategy(struct dev_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;

	bio->bio_buf->b_flags |= B_ERROR;
	bio->bio_buf->b_error = EOPNOTSUPP;
	biodone(bio);
	return(0);
}

int
nopsize(struct dev_psize_args *ap)
{
	ap->a_result = 0;
	return(0);
}

int
nodump(struct dev_dump_args *ap)
{
	return (ENODEV);
}

/*
 * XXX this is probably bogus.  Any device that uses it isn't checking the
 * minor number.
 */
int
nullopen(struct dev_open_args *ap)
{
	return (0);
}

int
nullclose(struct dev_close_args *ap)
{
	return (0);
}