/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elischer,
 *							All rights reserved.
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/tree.h>
#include <sys/syslink_rpc.h>
#include <sys/proc.h>
#include <machine/stdarg.h>
#include <sys/devfs.h>
#include <sys/dsched.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

static int mpsafe_writes;
static int mplock_writes;
static int mpsafe_reads;
static int mplock_reads;
static int mpsafe_strategies;
static int mplock_strategies;

SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
	   0, "mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
	   0, "non-mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
	   0, "mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
	   0, "non-mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
	   0, "mpsafe strategies");
SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
	   0, "non-mpsafe strategies");

/*
 * System link descriptors identify the command in the
 * arguments structure.
 */
#define DDESCNAME(name) __CONCAT(__CONCAT(dev_,name),_desc)

#define DEVOP_DESC_INIT(name)						\
	    struct syslink_desc DDESCNAME(name) = {			\
		__offsetof(struct dev_ops, __CONCAT(d_, name)),	\
	    #name }

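/*
 * For example, DEVOP_DESC_INIT(open) expands to:
 *
 *	struct syslink_desc dev_open_desc = {
 *		__offsetof(struct dev_ops, d_open),
 *		"open"
 *	};
 *
 * pairing each operation's offset within struct dev_ops with its name.
 */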
DEVOP_DESC_INIT(default);
DEVOP_DESC_INIT(open);
DEVOP_DESC_INIT(close);
DEVOP_DESC_INIT(read);
DEVOP_DESC_INIT(write);
DEVOP_DESC_INIT(ioctl);
DEVOP_DESC_INIT(dump);
DEVOP_DESC_INIT(psize);
DEVOP_DESC_INIT(mmap);
DEVOP_DESC_INIT(strategy);
DEVOP_DESC_INIT(kqfilter);
DEVOP_DESC_INIT(revoke);
DEVOP_DESC_INIT(clone);

/*
 * Misc default ops
 */
struct dev_ops dead_dev_ops;

struct dev_ops default_dev_ops = {
	{ "null" },
	.d_default = NULL,	/* must be NULL */
	.d_open = noopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_mmap = nommap,
	.d_strategy = nostrategy,
	.d_dump = nodump,
	.d_psize = nopsize,
	.d_kqfilter = nokqfilter,
	.d_revoke = norevoke,
	.d_clone = noclone
};

/*
 * Returns non-zero when the device's ops are not marked D_MPSAFE,
 * in which case the call into the driver must be bracketed by the
 * MP lock.
 */
static __inline
int
dev_needmplock(cdev_t dev)
{
    return((dev->si_ops->head.flags & D_MPSAFE) == 0);
}

/************************************************************************
 *			GENERAL DEVICE API FUNCTIONS			*
 ************************************************************************
 *
 * Whether each of these is MP-safe depends on dev->si_ops->head.flags.
 */
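/*
 * Each wrapper below follows the same pattern: fill in a dev_<op>_args
 * message stamped with the matching syslink descriptor, take the MP
 * lock when the driver is not D_MPSAFE, and dispatch through the
 * device's si_ops vector.  A hypothetical call of the open entry,
 * with cred being the caller's ucred, looks like:
 *
 *	error = dev_dopen(dev, FREAD, S_IFCHR, cred);
 *
 * Illustrative only; the real callers live in the devfs/spec layers.
 */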
int
dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred)
{
	struct dev_open_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_open_desc;
	ap.a_head.a_dev = dev;
	ap.a_oflags = oflags;
	ap.a_devtype = devtype;
	ap.a_cred = cred;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_open(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dclose(cdev_t dev, int fflag, int devtype)
{
	struct dev_close_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_close_desc;
	ap.a_head.a_dev = dev;
	ap.a_fflag = fflag;
	ap.a_devtype = devtype;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_close(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dread(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_read_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_read_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_reads;
	} else {
		++mpsafe_reads;
	}
	error = dev->si_ops->d_read(&ap);
	if (needmplock)
		rel_mplock();
	if (error == 0)
		dev->si_lastread = time_second;
	return (error);
}

int
dev_dwrite(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_write_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	dev->si_lastwrite = time_second;
	ap.a_head.a_desc = &dev_write_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_writes;
	} else {
		++mpsafe_writes;
	}
	error = dev->si_ops->d_write(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
	   struct sysmsg *msg)
{
	struct dev_ioctl_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_ioctl_desc;
	ap.a_head.a_dev = dev;
	ap.a_cmd = cmd;
	ap.a_data = data;
	ap.a_fflag = fflag;
	ap.a_cred = cred;
	ap.a_sysmsg = msg;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_ioctl(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot)
{
	struct dev_mmap_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_nprot = nprot;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(-1);
}

int
dev_dclone(cdev_t dev)
{
	struct dev_clone_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_clone_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_clone(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int
dev_drevoke(cdev_t dev)
{
	struct dev_revoke_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_revoke_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_revoke(&ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Core device strategy call, used to issue I/O on a device.  There are
 * two versions, a non-chained version and a chained version.  The chained
 * version reuses a BIO set up by vn_strategy().  The only difference is
 * that, for now, we do not push a new tracking structure when chaining
 * from vn_strategy.  XXX this will ultimately have to change.
 */
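/*
 * Sketch of a direct, untracked read issued at the device (assumes
 * the caller has already set up buffer bp; names are illustrative):
 *
 *	bp->b_cmd = BUF_CMD_READ;
 *	dev_dstrategy(dev, &bp->b_bio1);
 *	error = biowait(&bp->b_bio1, "drdwait");
 *
 * dev_dstrategy() installs the device's read/write bio_track itself,
 * whereas dev_dstrategy_chain() requires the track to be set already.
 */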
void
dev_dstrategy(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	struct bio_track *track;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track == NULL);
	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
	    track = &dev->si_track_read;
	else
	    track = &dev->si_track_write;
	bio_track_ref(track);
	bio->bio_track = track;

	if (dsched_is_clear_buf_priv(bio->bio_buf))
		dsched_new_buf(bio->bio_buf);

	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock) {
		get_mplock();
		++mplock_strategies;
	} else {
		++mpsafe_strategies;
	}
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

void
dev_dstrategy_chain(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track != NULL);
	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock)
		get_mplock();
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

/*
 * Note: the disk layer is expected to set count, blkno, and secsize before
 * forwarding the message.
 */
int
dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct dev_dump_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_dump_desc;
	ap.a_head.a_dev = dev;
	ap.a_count = 0;
	ap.a_blkno = 0;
	ap.a_secsize = 0;
	ap.a_virtual = virtual;
	ap.a_physical = physical;
	ap.a_offset = offset;
	ap.a_length = length;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_dump(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

int64_t
dev_dpsize(cdev_t dev)
{
	struct dev_psize_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_psize_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_psize(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return (ap.a_result);
	return(-1);
}

/*
 * Pass-thru to the device kqfilter.
 *
 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions
 *	 which return 0 do not have to bother setting a_result.
 */
int
dev_dkqfilter(cdev_t dev, struct knote *kn)
{
	struct dev_kqfilter_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_kqfilter_desc;
	ap.a_head.a_dev = dev;
	ap.a_kn = kn;
	ap.a_result = 0;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_kqfilter(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(ENODEV);
}

/************************************************************************
 *			DEVICE HELPER FUNCTIONS				*
 ************************************************************************/

/*
 * MPSAFE
 */
int
dev_drefs(cdev_t dev)
{
    return(dev->si_sysref.refcnt);
}

/*
 * MPSAFE
 */
const char *
dev_dname(cdev_t dev)
{
    return(dev->si_ops->head.name);
}

/*
 * MPSAFE
 */
int
dev_dflags(cdev_t dev)
{
    return(dev->si_ops->head.flags);
}

/*
 * MPSAFE
 */
int
dev_dmaj(cdev_t dev)
{
    return(dev->si_ops->head.maj);
}

/*
 * Used when forwarding a request through layers.  The caller adjusts
 * ap->a_head.a_dev and then calls this function.
 */
int
dev_doperate(struct dev_generic_args *ap)
{
    int (*func)(struct dev_generic_args *);
    int needmplock = dev_needmplock(ap->a_dev);
    int error;

    func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);

    if (needmplock)
	    get_mplock();
    error = func(ap);
    if (needmplock)
	    rel_mplock();

    return (error);
}
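
/*
 * For example, a layering driver forwards a message downward like this
 * (sketch; lower_dev stands for whatever device the layer wraps):
 *
 *	ap->a_head.a_dev = lower_dev;
 *	return (dev_doperate(&ap->a_head));
 */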

/*
 * Used by the console intercept code only.  Issue an operation through
 * a foreign ops structure while leaving the ops structure associated
 * with the device intact.
 */
int
dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
{
    int (*func)(struct dev_generic_args *);
    int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
    int error;

    func = *(void **)((char *)ops + ap->a_desc->sd_offset);

    if (needmplock)
	    get_mplock();
    error = func(ap);
    if (needmplock)
	    rel_mplock();

    return (error);
}

/*
 * Convert a template dev_ops into the real thing by filling in
 * uninitialized fields.
 */
void
compile_dev_ops(struct dev_ops *ops)
{
	int offset;

	/*
	 * Walk every function pointer slot in the ops vector, replacing
	 * NULL slots with the template's d_default, or failing that with
	 * the no-op entry from default_dev_ops.
	 */
	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
	     offset += sizeof(void *)
	) {
		void **func_p = (void **)((char *)ops + offset);
		void **def_p = (void **)((char *)&default_dev_ops + offset);
		if (*func_p == NULL) {
			if (ops->d_default)
				*func_p = ops->d_default;
			else
				*func_p = *def_p;
		}
	}
}
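
/*
 * For example, a template that supplies only open and read (mydev_*
 * are hypothetical):
 *
 *	static struct dev_ops mydev_ops = {
 *		{ "mydev" },
 *		.d_open = mydev_open,
 *		.d_read = mydev_read
 *	};
 *
 * comes out of compile_dev_ops() with every other slot pointing at the
 * matching no*() stub from default_dev_ops.
 */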

/************************************************************************
 *			MAJOR/MINOR SPACE FUNCTIONS			*
 ************************************************************************/

/*
 * This makes a dev_ops entry visible to userland (e.g. /dev/<blah>).
 *
 * Disk devices typically register their major device, e.g. 'ad0', and the
 * disk label management code then overlays its own entries on top of 'ad0'
 * to support all the various slice and partition combinations.
 *
 * The mask/match supplied in this call are a full 32 bits and the same
 * mask and match must be specified in a later dev_ops_remove() call to
 * match this add.  However, the match value for the minor number should never
 * have any bits set in the major number's bit range (8-15).  The mask value
 * may be conveniently specified as -1 without creating any major number
 * interference.
 */

static
int
rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b)
{
    if (a->maj < b->maj)
	return(-1);
    else if (a->maj > b->maj)
	return(1);
    return(0);
}

RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj);

struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead);

int
dev_ops_remove_all(struct dev_ops *ops)
{
	return devfs_destroy_dev_by_ops(ops, -1);
}

int
dev_ops_remove_minor(struct dev_ops *ops, int minor)
{
	return devfs_destroy_dev_by_ops(ops, minor);
}

struct dev_ops *
dev_ops_intercept(cdev_t dev, struct dev_ops *iops)
{
	struct dev_ops *oops = dev->si_ops;

	compile_dev_ops(iops);
	iops->head.maj = oops->head.maj;
	iops->head.data = oops->head.data;
	iops->head.flags = oops->head.flags;
	dev->si_ops = iops;
	dev->si_flags |= SI_INTERCEPTED;

	return (oops);
}

void
dev_ops_restore(cdev_t dev, struct dev_ops *oops)
{
	struct dev_ops *iops = dev->si_ops;

	dev->si_ops = oops;
	dev->si_flags &= ~SI_INTERCEPTED;
	iops->head.maj = 0;
	iops->head.data = NULL;
	iops->head.flags = 0;
}
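
/*
 * Typical intercept bracketing (sketch; my_iops is hypothetical):
 *
 *	oops = dev_ops_intercept(dev, &my_iops);
 *	...	all operations on dev now route through my_iops ...
 *	dev_ops_restore(dev, oops);
 */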

/************************************************************************
 *			DEFAULT DEV OPS FUNCTIONS			*
 ************************************************************************/


/*
 * Unsupported devswitch functions (e.g. for writing to read-only device).
 * XXX may belong elsewhere.
 */
int
norevoke(struct dev_revoke_args *ap)
{
	/* take no action */
	return(0);
}

int
noclone(struct dev_clone_args *ap)
{
	/* take no action */
	return (0);	/* allow the clone */
}

int
noopen(struct dev_open_args *ap)
{
	return (ENODEV);
}

int
noclose(struct dev_close_args *ap)
{
	return (ENODEV);
}

int
noread(struct dev_read_args *ap)
{
	return (ENODEV);
}

int
nowrite(struct dev_write_args *ap)
{
	return (ENODEV);
}

int
noioctl(struct dev_ioctl_args *ap)
{
	return (ENODEV);
}

int
nokqfilter(struct dev_kqfilter_args *ap)
{
	return (ENODEV);
}

int
nommap(struct dev_mmap_args *ap)
{
	return (ENODEV);
}

int
nostrategy(struct dev_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;

	bio->bio_buf->b_flags |= B_ERROR;
	bio->bio_buf->b_error = EOPNOTSUPP;
	biodone(bio);
	return(0);
}

int
nopsize(struct dev_psize_args *ap)
{
	ap->a_result = 0;
	return(0);
}

int
nodump(struct dev_dump_args *ap)
{
	return (ENODEV);
}

/*
 * XXX this is probably bogus.  Any device that uses it isn't checking the
 * minor number.
 */
int
nullopen(struct dev_open_args *ap)
{
	return (0);
}

int
nullclose(struct dev_close_args *ap)
{
	return (0);
}
737