/*-
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_global.h"
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/sglist.h>
#include <sys/sleepqueue.h>
#include <sys/refcount.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/rwlock.h>
#include <sys/mman.h>
#include <sys/stack.h>
#include <sys/sysent.h>
#include <sys/time.h>
#include <sys/user.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>

#include <machine/stdarg.h>

#if defined(__i386__) || defined(__amd64__)
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif

#include <linux/kobject.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/cdev.h>
#include <linux/file.h>
#include <linux/sysfs.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
#include <linux/list.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/compat.h>
#include <linux/io-mapping.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/wait_bit.h>
#include <linux/rcupdate.h>
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>

#if defined(__i386__) || defined(__amd64__)
#include <asm/smp.h>
#include <asm/processor.h>
#endif

#include <xen/xen.h>
#ifdef XENHVM
#undef xen_pv_domain
#undef xen_initial_domain
/* xen/xen-os.h redefines __must_check */
#undef __must_check
#include <xen/xen-os.h>
#endif

SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "LinuxKPI parameters");

int linuxkpi_debug;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
    &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");

int linuxkpi_warn_dump_stack = 0;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN,
    &linuxkpi_warn_dump_stack, 0,
    "Set to enable stack traces from WARN_ON(). Clear to disable.");

static struct timeval lkpi_net_lastlog;
static int lkpi_net_curpps;
static int lkpi_net_maxpps = 99;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,
    &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");

MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");

#include <linux/rbtree.h>
/* Undo Linux compat changes. */
#undef RB_ROOT
#undef file
#undef cdev
#define	RB_ROOT(head)	(head)->rbh_root

static void linux_destroy_dev(struct linux_cdev *);
static void linux_cdev_deref(struct linux_cdev *ldev);
static struct vm_area_struct *linux_cdev_handle_find(void *handle);

cpumask_t cpu_online_mask;
static cpumask_t **static_single_cpu_mask;
static cpumask_t *static_single_cpu_mask_lcs;
struct kobject linux_class_root;
struct device linux_root_device;
struct class linux_class_misc;
struct list_head pci_drivers;
struct list_head pci_devices;
spinlock_t pci_lock;
struct uts_namespace init_uts_ns;

unsigned long linux_timer_hz_mask;

wait_queue_head_t linux_bit_waitq;
wait_queue_head_t linux_var_waitq;

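/*
 * The Linux rbtree API links nodes into place itself and never goes
 * through the comparator that RB_GENERATE() requires, so panic_cmp()
 * exists only to satisfy the macro; any actual call would be a bug.
 */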
int
panic_cmp(struct rb_node *one, struct rb_node *two)
{
	panic("no cmp");
}

RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);

#define	START(node)	((node)->start)
#define	LAST(node)	((node)->last)

INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START,
    LAST,, lkpi_interval_tree)

static void
linux_device_release(struct device *dev)
{
	pr_debug("linux_device_release: %s\n", dev_name(dev));
	kfree(dev);
}

static ssize_t
linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct class, kobj),
		    dattr, buf);
	return (error);
}

static ssize_t
linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct class, kobj),
		    dattr, buf, count);
	return (error);
}

static void
linux_class_release(struct kobject *kobj)
{
	struct class *class;

	class = container_of(kobj, struct class, kobj);
	if (class->class_release)
		class->class_release(class);
}

static const struct sysfs_ops linux_class_sysfs = {
	.show = linux_class_show,
	.store = linux_class_store,
};

const struct kobj_type linux_class_ktype = {
	.release = linux_class_release,
	.sysfs_ops = &linux_class_sysfs
};

static void
linux_dev_release(struct kobject *kobj)
{
	struct device *dev;

	dev = container_of(kobj, struct device, kobj);
	/* This is the precedence defined by linux. */
	if (dev->release)
		dev->release(dev);
	else if (dev->class && dev->class->dev_release)
		dev->class->dev_release(dev);
}

static ssize_t
linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct device, kobj),
		    dattr, buf);
	return (error);
}

static ssize_t
linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct device, kobj),
		    dattr, buf, count);
	return (error);
}

static const struct sysfs_ops linux_dev_sysfs = {
	.show = linux_dev_show,
	.store = linux_dev_store,
};

const struct kobj_type linux_dev_ktype = {
	.release = linux_dev_release,
	.sysfs_ops = &linux_dev_sysfs
};

struct device *
device_create(struct class *class, struct device *parent, dev_t devt,
    void *drvdata, const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	dev = kzalloc(sizeof(*dev), M_WAITOK);
	dev->parent = parent;
	dev->class = class;
	dev->devt = devt;
	dev->driver_data = drvdata;
	dev->release = linux_device_release;
	va_start(args, fmt);
	kobject_set_name_vargs(&dev->kobj, fmt, args);
	va_end(args);
	device_register(dev);

	return (dev);
}

struct device *
device_create_groups_vargs(struct class *class, struct device *parent,
    dev_t devt, void *drvdata, const struct attribute_group **groups,
    const char *fmt, va_list args)
{
	struct device *dev = NULL;
	int retval = -ENODEV;

	if (class == NULL || IS_ERR(class))
		goto error;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev) {
		retval = -ENOMEM;
		goto error;
	}

	dev->devt = devt;
	dev->class = class;
	dev->parent = parent;
	dev->groups = groups;
	dev->release = device_create_release;
	/* device_initialize() needs the class and parent to be set */
	device_initialize(dev);
	dev_set_drvdata(dev, drvdata);

	retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
	if (retval)
		goto error;

	retval = device_add(dev);
	if (retval)
		goto error;

	return dev;

error:
	put_device(dev);
	return ERR_PTR(retval);
}

struct class *
class_create(struct module *owner, const char *name)
{
	struct class *class;
	int error;

	class = kzalloc(sizeof(*class), M_WAITOK);
	class->owner = owner;
	class->name = name;
	class->class_release = linux_class_kfree;
	error = class_register(class);
	if (error) {
		kfree(class);
		return (NULL);
	}

	return (class);
}

static void
linux_kq_lock(void *arg)
{
	spinlock_t *s = arg;

	spin_lock(s);
}
static void
linux_kq_unlock(void *arg)
{
	spinlock_t *s = arg;

	spin_unlock(s);
}

static void
linux_kq_assert_lock(void *arg, int what)
{
#ifdef INVARIANTS
	spinlock_t *s = arg;

	if (what == LA_LOCKED)
		mtx_assert(s, MA_OWNED);
	else
		mtx_assert(s, MA_NOTOWNED);
#endif
}

static void
linux_file_kqfilter_poll(struct linux_file *, int);

struct linux_file *
linux_file_alloc(void)
{
	struct linux_file *filp;

	filp = kzalloc(sizeof(*filp), GFP_KERNEL);

	/* set initial refcount */
	filp->f_count = 1;

	/* setup fields needed by kqueue support */
	spin_lock_init(&filp->f_kqlock);
	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
	    linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);

	return (filp);
}

void
linux_file_free(struct linux_file *filp)
{
	if (filp->_file == NULL) {
		if (filp->f_op != NULL && filp->f_op->release != NULL)
			filp->f_op->release(filp->f_vnode, filp);
		if (filp->f_shmem != NULL)
			vm_object_deallocate(filp->f_shmem);
		kfree_rcu(filp, rcu);
	} else {
		/*
		 * The close method of the character device or file
		 * will free the linux_file structure:
		 */
		_fdrop(filp->_file, curthread);
	}
}
struct linux_cdev *
cdev_alloc(void)
{
	struct linux_cdev *cdev;

	cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
	kobject_init(&cdev->kobj, &linux_cdev_ktype);
	cdev->refs = 1;
	return (cdev);
}

static int
linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
    vm_page_t *mres)
{
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(vm_obj->handle);

	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
		vm_page_t page;

		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
			/*
			 * If the passed in result page is a fake
			 * page, update it with the new physical
			 * address.
			 */
			page = *mres;
			vm_page_updatefake(page, paddr, vm_obj->memattr);
		} else {
			/*
			 * Replace the passed in "mres" page with our
			 * own fake page and free up all of the
			 * original pages.
			 */
			VM_OBJECT_WUNLOCK(vm_obj);
			page = vm_page_getfake(paddr, vm_obj->memattr);
			VM_OBJECT_WLOCK(vm_obj);

			vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);
			*mres = page;
		}
		vm_page_valid(page);
		return (VM_PAGER_OK);
	}
	return (VM_PAGER_FAIL);
}

static int
linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	struct vm_area_struct *vmap;
	int err;

	/* get VM area structure */
	vmap = linux_cdev_handle_find(vm_obj->handle);
	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	VM_OBJECT_WUNLOCK(vm_obj);

	linux_set_current(curthread);

	down_write(&vmap->vm_mm->mmap_sem);
	if (unlikely(vmap->vm_ops == NULL)) {
		err = VM_FAULT_SIGBUS;
	} else {
		struct vm_fault vmf;

		/* fill out VM fault structure */
		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
		vmf.pgoff = 0;
		vmf.page = NULL;
		vmf.vma = vmap;

		vmap->vm_pfn_count = 0;
		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
		vmap->vm_obj = vm_obj;

		err = vmap->vm_ops->fault(&vmf);

		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
			kern_yield(PRI_USER);
			err = vmap->vm_ops->fault(&vmf);
		}
	}

	/* translate return code */
	switch (err) {
	case VM_FAULT_OOM:
		err = VM_PAGER_AGAIN;
		break;
	case VM_FAULT_SIGBUS:
		err = VM_PAGER_BAD;
		break;
	case VM_FAULT_NOPAGE:
		/*
		 * By contract the fault handler will return having
		 * busied all the pages itself. If pidx is already
		 * found in the object, it will simply xbusy the first
		 * page and return with vm_pfn_count set to 1.
		 */
		*first = vmap->vm_pfn_first;
		*last = *first + vmap->vm_pfn_count - 1;
		err = VM_PAGER_OK;
		break;
	default:
		err = VM_PAGER_ERROR;
		break;
	}
	up_write(&vmap->vm_mm->mmap_sem);
	VM_OBJECT_WLOCK(vm_obj);
	return (err);
}

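/*
 * Registry of active VM area structures, keyed by their vm_private_data
 * pointer.  The cdev pager callbacks receive only the VM object handle
 * and use linux_cdev_handle_find() to map it back to the vmap.
 */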
static struct rwlock linux_vma_lock;
static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
    TAILQ_HEAD_INITIALIZER(linux_vma_head);

static void
linux_cdev_handle_free(struct vm_area_struct *vmap)
{
	/* Drop reference on vm_file */
	if (vmap->vm_file != NULL)
		fput(vmap->vm_file);

	/* Drop reference on mm_struct */
	mmput(vmap->vm_mm);

	kfree(vmap);
}

static void
linux_cdev_handle_remove(struct vm_area_struct *vmap)
{
	rw_wlock(&linux_vma_lock);
	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
	rw_wunlock(&linux_vma_lock);
}

static struct vm_area_struct *
linux_cdev_handle_find(void *handle)
{
	struct vm_area_struct *vmap;

	rw_rlock(&linux_vma_lock);
	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
		if (vmap->vm_private_data == handle)
			break;
	}
	rw_runlock(&linux_vma_lock);
	return (vmap);
}

static int
linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color)
{

	MPASS(linux_cdev_handle_find(handle) != NULL);
	*color = 0;
	return (0);
}

static void
linux_cdev_pager_dtor(void *handle)
{
	const struct vm_operations_struct *vm_ops;
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(handle);
	MPASS(vmap != NULL);

	/*
	 * Remove handle before calling close operation to prevent
	 * other threads from reusing the handle pointer.
	 */
	linux_cdev_handle_remove(vmap);

	down_write(&vmap->vm_mm->mmap_sem);
	vm_ops = vmap->vm_ops;
	if (likely(vm_ops != NULL))
		vm_ops->close(vmap);
	up_write(&vmap->vm_mm->mmap_sem);

	linux_cdev_handle_free(vmap);
}

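/*
 * Two flavors of cdev pager: index [0] (OBJT_MGTDEVICE) forwards page
 * faults to the driver's own vm_ops->fault via cdev_pg_populate, while
 * index [1] (OBJT_DEVICE) serves drivers without a fault handler by
 * materializing fictitious pages from the contiguous vm_pfn range.
 * linux_file_mmap_single() selects between them.
 */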
static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
	{
		/* OBJT_MGTDEVICE */
		.cdev_pg_populate = linux_cdev_pager_populate,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
	{
		/* OBJT_DEVICE */
		.cdev_pg_fault = linux_cdev_pager_fault,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
};

int
zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
    unsigned long size)
{
	vm_object_t obj;
	vm_page_t m;

	obj = vma->vm_obj;
	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
		return (-ENOTSUP);
	VM_OBJECT_RLOCK(obj);
	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
	    m = TAILQ_NEXT(m, listq))
		pmap_remove_all(m);
	VM_OBJECT_RUNLOCK(obj);
	return (0);
}

void
vma_set_file(struct vm_area_struct *vma, struct linux_file *file)
{
	struct linux_file *tmp;

	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	tmp = vma->vm_file;
	vma->vm_file = file;
	fput(tmp);
}

static struct file_operations dummy_ldev_ops = {
	/* XXXKIB */
};

static struct linux_cdev dummy_ldev = {
	.ops = &dummy_ldev_ops,
};

#define	LDEV_SI_DTR	0x0001
#define	LDEV_SI_REF	0x0002
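/*
 * linux_cdev siref encoding: bit 0 (LDEV_SI_DTR) is set once destruction
 * of the cdev has started, and the remaining bits count outstanding
 * linux_get_fop() references in units of LDEV_SI_REF.  Once LDEV_SI_DTR
 * is set, new callers are diverted to dummy_ldev above.
 */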

static void
linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
    struct linux_cdev **dev)
{
	struct linux_cdev *ldev;
	u_int siref;

	ldev = filp->f_cdev;
	*fop = filp->f_op;
	if (ldev != NULL) {
		if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
			refcount_acquire(&ldev->refs);
		} else {
			for (siref = ldev->siref;;) {
				if ((siref & LDEV_SI_DTR) != 0) {
					ldev = &dummy_ldev;
					*fop = ldev->ops;
					siref = ldev->siref;
					MPASS((ldev->siref & LDEV_SI_DTR) == 0);
				} else if (atomic_fcmpset_int(&ldev->siref,
				    &siref, siref + LDEV_SI_REF)) {
					break;
				}
			}
		}
	}
	*dev = ldev;
}

static void
linux_drop_fop(struct linux_cdev *ldev)
{

	if (ldev == NULL)
		return;
	if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
		linux_cdev_deref(ldev);
	} else {
		MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
	}
}

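/*
 * OPW ("operation wrapper") temporarily points td_fpop at the file
 * being operated on, so that devfs cdevpriv lookups performed by the
 * wrapped Linux file operation resolve against the right struct file.
 * Typical use:
 *
 *	error = -OPW(fp, td, fop->open(file->f_vnode, filp));
 */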
#define	OPW(fp,td,code) ({			\
	struct file *__fpop;			\
	__typeof(code) __retval;		\
						\
	__fpop = (td)->td_fpop;			\
	(td)->td_fpop = (fp);			\
	__retval = (code);			\
	(td)->td_fpop = __fpop;			\
	__retval;				\
})

static int
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
    struct file *file)
{
	struct linux_cdev *ldev;
	struct linux_file *filp;
	const struct file_operations *fop;
	int error;

	ldev = dev->si_drv1;

	filp = linux_file_alloc();
	filp->f_dentry = &filp->f_dentry_store;
	filp->f_op = ldev->ops;
	filp->f_mode = file->f_flag;
	filp->f_flags = file->f_flag;
	filp->f_vnode = file->f_vnode;
	filp->_file = file;
	refcount_acquire(&ldev->refs);
	filp->f_cdev = ldev;

	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);

	if (fop->open != NULL) {
		error = -fop->open(file->f_vnode, filp);
		if (error != 0) {
			linux_drop_fop(ldev);
			linux_cdev_deref(filp->f_cdev);
			kfree(filp);
			return (error);
		}
	}

	/* hold on to the vnode - used for fstat() */
	vhold(filp->f_vnode);

	/* release the file from devfs */
	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
	linux_drop_fop(ldev);
	/*
	 * Returning ENXIO with the fileops already installed tells the
	 * open path that this fdopen handler took over the file.
	 */
	return (ENXIO);
}

#define	LINUX_IOCTL_MIN_PTR 0x10000UL
#define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)

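/*
 * FreeBSD's ioctl path has already copied the argument into a kernel
 * buffer, but Linux ioctl handlers expect a user pointer that they pass
 * to copy_from_user()/copy_to_user().  linux_file_ioctl_sub() therefore
 * hands the handler a fake pointer inside the unmapped window
 * [LINUX_IOCTL_MIN_PTR, LINUX_IOCTL_MAX_PTR), and linux_remap_address()
 * below translates such pointers back into offsets within the kernel
 * buffer recorded in task->bsd_ioctl_data.  For example, a handler
 * accessing LINUX_IOCTL_MIN_PTR + 8 ends up 8 bytes into the kernel
 * copy of its argument.
 */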
static inline int
linux_remap_address(void **uaddr, size_t len)
{
	uintptr_t uaddr_val = (uintptr_t)(*uaddr);

	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
		struct task_struct *pts = current;
		if (pts == NULL) {
			*uaddr = NULL;
			return (1);
		}

		/* compute data offset */
		uaddr_val -= LINUX_IOCTL_MIN_PTR;

		/* check that length is within bounds */
		if ((len > IOCPARM_MAX) ||
		    (uaddr_val + len) > pts->bsd_ioctl_len) {
			*uaddr = NULL;
			return (1);
		}

		/* re-add kernel buffer address */
		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;

		/* update address location */
		*uaddr = (void *)uaddr_val;
		return (1);
	}
	return (0);
}

int
linux_copyin(const void *uaddr, void *kaddr, size_t len)
{
	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(kaddr, uaddr, len);
		return (0);
	}
	return (-copyin(uaddr, kaddr, len));
}

int
linux_copyout(const void *kaddr, void *uaddr, size_t len)
{
	if (linux_remap_address(&uaddr, len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(uaddr, kaddr, len);
		return (0);
	}
	return (-copyout(kaddr, uaddr, len));
}

size_t
linux_clear_user(void *_uaddr, size_t _len)
{
	uint8_t *uaddr = _uaddr;
	size_t len = _len;

	/* make sure uaddr is aligned before going into the fast loop */
	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}

	/* zero 8 bytes at a time */
	while (len > 7) {
#ifdef __LP64__
		if (suword64(uaddr, 0))
			return (_len);
#else
		if (suword32(uaddr, 0))
			return (_len);
		if (suword32(uaddr + 4, 0))
			return (_len);
#endif
		uaddr += 8;
		len -= 8;
	}

	/* zero fill end, if any */
	while (len > 0) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}
	return (0);
}

int
linux_access_ok(const void *uaddr, size_t len)
{
	uintptr_t saddr;
	uintptr_t eaddr;

	/* get start and end address */
	saddr = (uintptr_t)uaddr;
	eaddr = (uintptr_t)uaddr + len;

	/* verify addresses are valid for userspace */
	return ((saddr == eaddr) ||
	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
}

/*
 * This function should return either EINTR or ERESTART depending on
 * the signal type sent to this thread:
 */
static int
linux_get_error(struct task_struct *task, int error)
{
	/* check for signal type interrupt code */
	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
		error = -linux_schedule_get_interrupt_value(task);
		if (error == 0)
			error = EINTR;
	}
	return (error);
}

static int
linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
    const struct file_operations *fop, u_long cmd, caddr_t data,
    struct thread *td)
{
	struct task_struct *task = current;
	unsigned size;
	int error;

	size = IOCPARM_LEN(cmd);
	/* refer to logic in sys_ioctl() */
	if (size > 0) {
		/*
		 * Setup hint for linux_copyin() and linux_copyout().
		 *
		 * Background: Linux code expects a user-space address
		 * while FreeBSD supplies a kernel-space address.
		 */
		task->bsd_ioctl_data = data;
		task->bsd_ioctl_len = size;
		data = (void *)LINUX_IOCTL_MIN_PTR;
	} else {
		/* fetch user-space pointer */
		data = *(void **)data;
	}
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		/* try the compat IOCTL handler first */
		if (fop->compat_ioctl != NULL) {
			error = -OPW(fp, td, fop->compat_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}

		/* fallback to the regular IOCTL handler, if any */
		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		}
	} else
#endif
	{
		if (fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}
	}
	if (size > 0) {
		task->bsd_ioctl_data = NULL;
		task->bsd_ioctl_len = 0;
	}

	if (error == EWOULDBLOCK) {
		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	} else {
		error = linux_get_error(task, error);
	}
	return (error);
}

#define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
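/*
 * linux_file_poll() does not build a real poll_table; it passes this
 * sentinel so that linux_poll_wait() can recognize that it is running
 * on behalf of select(2)/poll(2) and must selrecord() the caller.
 */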

/*
 * This function atomically updates the poll wakeup state and returns
 * the previous state at the time of update.
 */
static uint8_t
linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
{
	int c, old;

	c = v->counter;

	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
		c = old;

	return (c);
}

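/*
 * Per-file wait-queue state machine: linux_poll_wait() moves INIT to
 * NOT_READY while it attaches the file to a wait queue and then arms it
 * as QUEUED, and re-arms READY back to QUEUED; the wakeup callback below
 * fires linux_poll_wakeup() exactly once per arming (QUEUED -> READY);
 * linux_poll_wait_dequeue() resets any state back to INIT.
 */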
static int
linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY,	/* NOP */
	};
	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_QUEUED:
		linux_poll_wakeup(filp);
		return (1);
	default:
		return (0);
	}
}

void
linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED,	/* NOP */
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
	};

	/* check if we are called inside the select system call */
	if (p == LINUX_POLL_TABLE_NORMAL)
		selrecord(curthread, &filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_INIT:
		/* NOTE: file handles can only belong to one wait-queue */
		filp->f_wait_queue.wqh = wqh;
		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
		add_wait_queue(wqh, &filp->f_wait_queue.wq);
		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
		break;
	default:
		break;
	}
}

static void
linux_poll_wait_dequeue(struct linux_file *filp)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
	};

	seldrain(&filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_NOT_READY:
	case LINUX_FWQ_STATE_QUEUED:
	case LINUX_FWQ_STATE_READY:
		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
		break;
	default:
		break;
	}
}

void
linux_poll_wakeup(struct linux_file *filp)
{
	/* this function should be NULL-safe */
	if (filp == NULL)
		return;

	selwakeup(&filp->f_selinfo);

	spin_lock(&filp->f_kqlock);
	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE;

	/* make sure the "knote" gets woken up */
	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
	spin_unlock(&filp->f_kqlock);
}

static void
linux_file_kqfilter_detach(struct knote *kn)
{
	struct linux_file *filp = kn->kn_hook;

	spin_lock(&filp->f_kqlock);
	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter_read_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
}

static int
linux_file_kqfilter_write_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
}

static struct filterops linux_dev_kqfiltops_read = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_read_event,
};

static struct filterops linux_dev_kqfiltops_write = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_write_event,
};

static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
	struct thread *td;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int temp;

	if ((filp->f_kqflags & kqflags) == 0)
		return;

	td = curthread;

	linux_get_fop(filp, &fop, &ldev);
	/* get the latest polling state */
	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
	linux_drop_fop(ldev);

	spin_lock(&filp->f_kqlock);
	/* clear kqflags */
	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE);
	/* update kqflags */
	if ((temp & (POLLIN | POLLOUT)) != 0) {
		if ((temp & POLLIN) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
		if ((temp & POLLOUT) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;

		/* make sure the "knote" gets woken up */
		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
	}
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter(struct file *file, struct knote *kn)
{
	struct linux_file *filp;
	struct thread *td;
	int error;

	td = curthread;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	if (filp->f_op->poll == NULL)
		return (EINVAL);

	spin_lock(&filp->f_kqlock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
		kn->kn_fop = &linux_dev_kqfiltops_read;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	case EVFILT_WRITE:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
		kn->kn_fop = &linux_dev_kqfiltops_write;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}
	spin_unlock(&filp->f_kqlock);

	if (error == 0) {
		linux_set_current(td);

		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	}
	return (error);
}

static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
    int nprot, bool is_shared, struct thread *td)
{
	struct task_struct *task;
	struct vm_area_struct *vmap;
	struct mm_struct *mm;
	struct linux_file *filp;
	vm_memattr_t attr;
	int error;

	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;

	if (fop->mmap == NULL)
		return (EOPNOTSUPP);

	linux_set_current(td);

	/*
	 * The same VM object might be shared by multiple processes
	 * and the mm_struct is usually freed when a process exits.
	 *
	 * The atomic reference below makes sure the mm_struct is
	 * available as long as the vmap is in the linux_vma_head.
	 */
	task = current;
	mm = task->mm;
	if (atomic_inc_not_zero(&mm->mm_users) == 0)
		return (EINVAL);

	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
	vmap->vm_start = 0;
	vmap->vm_end = size;
	vmap->vm_pgoff = *offset / PAGE_SIZE;
	vmap->vm_pfn = 0;
	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
	if (is_shared)
		vmap->vm_flags |= VM_SHARED;
	vmap->vm_ops = NULL;
	vmap->vm_file = get_file(filp);
	vmap->vm_mm = mm;

	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
		error = linux_get_error(task, EINTR);
	} else {
		error = -OPW(fp, td, fop->mmap(filp, vmap));
		error = linux_get_error(task, error);
		up_write(&vmap->vm_mm->mmap_sem);
	}

	if (error != 0) {
		linux_cdev_handle_free(vmap);
		return (error);
	}

	attr = pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		vm_private_data = vmap->vm_private_data;

		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		struct sglist *sg;

		sg = sglist_alloc(1, M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}


struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_fdopen = linux_dev_fdopen,
	.d_name = "lkpidev",
};

static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	error = 0;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->read != NULL) {
		bytes = OPW(file, td, fop->read(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
	linux_drop_fop(ldev);

	return (error);
}

static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->write != NULL) {
		bytes = OPW(file, td, fop->write(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
			error = 0;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);

	linux_drop_fop(ldev);

	return (error);
}

static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	} else {
		revents = 0;
	}
	linux_drop_fop(ldev);
	return (revents);
}

static int
linux_file_close(struct file *file, struct thread *td)
{
	struct linux_file *filp;
	int (*release)(struct inode *, struct linux_file *);
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int error;

	filp = (struct linux_file *)file->f_data;

	KASSERT(file_count(filp) == 0,
	    ("File refcount(%d) is not zero", file_count(filp)));

	if (td == NULL)
		td = curthread;

	error = 0;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_poll_wait_dequeue(filp);
	linux_get_fop(filp, &fop, &ldev);
	/*
	 * Always use the real release function, if any, to avoid
	 * leaking device resources:
	 */
	release = filp->f_op->release;
	if (release != NULL)
		error = -OPW(file, td, release(filp->f_vnode, filp));
	funsetown(&filp->f_sigio);
	if (filp->f_vnode != NULL)
		vdrop(filp->f_vnode);
	linux_drop_fop(ldev);
	ldev = filp->f_cdev;
	if (ldev != NULL)
		linux_cdev_deref(ldev);
	linux_synchronize_rcu(RCU_TYPE_REGULAR);
	kfree(filp);

	return (error);
}


static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct fiodgname_arg *fgn;
	const char *p;
	int error, i;

	error = 0;
	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;
	linux_get_fop(filp, &fop, &ldev);

	linux_set_current(td);
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (fop->fasync == NULL)
			break;
		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &filp->f_sigio);
		if (error == 0) {
			if (fop->fasync == NULL)
				break;
			error = -OPW(fp, td, fop->fasync(0, filp,
			    fp->f_flag & FASYNC));
		}
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(&filp->f_sigio);
		break;
	case FIODGNAME:
#ifdef COMPAT_FREEBSD32
	case FIODGNAME_32:
#endif
		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
			error = ENXIO;
			break;
		}
		fgn = data;
		p = devtoname(filp->f_cdev->cdev);
		i = strlen(p) + 1;
		if (i > fgn->len) {
			error = EINVAL;
			break;
		}
		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
		break;
	default:
		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
		break;
	}
	linux_drop_fop(ldev);
	return (error);
}

static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct file *fp,
    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
	/*
	 * Character devices do not provide private mappings
	 * of any kind:
	 */
	if ((maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
		return (EINVAL);

	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
	    (int)prot, (flags & MAP_SHARED) ? true : false, td));
}

static int
linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_prot_t maxprot;
	int error;

	filp = (struct linux_file *)fp->f_data;

	vp = filp->f_vnode;
	if (vp == NULL)
		return (EOPNOTSUPP);

	/*
	 * Ensure that file and memory protections are
	 * compatible.
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
		maxprot = VM_PROT_NONE;
		if ((prot & VM_PROT_EXECUTE) != 0)
			return (EACCES);
	} else
		maxprot = VM_PROT_EXECUTE;
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_READ;
	else if ((prot & VM_PROT_READ) != 0)
		return (EACCES);

	/*
	 * If we are sharing potential changes via MAP_SHARED and we
	 * are trying to get write permission although we opened it
	 * without asking for it, bail out.
	 *
	 * Note that most character devices always share mappings.
	 *
	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
	 * requests rather than doing it here.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if ((fp->f_flag & FWRITE) != 0)
			maxprot |= VM_PROT_WRITE;
		else if ((prot & VM_PROT_WRITE) != 0)
			return (EACCES);
	}
	maxprot &= cap_maxprot;

	linux_get_fop(filp, &fop, &ldev);
	error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
	    &foff, fop, &object);
	if (error != 0)
		goto out;

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(object);
out:
	linux_drop_fop(ldev);
	return (error);
}


static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = (struct linux_file *)fp->f_data;
	if (filp->f_vnode == NULL)
		return (EOPNOTSUPP);

	vp = filp->f_vnode;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
	VOP_UNLOCK(vp);

	return (error);
}

static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = fp->f_data;
	vp = filp->f_vnode;
	if (vp == NULL) {
		error = 0;
		kif->kf_type = KF_TYPE_DEV;
	} else {
		vref(vp);
		FILEDESC_SUNLOCK(fdp);
		error = vn_fill_kinfo_vnode(vp, kif);
		vrele(vp);
		kif->kf_type = KF_TYPE_VNODE;
		FILEDESC_SLOCK(fdp);
	}
	return (error);
}

unsigned int
linux_iminor(struct inode *inode)
{
	struct linux_cdev *ldev;

	if (inode == NULL || inode->v_rdev == NULL ||
	    inode->v_rdev->si_devsw != &linuxcdevsw)
		return (-1U);
	ldev = inode->v_rdev->si_drv1;
	if (ldev == NULL)
		return (-1U);

	return (minor(ldev->dev));
}

static int
linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td)
{
	struct linux_file *filp1, *filp2;

	if (fp2->f_type != DTYPE_DEV)
		return (3);

	filp1 = fp1->f_data;
	filp2 = fp2->f_data;
	return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev));
}

struct fileops linuxfileops = {
	.fo_read = linux_file_read,
	.fo_write = linux_file_write,
	.fo_truncate = invfo_truncate,
	.fo_kqfilter = linux_file_kqfilter,
	.fo_stat = linux_file_stat,
	.fo_fill_kinfo = linux_file_fill_kinfo,
	.fo_poll = linux_file_poll,
	.fo_close = linux_file_close,
	.fo_ioctl = linux_file_ioctl,
	.fo_mmap = linux_file_mmap,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_cmp = linux_file_kcmp,
	.fo_flags = DFLAG_PASSABLE,
};


/*
 * Hash of vmmap addresses.  This is infrequently accessed and does not
 * need to be particularly large.  This is done because we must store the
 * caller's idea of the map size to properly unmap.
 */
struct vmmap {
	LIST_ENTRY(vmmap)	vm_next;
	void			*vm_addr;
	unsigned long		vm_size;
};

struct vmmaphd {
	struct vmmap *lh_first;
};
#define	VMMAP_HASH_SIZE	64
#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
#define	VM_HASH(addr)	(((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK)
static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
static struct mtx vmmaplock;

static void
vmmap_add(void *addr, unsigned long size)
{
	struct vmmap *vmmap;

	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
	mtx_lock(&vmmaplock);
	vmmap->vm_size = size;
	vmmap->vm_addr = addr;
	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
	mtx_unlock(&vmmaplock);
}

static struct vmmap *
vmmap_remove(void *addr)
{
	struct vmmap *vmmap;

	mtx_lock(&vmmaplock);
	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
		if (vmmap->vm_addr == addr)
			break;
	if (vmmap)
		LIST_REMOVE(vmmap, vm_next);
	mtx_unlock(&vmmaplock);

	return (vmmap);
}

#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
void *
_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
{
	void *addr;

	addr = pmap_mapdev_attr(phys_addr, size, attr);
	if (addr == NULL)
		return (NULL);
	vmmap_add(addr, size);

	return (addr);
}
#endif

void
iounmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
	pmap_unmapdev(addr, vmmap->vm_size);
#endif
	kfree(vmmap);
}


void *
vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
{
	vm_offset_t off;
	size_t size;

	size = count * PAGE_SIZE;
	off = kva_alloc(size);
	if (off == 0)
		return (NULL);
	vmmap_add((void *)off, size);
	pmap_qenter(off, pages, count);

	return ((void *)off);
}

void
vunmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
	kva_free((vm_offset_t)addr, vmmap->vm_size);
	kfree(vmmap);
}

static char *
devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)
{
	unsigned int len;
	char *p;
	va_list aq;

	va_copy(aq, ap);
	len = vsnprintf(NULL, 0, fmt, aq);
	va_end(aq);

	if (dev != NULL)
		p = devm_kmalloc(dev, len + 1, gfp);
	else
		p = kmalloc(len + 1, gfp);
	if (p != NULL)
		vsnprintf(p, len + 1, fmt, ap);

	return (p);
}

char *
kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{

	return (devm_kvasprintf(NULL, gfp, fmt, ap));
}

char *
lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = devm_kvasprintf(dev, gfp, fmt, ap);
	va_end(ap);

	return (p);
}

char *
kasprintf(gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return (p);
}


static void
linux_timer_callback_wrapper(void *context)
{
	struct timer_list *timer;

	timer = context;

	/* the timer is about to be shutdown permanently */
	if (timer->function == NULL)
		return;

	if (linux_set_current_flags(curthread, M_NOWAIT)) {
		/* try again later */
		callout_reset(&timer->callout, 1,
		    &linux_timer_callback_wrapper, timer);
		return;
	}

	timer->function(timer->data);
}

int
mod_timer(struct timer_list *timer, int expires)
{
	int ret;

	timer->expires = expires;
	ret = callout_reset(&timer->callout,
	    linux_timer_jiffies_until(expires),
	    &linux_timer_callback_wrapper, timer);

	MPASS(ret == 0 || ret == 1);

	return (ret == 1);
}

void
add_timer(struct timer_list *timer)
{

	callout_reset(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer);
}

void
add_timer_on(struct timer_list *timer, int cpu)
{

	callout_reset_on(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer, cpu);
}

1970 int
1971 del_timer(struct timer_list *timer)
1972 {
1973
1974 if (callout_stop(&(timer)->callout) == -1)
1975 return (0);
1976 return (1);
1977 }
1978
1979 int
1980 del_timer_sync(struct timer_list *timer)
1981 {
1982
1983 if (callout_drain(&(timer)->callout) == -1)
1984 return (0);
1985 return (1);
1986 }
1987
1988 int
1989 timer_delete_sync(struct timer_list *timer)
1990 {
1991
1992 return (del_timer_sync(timer));
1993 }
1994
1995 int
1996 timer_shutdown_sync(struct timer_list *timer)
1997 {
1998
1999 timer->function = NULL;
2000 return (del_timer_sync(timer));
2001 }
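
/*
 * Usage sketch for the timer bridge above ("sc" and "my_timeout" are
 * illustrative). timer_shutdown_sync() clears ->function first so that
 * a concurrently running linux_timer_callback_wrapper() becomes a no-op
 * before the callout is drained:
 *
 *	setup_timer(&sc->timer, my_timeout, (unsigned long)sc);
 *	mod_timer(&sc->timer, jiffies + msecs_to_jiffies(100));
 *	...
 *	timer_shutdown_sync(&sc->timer);
 */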
2002
2003 /* greatest common divisor, Euclid's algorithm */
2004 static uint64_t
2005 lkpi_gcd_64(uint64_t a, uint64_t b)
2006 {
2007 uint64_t an;
2008 uint64_t bn;
2009
2010 while (b != 0) {
2011 an = b;
2012 bn = a % b;
2013 a = an;
2014 b = bn;
2015 }
2016 return (a);
2017 }
2018
2019 uint64_t lkpi_nsec2hz_rem;
2020 uint64_t lkpi_nsec2hz_div = 1000000000ULL;
2021 uint64_t lkpi_nsec2hz_max;
2022
2023 uint64_t lkpi_usec2hz_rem;
2024 uint64_t lkpi_usec2hz_div = 1000000ULL;
2025 uint64_t lkpi_usec2hz_max;
2026
2027 uint64_t lkpi_msec2hz_rem;
2028 uint64_t lkpi_msec2hz_div = 1000ULL;
2029 uint64_t lkpi_msec2hz_max;
2030
2031 static void
2032 linux_timer_init(void *arg)
2033 {
2034 uint64_t gcd;
2035
2036 /*
2037 * Compute an internal HZ value which can divide 2**32 to
2038 * avoid timer rounding problems when the tick value wraps
2039 * around 2**32:
2040 */
2041 linux_timer_hz_mask = 1;
2042 while (linux_timer_hz_mask < (unsigned long)hz)
2043 linux_timer_hz_mask *= 2;
2044 linux_timer_hz_mask--;
2045
2046 /* compute some internal constants */
2047
2048 lkpi_nsec2hz_rem = hz;
2049 lkpi_usec2hz_rem = hz;
2050 lkpi_msec2hz_rem = hz;
2051
2052 gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
2053 lkpi_nsec2hz_rem /= gcd;
2054 lkpi_nsec2hz_div /= gcd;
2055 lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;
2056
2057 gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
2058 lkpi_usec2hz_rem /= gcd;
2059 lkpi_usec2hz_div /= gcd;
2060 lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;
2061
2062 gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
2063 lkpi_msec2hz_rem /= gcd;
2064 lkpi_msec2hz_div /= gcd;
2065 lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
2066 }
2067 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
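
/*
 * Worked example of the constants above, assuming hz = 1000: for
 * nanoseconds, gcd(1000, 10^9) = 1000, leaving lkpi_nsec2hz_rem = 1 and
 * lkpi_nsec2hz_div = 10^6, so a conversion of the form
 * "x * lkpi_nsec2hz_rem / lkpi_nsec2hz_div" degenerates to x / 10^6 and
 * cannot overflow for any x <= lkpi_nsec2hz_max = 2^64 - 1.
 */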
2068
2069 void
2070 linux_complete_common(struct completion *c, int all)
2071 {
2072 int wakeup_swapper;
2073
2074 sleepq_lock(c);
2075 if (all) {
2076 c->done = UINT_MAX;
2077 wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
2078 } else {
2079 if (c->done != UINT_MAX)
2080 c->done++;
2081 wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
2082 }
2083 sleepq_release(c);
2084 if (wakeup_swapper)
2085 kick_proc0();
2086 }
2087
2088 /*
2089 * Indefinite wait for done != 0 with or without signals.
2090 */
2091 int
2092 linux_wait_for_common(struct completion *c, int flags)
2093 {
2094 struct task_struct *task;
2095 int error;
2096
2097 if (SCHEDULER_STOPPED())
2098 return (0);
2099
2100 task = current;
2101
2102 if (flags != 0)
2103 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2104 else
2105 flags = SLEEPQ_SLEEP;
2106 error = 0;
2107 for (;;) {
2108 sleepq_lock(c);
2109 if (c->done)
2110 break;
2111 sleepq_add(c, NULL, "completion", flags, 0);
2112 if (flags & SLEEPQ_INTERRUPTIBLE) {
2113 DROP_GIANT();
2114 error = -sleepq_wait_sig(c, 0);
2115 PICKUP_GIANT();
2116 if (error != 0) {
2117 linux_schedule_save_interrupt_value(task, error);
2118 error = -ERESTARTSYS;
2119 goto intr;
2120 }
2121 } else {
2122 DROP_GIANT();
2123 sleepq_wait(c, 0);
2124 PICKUP_GIANT();
2125 }
2126 }
2127 if (c->done != UINT_MAX)
2128 c->done--;
2129 sleepq_release(c);
2130
2131 intr:
2132 return (error);
2133 }
2134
2135 /*
2136 * Time limited wait for done != 0 with or without signals.
2137 */
2138 int
2139 linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
2140 {
2141 struct task_struct *task;
2142 int end = jiffies + timeout;
2143 int error;
2144
2145 if (SCHEDULER_STOPPED())
2146 return (0);
2147
2148 task = current;
2149
2150 if (flags != 0)
2151 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2152 else
2153 flags = SLEEPQ_SLEEP;
2154
2155 for (;;) {
2156 sleepq_lock(c);
2157 if (c->done)
2158 break;
2159 sleepq_add(c, NULL, "completion", flags, 0);
2160 sleepq_set_timeout(c, linux_timer_jiffies_until(end));
2161
2162 DROP_GIANT();
2163 if (flags & SLEEPQ_INTERRUPTIBLE)
2164 error = -sleepq_timedwait_sig(c, 0);
2165 else
2166 error = -sleepq_timedwait(c, 0);
2167 PICKUP_GIANT();
2168
2169 if (error != 0) {
2170 /* check for timeout */
2171 if (error == -EWOULDBLOCK) {
2172 error = 0; /* timeout */
2173 } else {
2174 /* signal happened */
2175 linux_schedule_save_interrupt_value(task, error);
2176 error = -ERESTARTSYS;
2177 }
2178 goto done;
2179 }
2180 }
2181 if (c->done != UINT_MAX)
2182 c->done--;
2183 sleepq_release(c);
2184
2185 /* return how many jiffies are left */
2186 error = linux_timer_jiffies_until(end);
2187 done:
2188 return (error);
2189 }
2190
2191 int
2192 linux_try_wait_for_completion(struct completion *c)
2193 {
2194 int isdone;
2195
2196 sleepq_lock(c);
2197 isdone = (c->done != 0);
2198 if (c->done != 0 && c->done != UINT_MAX)
2199 c->done--;
2200 sleepq_release(c);
2201 return (isdone);
2202 }
2203
2204 int
2205 linux_completion_done(struct completion *c)
2206 {
2207 int isdone;
2208
2209 sleepq_lock(c);
2210 isdone = (c->done != 0);
2211 sleepq_release(c);
2212 return (isdone);
2213 }
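
/*
 * Usage sketch: a producer/consumer hand-off built on the completion
 * primitives above ("sc" is illustrative; the worker side calls
 * complete(&sc->cmpl) when finished):
 *
 *	init_completion(&sc->cmpl);
 *	...start asynchronous work...
 *	if (wait_for_completion_interruptible(&sc->cmpl) != 0)
 *		return (-ERESTARTSYS);
 */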
2214
2215 static void
2216 linux_cdev_deref(struct linux_cdev *ldev)
2217 {
2218 if (refcount_release(&ldev->refs) &&
2219 ldev->kobj.ktype == &linux_cdev_ktype)
2220 kfree(ldev);
2221 }
2222
2223 static void
2224 linux_cdev_release(struct kobject *kobj)
2225 {
2226 struct linux_cdev *cdev;
2227 struct kobject *parent;
2228
2229 cdev = container_of(kobj, struct linux_cdev, kobj);
2230 parent = kobj->parent;
2231 linux_destroy_dev(cdev);
2232 linux_cdev_deref(cdev);
2233 kobject_put(parent);
2234 }
2235
2236 static void
2237 linux_cdev_static_release(struct kobject *kobj)
2238 {
2239 struct cdev *cdev;
2240 struct linux_cdev *ldev;
2241
2242 ldev = container_of(kobj, struct linux_cdev, kobj);
2243 cdev = ldev->cdev;
2244 if (cdev != NULL) {
2245 destroy_dev(cdev);
2246 ldev->cdev = NULL;
2247 }
2248 kobject_put(kobj->parent);
2249 }
2250
2251 int
2252 linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
2253 {
2254 int ret;
2255
2256 if (dev->devt != 0) {
2257 /* Set parent kernel object. */
2258 ldev->kobj.parent = &dev->kobj;
2259
2260 /*
2261 * Unlike Linux, we require the kobject of the
2262 * character device structure to have a valid name
2263 * before calling this function:
2264 */
2265 if (ldev->kobj.name == NULL)
2266 return (-EINVAL);
2267
2268 ret = cdev_add(ldev, dev->devt, 1);
2269 if (ret)
2270 return (ret);
2271 }
2272 ret = device_add(dev);
2273 if (ret != 0 && dev->devt != 0)
2274 cdev_del(ldev);
2275 return (ret);
2276 }
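
/*
 * Usage sketch: per the comment above, the kobject must be named before
 * linux_cdev_device_add() is called ("sc", "mydev" and "unit" are
 * illustrative):
 *
 *	kobject_set_name(&sc->ldev.kobj, "mydev%d", unit);
 *	error = linux_cdev_device_add(&sc->ldev, &sc->dev);
 */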
2277
2278 void
2279 linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
2280 {
2281 device_del(dev);
2282
2283 if (dev->devt != 0)
2284 cdev_del(ldev);
2285 }
2286
2287 static void
2288 linux_destroy_dev(struct linux_cdev *ldev)
2289 {
2290
2291 if (ldev->cdev == NULL)
2292 return;
2293
2294 MPASS((ldev->siref & LDEV_SI_DTR) == 0);
2295 MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
2296
2297 atomic_set_int(&ldev->siref, LDEV_SI_DTR);
2298 while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
2299 pause("ldevdtr", hz / 4);
2300
2301 destroy_dev(ldev->cdev);
2302 ldev->cdev = NULL;
2303 }
2304
2305 const struct kobj_type linux_cdev_ktype = {
2306 .release = linux_cdev_release,
2307 };
2308
2309 const struct kobj_type linux_cdev_static_ktype = {
2310 .release = linux_cdev_static_release,
2311 };
2312
2313 static void
2314 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
2315 {
2316 struct notifier_block *nb;
2317 struct netdev_notifier_info ni;
2318
2319 nb = arg;
2320 ni.ifp = ifp;
2321 ni.dev = (struct net_device *)ifp;
2322 if (linkstate == LINK_STATE_UP)
2323 nb->notifier_call(nb, NETDEV_UP, &ni);
2324 else
2325 nb->notifier_call(nb, NETDEV_DOWN, &ni);
2326 }
2327
2328 static void
2329 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
2330 {
2331 struct notifier_block *nb;
2332 struct netdev_notifier_info ni;
2333
2334 nb = arg;
2335 ni.ifp = ifp;
2336 ni.dev = (struct net_device *)ifp;
2337 nb->notifier_call(nb, NETDEV_REGISTER, &ni);
2338 }
2339
2340 static void
2341 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
2342 {
2343 struct notifier_block *nb;
2344 struct netdev_notifier_info ni;
2345
2346 nb = arg;
2347 ni.ifp = ifp;
2348 ni.dev = (struct net_device *)ifp;
2349 nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);
2350 }
2351
2352 static void
2353 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
2354 {
2355 struct notifier_block *nb;
2356 struct netdev_notifier_info ni;
2357
2358 nb = arg;
2359 ni.ifp = ifp;
2360 ni.dev = (struct net_device *)ifp;
2361 nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);
2362 }
2363
2364 static void
2365 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
2366 {
2367 struct notifier_block *nb;
2368 struct netdev_notifier_info ni;
2369
2370 nb = arg;
2371 ni.ifp = ifp;
2372 ni.dev = (struct net_device *)ifp;
2373 nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);
2374 }
2375
2376 int
2377 register_netdevice_notifier(struct notifier_block *nb)
2378 {
2379
2380 nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
2381 ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
2382 nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
2383 ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
2384 nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
2385 ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
2386 nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
2387 iflladdr_event, linux_handle_iflladdr_event, nb, 0);
2388
2389 return (0);
2390 }
2391
2392 int
2393 register_inetaddr_notifier(struct notifier_block *nb)
2394 {
2395
2396 nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
2397 ifaddr_event, linux_handle_ifaddr_event, nb, 0);
2398 return (0);
2399 }
2400
2401 int
2402 unregister_netdevice_notifier(struct notifier_block *nb)
2403 {
2404
2405 EVENTHANDLER_DEREGISTER(ifnet_link_event,
2406 nb->tags[NETDEV_UP]);
2407 EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
2408 nb->tags[NETDEV_REGISTER]);
2409 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2410 nb->tags[NETDEV_UNREGISTER]);
2411 EVENTHANDLER_DEREGISTER(iflladdr_event,
2412 nb->tags[NETDEV_CHANGEADDR]);
2413
2414 return (0);
2415 }
2416
2417 int
2418 unregister_inetaddr_notifier(struct notifier_block *nb)
2419 {
2420
2421 EVENTHANDLER_DEREGISTER(ifaddr_event,
2422 nb->tags[NETDEV_CHANGEIFADDR]);
2423
2424 return (0);
2425 }
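
/*
 * Usage sketch: a driver observing link state through the notifier
 * bridge above ("mydrv_netdev_event" and "sc" are illustrative):
 *
 *	static int
 *	mydrv_netdev_event(struct notifier_block *nb, unsigned long event,
 *	    void *data)
 *	{
 *		if (event == NETDEV_UP)
 *			...;
 *		return (NOTIFY_DONE);
 *	}
 *
 *	sc->nb.notifier_call = mydrv_netdev_event;
 *	register_netdevice_notifier(&sc->nb);
 *	...
 *	unregister_netdevice_notifier(&sc->nb);
 */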
2426
2427 struct list_sort_thunk {
2428 int (*cmp)(void *, struct list_head *, struct list_head *);
2429 void *priv;
2430 };
2431
2432 static inline int
2433 linux_le_cmp(const void *d1, const void *d2, void *priv)
2434 {
2435 struct list_head *le1, *le2;
2436 struct list_sort_thunk *thunk;
2437
2438 thunk = priv;
2439 le1 = *(__DECONST(struct list_head **, d1));
2440 le2 = *(__DECONST(struct list_head **, d2));
2441 return ((thunk->cmp)(thunk->priv, le1, le2));
2442 }
2443
2444 void
2445 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
2446 struct list_head *a, struct list_head *b))
2447 {
2448 struct list_sort_thunk thunk;
2449 struct list_head **ar, *le;
2450 size_t count, i;
2451
2452 count = 0;
2453 list_for_each(le, head)
2454 count++;
2455 ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
2456 i = 0;
2457 list_for_each(le, head)
2458 ar[i++] = le;
2459 thunk.cmp = cmp;
2460 thunk.priv = priv;
2461 qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk);
2462 INIT_LIST_HEAD(head);
2463 for (i = 0; i < count; i++)
2464 list_add_tail(ar[i], head);
2465 free(ar, M_KMALLOC);
2466 }
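
/*
 * Usage sketch: sorting a list of illustrative "struct entry" nodes by
 * key; the comparator maps the list heads back to their containing
 * structures with container_of():
 *
 *	static int
 *	entry_cmp(void *priv, struct list_head *a, struct list_head *b)
 *	{
 *		struct entry *ea = container_of(a, struct entry, link);
 *		struct entry *eb = container_of(b, struct entry, link);
 *
 *		return (ea->key - eb->key);
 *	}
 *
 *	list_sort(NULL, &sc->entries, entry_cmp);
 */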
2467
2468 #if defined(__i386__) || defined(__amd64__)
2469 int
2470 linux_wbinvd_on_all_cpus(void)
2471 {
2472
2473 pmap_invalidate_cache();
2474 return (0);
2475 }
2476 #endif
2477
2478 int
2479 linux_on_each_cpu(void callback(void *), void *data)
2480 {
2481
2482 smp_rendezvous(smp_no_rendezvous_barrier, callback,
2483 smp_no_rendezvous_barrier, data);
2484 return (0);
2485 }
2486
2487 int
2488 linux_in_atomic(void)
2489 {
2490
2491 return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
2492 }
2493
2494 struct linux_cdev *
2495 linux_find_cdev(const char *name, unsigned major, unsigned minor)
2496 {
2497 dev_t dev = MKDEV(major, minor);
2498 struct cdev *cdev;
2499
2500 dev_lock();
2501 LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
2502 struct linux_cdev *ldev = cdev->si_drv1;
2503 if (ldev->dev == dev &&
2504 strcmp(kobject_name(&ldev->kobj), name) == 0) {
2505 break;
2506 }
2507 }
2508 dev_unlock();
2509
2510 return (cdev != NULL ? cdev->si_drv1 : NULL);
2511 }
2512
2513 int
2514 __register_chrdev(unsigned int major, unsigned int baseminor,
2515 unsigned int count, const char *name,
2516 const struct file_operations *fops)
2517 {
2518 struct linux_cdev *cdev;
2519 int ret = 0;
2520 int i;
2521
2522 for (i = baseminor; i < baseminor + count; i++) {
2523 cdev = cdev_alloc();
2524 cdev->ops = fops;
2525 kobject_set_name(&cdev->kobj, name);
2526
2527 ret = cdev_add(cdev, makedev(major, i), 1);
2528 if (ret != 0)
2529 break;
2530 }
2531 return (ret);
2532 }
2533
2534 int
2535 __register_chrdev_p(unsigned int major, unsigned int baseminor,
2536 unsigned int count, const char *name,
2537 const struct file_operations *fops, uid_t uid,
2538 gid_t gid, int mode)
2539 {
2540 struct linux_cdev *cdev;
2541 int ret = 0;
2542 int i;
2543
2544 for (i = baseminor; i < baseminor + count; i++) {
2545 cdev = cdev_alloc();
2546 cdev->ops = fops;
2547 kobject_set_name(&cdev->kobj, name);
2548
2549 ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
2550 if (ret != 0)
2551 break;
2552 }
2553 return (ret);
2554 }
2555
2556 void
2557 __unregister_chrdev(unsigned int major, unsigned int baseminor,
2558 unsigned int count, const char *name)
2559 {
2560 struct linux_cdev *cdevp;
2561 int i;
2562
2563 for (i = baseminor; i < baseminor + count; i++) {
2564 cdevp = linux_find_cdev(name, major, i);
2565 if (cdevp != NULL)
2566 cdev_del(cdevp);
2567 }
2568 }
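
/*
 * Usage sketch: registering and tearing down a small range of character
 * devices ("MYDRV_MAJOR" and "mydrv_fops" are illustrative):
 *
 *	error = __register_chrdev(MYDRV_MAJOR, 0, 4, "mydrv", &mydrv_fops);
 *	...
 *	__unregister_chrdev(MYDRV_MAJOR, 0, 4, "mydrv");
 */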
2569
2570 void
2571 linux_dump_stack(void)
2572 {
2573 #ifdef STACK
2574 struct stack st;
2575
2576 stack_save(&st);
2577 stack_print(&st);
2578 #endif
2579 }
2580
2581 int
2582 linuxkpi_net_ratelimit(void)
2583 {
2584
2585 return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps,
2586 lkpi_net_maxpps));
2587 }
2588
2589 struct io_mapping *
2590 io_mapping_create_wc(resource_size_t base, unsigned long size)
2591 {
2592 struct io_mapping *mapping;
2593
2594 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2595 if (mapping == NULL)
2596 return (NULL);
2597 return (io_mapping_init_wc(mapping, base, size));
2598 }
2599
2600 /* We likely want a linuxkpi_device.c at some point. */
2601 bool
2602 device_can_wakeup(struct device *dev)
2603 {
2604
2605 if (dev == NULL)
2606 return (false);
2607 /*
2608 * XXX-BZ iwlwifi queries this as part of enabling WoWLAN.
2609 * Normally this would be based on a bool in dev->power.XXX, or on a
2610 * check such as the PCI PCIM_PCAP_*PME capability; we have no way to
2611 * enable this yet. We may get away with calling directly into bsddev
2612 * for as long as we can assume PCI only, avoiding a change to struct
 * device that would break the KBI.
2613 */
2614 pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__);
2615 return (false);
2616 }
2617
2618 #if defined(__i386__) || defined(__amd64__)
2619 bool linux_cpu_has_clflush;
2620 struct cpuinfo_x86 boot_cpu_data;
2621 struct cpuinfo_x86 *__cpu_data;
2622 #endif
2623
2624 cpumask_t *
2625 lkpi_get_static_single_cpu_mask(int cpuid)
2626 {
2627
2628 KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n",
2629 __func__, cpuid));
2630 KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n",
2631 __func__, cpuid));
2632
2633 return (static_single_cpu_mask[cpuid]);
2634 }
2635
2636 bool
2637 lkpi_xen_initial_domain(void)
2638 {
2639 #ifdef XENHVM
2640 return (xen_initial_domain());
2641 #else
2642 return (false);
2643 #endif
2644 }
2645
2646 bool
2647 lkpi_xen_pv_domain(void)
2648 {
2649 #ifdef XENHVM
2650 return (xen_pv_domain());
2651 #else
2652 return (false);
2653 #endif
2654 }
2655
2656 static void
2657 linux_compat_init(void *arg)
2658 {
2659 struct sysctl_oid *rootoid;
2660 int i;
2661
2662 #if defined(__i386__) || defined(__amd64__)
2663 static const uint32_t x86_vendors[X86_VENDOR_NUM] = {
2664 [X86_VENDOR_INTEL] = CPU_VENDOR_INTEL,
2665 [X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX,
2666 [X86_VENDOR_AMD] = CPU_VENDOR_AMD,
2667 [X86_VENDOR_UMC] = CPU_VENDOR_UMC,
2668 [X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR,
2669 [X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA,
2670 [X86_VENDOR_NSC] = CPU_VENDOR_NSC,
2671 [X86_VENDOR_HYGON] = CPU_VENDOR_HYGON,
2672 };
2673 uint8_t x86_vendor = X86_VENDOR_UNKNOWN;
2674
2675 for (i = 0; i < X86_VENDOR_NUM; i++) {
2676 if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) {
2677 x86_vendor = i;
2678 break;
2679 }
2680 }
2681 linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
2682 boot_cpu_data.x86_clflush_size = cpu_clflush_line_size;
2683 boot_cpu_data.x86_max_cores = mp_ncpus;
2684 boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id);
2685 boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id);
2686 boot_cpu_data.x86_vendor = x86_vendor;
2687
2688 __cpu_data = mallocarray(mp_maxid + 1,
2689 sizeof(*__cpu_data), M_KMALLOC, M_WAITOK | M_ZERO);
2690 CPU_FOREACH(i) {
2691 __cpu_data[i].x86_clflush_size = cpu_clflush_line_size;
2692 __cpu_data[i].x86_max_cores = mp_ncpus;
2693 __cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id);
2694 __cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id);
2695 __cpu_data[i].x86_vendor = x86_vendor;
2696 }
2697 #endif
2698 rw_init(&linux_vma_lock, "lkpi-vma-lock");
2699
2700 rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
2701 OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
2702 kobject_init(&linux_class_root, &linux_class_ktype);
2703 kobject_set_name(&linux_class_root, "class");
2704 linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
2705 OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
2706 kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
2707 kobject_set_name(&linux_root_device.kobj, "device");
2708 linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
2709 SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",
2710 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");
2711 linux_root_device.bsddev = root_bus;
2712 linux_class_misc.name = "misc";
2713 class_register(&linux_class_misc);
2714 INIT_LIST_HEAD(&pci_drivers);
2715 INIT_LIST_HEAD(&pci_devices);
2716 spin_lock_init(&pci_lock);
2717 mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
2718 for (i = 0; i < VMMAP_HASH_SIZE; i++)
2719 LIST_INIT(&vmmaphead[i]);
2720 init_waitqueue_head(&linux_bit_waitq);
2721 init_waitqueue_head(&linux_var_waitq);
2722
2723 CPU_COPY(&all_cpus, &cpu_online_mask);
2724 /*
2725 * Generate a single-CPU cpumask_t for each CPU possibly present in the
2726 * system. CPUs are indexed from 0..mp_maxid. The entry for cpuid 0
2727 * only has itself in its cpumask, cpuid 1 only itself in entry 1, and
2728 * so on. This is used by cpumask_of() (and possibly others in the
2729 * future) for, e.g., drivers to pass hints to irq_set_affinity_hint().
2730 */
2731 static_single_cpu_mask = mallocarray(mp_maxid + 1,
2732 sizeof(static_single_cpu_mask), M_KMALLOC, M_WAITOK | M_ZERO);
2733
2734 /*
2735 * When the number of CPUs reaches a threshold, we start to save memory
2736 * (the sets being static) by overlapping the masks whose single bit is
2737 * set at the same position within a bitset word. Asymptotically, this
2738 * regular scheme is in O(n²) whereas the overlapping one is in O(n)
2739 * only, n being the maximum number of CPUs, so the gain becomes huge
2740 * quite quickly. The threshold for 64-bit architectures is 128 CPUs.
2742 */
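/*
 * Worked example with illustrative numbers: with 64-bit bitset words and
 * CPU_SETSIZE = 256, __bitset_words(CPU_SETSIZE) = 4. For 256 CPUs the
 * regular scheme would need 256 * 4 * 8 = 8192 bytes, while the
 * overlapping scheme below allocates 64 * (2 * 4 - 1) * 8 = 3584 bytes.
 */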
2743 if (mp_ncpus < (2 * _BITSET_BITS)) {
2744 cpumask_t *sscm_ptr;
2745
2746 /*
2747 * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) *
2748 * (_BITSET_BITS / 8)' bytes (for comparison with the
2749 * overlapping scheme).
2750 */
2751 static_single_cpu_mask_lcs = mallocarray(mp_ncpus,
2752 sizeof(*static_single_cpu_mask_lcs),
2753 M_KMALLOC, M_WAITOK | M_ZERO);
2754
2755 sscm_ptr = static_single_cpu_mask_lcs;
2756 CPU_FOREACH(i) {
2757 static_single_cpu_mask[i] = sscm_ptr++;
2758 CPU_SET(i, static_single_cpu_mask[i]);
2759 }
2760 } else {
2761 /* Pointer to a bitset word. */
2762 __typeof(((cpuset_t *)NULL)->__bits[0]) *bwp;
2763
2764 /*
2765 * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t'
2766 * really) with a single bit set that can be reused for all
2767 * single CPU masks by making them start at different offsets.
2768 * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before
2769 * the word having its single bit set, and the same amount
2770 * after.
2771 */
2772 static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS,
2773 (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8),
2774 M_KMALLOC, M_WAITOK | M_ZERO);
2775
2776 /*
2777 * We rely below on cpuset_t and the bitset generic
2778 * implementation assigning words in the '__bits' array in the
2779 * same order of bits (i.e., little-endian ordering, not to be
2780 * confused with machine endianness, which concerns bits in
2781 * words and other integers). This is an imperfect test, but it
2782 * will detect a change to big-endian ordering.
2783 */
2784 _Static_assert(
2785 __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1,
2786 "Assumes a bitset implementation that is little-endian "
2787 "on its words");
2788
2789 /* Initialize the single bit of each static span. */
2790 bwp = (__typeof(bwp))static_single_cpu_mask_lcs +
2791 (__bitset_words(CPU_SETSIZE) - 1);
2792 for (i = 0; i < _BITSET_BITS; i++) {
2793 CPU_SET(i, (cpuset_t *)bwp);
2794 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1);
2795 }
2796
2797 /*
2798 * Finally set all CPU masks to the proper word in their
2799 * relevant span.
2800 */
2801 CPU_FOREACH(i) {
2802 bwp = (__typeof(bwp))static_single_cpu_mask_lcs;
2803 /* Find the non-zero word of the relevant span. */
2804 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) *
2805 (i % _BITSET_BITS) +
2806 __bitset_words(CPU_SETSIZE) - 1;
2807 /* Shift to find the CPU mask start. */
2808 bwp -= (i / _BITSET_BITS);
2809 static_single_cpu_mask[i] = (cpuset_t *)bwp;
2810 }
2811 }
2812
2813 strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release));
2814 }
2815 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
2816
2817 static void
2818 linux_compat_uninit(void *arg)
2819 {
2820 linux_kobject_kfree_name(&linux_class_root);
2821 linux_kobject_kfree_name(&linux_root_device.kobj);
2822 linux_kobject_kfree_name(&linux_class_misc.kobj);
2823
2824 free(static_single_cpu_mask_lcs, M_KMALLOC);
2825 free(static_single_cpu_mask, M_KMALLOC);
2826 #if defined(__i386__) || defined(__amd64__)
2827 free(__cpu_data, M_KMALLOC);
2828 #endif
2829
2830 mtx_destroy(&vmmaplock);
2831 spin_lock_destroy(&pci_lock);
2832 rw_destroy(&linux_vma_lock);
2833 }
2834 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
2835
2836 /*
2837 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
2838 * conversion and vice versa, where in FreeBSD "uintptr_t" would be
2839 * used. Assert these types have the same size, else some parts of the
2840 * LinuxKPI may not work as expected:
2841 */
2842 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
2843