1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 
36 #define DRIVER_VERSION	"0.3"
37 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
38 #define DRIVER_DESC	"VFIO - User Level meta-driver"
39 
40 static struct vfio {
41 	struct class			*class;
42 	struct list_head		iommu_drivers_list;
43 	struct mutex			iommu_drivers_lock;
44 	struct list_head		group_list;
45 	struct idr			group_idr;
46 	struct mutex			group_lock;
47 	struct cdev			group_cdev;
48 	dev_t				group_devt;
49 } vfio;
50 
51 struct vfio_iommu_driver {
52 	const struct vfio_iommu_driver_ops	*ops;
53 	struct list_head			vfio_next;
54 };
55 
56 struct vfio_container {
57 	struct kref			kref;
58 	struct list_head		group_list;
59 	struct rw_semaphore		group_lock;
60 	struct vfio_iommu_driver	*iommu_driver;
61 	void				*iommu_data;
62 	bool				noiommu;
63 };
64 
65 struct vfio_unbound_dev {
66 	struct device			*dev;
67 	struct list_head		unbound_next;
68 };
69 
70 struct vfio_group {
71 	struct kref			kref;
72 	int				minor;
73 	atomic_t			container_users;
74 	struct iommu_group		*iommu_group;
75 	struct vfio_container		*container;
76 	struct list_head		device_list;
77 	struct mutex			device_lock;
78 	struct device			*dev;
79 	struct notifier_block		nb;
80 	struct list_head		vfio_next;
81 	struct list_head		container_next;
82 	struct list_head		unbound_list;
83 	struct mutex			unbound_lock;
84 	atomic_t			opened;
85 	wait_queue_head_t		container_q;
86 	bool				noiommu;
87 	unsigned int			dev_counter;
88 	struct kvm			*kvm;
89 	struct blocking_notifier_head	notifier;
90 };
91 
92 #ifdef CONFIG_VFIO_NOIOMMU
93 static bool noiommu __read_mostly;
94 module_param_named(enable_unsafe_noiommu_mode,
95 		   noiommu, bool, S_IRUGO | S_IWUSR);
96 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
97 #endif
98 
99 /*
100  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
101  * and remove functions, any use cases other than acquiring the first
102  * reference for the purpose of calling vfio_register_group_dev() or removing
103  * that symmetric reference after vfio_unregister_group_dev() should use the raw
104  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
105  * removes the device from the dummy group and cannot be nested.
106  */
107 struct iommu_group *vfio_iommu_group_get(struct device *dev)
108 {
109 	struct iommu_group *group;
110 	int __maybe_unused ret;
111 
112 	group = iommu_group_get(dev);
113 
114 #ifdef CONFIG_VFIO_NOIOMMU
115 	/*
116 	 * With noiommu enabled, an IOMMU group will be created for a device
117 	 * that doesn't already have one and doesn't have an iommu_ops on its
118 	 * bus.  We set iommudata simply to be able to identify these groups
119 	 * as special use and for reclamation later.
120 	 */
121 	if (group || !noiommu || iommu_present(dev->bus))
122 		return group;
123 
124 	group = iommu_group_alloc();
125 	if (IS_ERR(group))
126 		return NULL;
127 
128 	iommu_group_set_name(group, "vfio-noiommu");
129 	iommu_group_set_iommudata(group, &noiommu, NULL);
130 	ret = iommu_group_add_device(group, dev);
131 	if (ret) {
132 		iommu_group_put(group);
133 		return NULL;
134 	}
135 
136 	/*
137 	 * Where to taint?  At this point we've added an IOMMU group for a
138 	 * device that is not backed by iommu_ops, therefore any iommu_
139 	 * callback using iommu_ops can legitimately Oops.  So, while we may
140 	 * be about to give a DMA capable device to a user without IOMMU
141 	 * protection, which is clearly taint-worthy, let's go ahead and do
142 	 * it here.
143 	 */
144 	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
145 	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
146 #endif
147 
148 	return group;
149 }
150 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
151 
152 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
153 {
154 #ifdef CONFIG_VFIO_NOIOMMU
155 	if (iommu_group_get_iommudata(group) == &noiommu)
156 		iommu_group_remove_device(dev);
157 #endif
158 
159 	iommu_group_put(group);
160 }
161 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
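
/*
 * Illustrative sketch (not part of this file's API surface): how a VFIO bus
 * driver's probe/remove paths are expected to pair the helpers above with the
 * group registration calls further below.  The names my_device and
 * my_vfio_ops are hypothetical; real drivers (e.g. vfio-pci) wrap additional
 * setup around this skeleton.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		struct my_device *mdev;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
 *		if (!mdev) {
 *			vfio_iommu_group_put(group, dev);
 *			return -ENOMEM;
 *		}
 *
 *		vfio_init_group_dev(&mdev->vdev, dev, &my_vfio_ops);
 *		ret = vfio_register_group_dev(&mdev->vdev);
 *		if (ret) {
 *			kfree(mdev);
 *			vfio_iommu_group_put(group, dev);
 *			return ret;
 *		}
 *		dev_set_drvdata(dev, mdev);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		struct my_device *mdev = dev_get_drvdata(dev);
 *
 *		vfio_unregister_group_dev(&mdev->vdev);
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(mdev);
 *	}
 */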
162 
163 #ifdef CONFIG_VFIO_NOIOMMU
164 static void *vfio_noiommu_open(unsigned long arg)
165 {
166 	if (arg != VFIO_NOIOMMU_IOMMU)
167 		return ERR_PTR(-EINVAL);
168 	if (!capable(CAP_SYS_RAWIO))
169 		return ERR_PTR(-EPERM);
170 
171 	return NULL;
172 }
173 
174 static void vfio_noiommu_release(void *iommu_data)
175 {
176 }
177 
178 static long vfio_noiommu_ioctl(void *iommu_data,
179 			       unsigned int cmd, unsigned long arg)
180 {
181 	if (cmd == VFIO_CHECK_EXTENSION)
182 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
183 
184 	return -ENOTTY;
185 }
186 
187 static int vfio_noiommu_attach_group(void *iommu_data,
188 				     struct iommu_group *iommu_group)
189 {
190 	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
191 }
192 
193 static void vfio_noiommu_detach_group(void *iommu_data,
194 				      struct iommu_group *iommu_group)
195 {
196 }
197 
198 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
199 	.name = "vfio-noiommu",
200 	.owner = THIS_MODULE,
201 	.open = vfio_noiommu_open,
202 	.release = vfio_noiommu_release,
203 	.ioctl = vfio_noiommu_ioctl,
204 	.attach_group = vfio_noiommu_attach_group,
205 	.detach_group = vfio_noiommu_detach_group,
206 };
207 #endif
208 
209 
210 /**
211  * IOMMU driver registration
212  */
213 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
214 {
215 	struct vfio_iommu_driver *driver, *tmp;
216 
217 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
218 	if (!driver)
219 		return -ENOMEM;
220 
221 	driver->ops = ops;
222 
223 	mutex_lock(&vfio.iommu_drivers_lock);
224 
225 	/* Check for duplicates */
226 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
227 		if (tmp->ops == ops) {
228 			mutex_unlock(&vfio.iommu_drivers_lock);
229 			kfree(driver);
230 			return -EINVAL;
231 		}
232 	}
233 
234 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
235 
236 	mutex_unlock(&vfio.iommu_drivers_lock);
237 
238 	return 0;
239 }
240 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
241 
242 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
243 {
244 	struct vfio_iommu_driver *driver;
245 
246 	mutex_lock(&vfio.iommu_drivers_lock);
247 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
248 		if (driver->ops == ops) {
249 			list_del(&driver->vfio_next);
250 			mutex_unlock(&vfio.iommu_drivers_lock);
251 			kfree(driver);
252 			return;
253 		}
254 	}
255 	mutex_unlock(&vfio.iommu_drivers_lock);
256 }
257 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
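
/*
 * Illustrative sketch: an IOMMU backend registers its ops at module init and
 * unregisters them on exit, mirroring how backends such as vfio_iommu_type1
 * plug in.  The callback implementations are elided and "my_iommu_ops" is a
 * placeholder name, not an existing driver.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu-backend",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */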
258 
259 /**
260  * Group minor allocation/free - both called with vfio.group_lock held
261  */
262 static int vfio_alloc_group_minor(struct vfio_group *group)
263 {
264 	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
265 }
266 
267 static void vfio_free_group_minor(int minor)
268 {
269 	idr_remove(&vfio.group_idr, minor);
270 }
271 
272 static int vfio_iommu_group_notifier(struct notifier_block *nb,
273 				     unsigned long action, void *data);
274 static void vfio_group_get(struct vfio_group *group);
275 
276 /**
277  * Container objects - containers are created when /dev/vfio/vfio is
278  * opened, but their lifecycle extends until the last user is done, so
279  * it's freed via kref.  Must support container/group/device being
280  * closed in any order.
281  */
282 static void vfio_container_get(struct vfio_container *container)
283 {
284 	kref_get(&container->kref);
285 }
286 
287 static void vfio_container_release(struct kref *kref)
288 {
289 	struct vfio_container *container;
290 	container = container_of(kref, struct vfio_container, kref);
291 
292 	kfree(container);
293 }
294 
295 static void vfio_container_put(struct vfio_container *container)
296 {
297 	kref_put(&container->kref, vfio_container_release);
298 }
299 
300 static void vfio_group_unlock_and_free(struct vfio_group *group)
301 {
302 	mutex_unlock(&vfio.group_lock);
303 	/*
304 	 * Unregister outside of lock.  A spurious callback is harmless now
305 	 * that the group is no longer in vfio.group_list.
306 	 */
307 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
308 	kfree(group);
309 }
310 
311 /**
312  * Group objects - create, release, get, put, search
313  */
314 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
315 {
316 	struct vfio_group *group, *tmp;
317 	struct device *dev;
318 	int ret, minor;
319 
320 	group = kzalloc(sizeof(*group), GFP_KERNEL);
321 	if (!group)
322 		return ERR_PTR(-ENOMEM);
323 
324 	kref_init(&group->kref);
325 	INIT_LIST_HEAD(&group->device_list);
326 	mutex_init(&group->device_lock);
327 	INIT_LIST_HEAD(&group->unbound_list);
328 	mutex_init(&group->unbound_lock);
329 	atomic_set(&group->container_users, 0);
330 	atomic_set(&group->opened, 0);
331 	init_waitqueue_head(&group->container_q);
332 	group->iommu_group = iommu_group;
333 #ifdef CONFIG_VFIO_NOIOMMU
334 	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
335 #endif
336 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
337 
338 	group->nb.notifier_call = vfio_iommu_group_notifier;
339 
340 	/*
341 	 * blocking notifiers acquire a rwsem around registering and hold
342 	 * it around callback.  Therefore, need to register outside of
343 	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
344 	 * do anything unless it can find the group in vfio.group_list, so
345 	 * no harm in registering early.
346 	 */
347 	ret = iommu_group_register_notifier(iommu_group, &group->nb);
348 	if (ret) {
349 		kfree(group);
350 		return ERR_PTR(ret);
351 	}
352 
353 	mutex_lock(&vfio.group_lock);
354 
355 	/* Did we race creating this group? */
356 	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
357 		if (tmp->iommu_group == iommu_group) {
358 			vfio_group_get(tmp);
359 			vfio_group_unlock_and_free(group);
360 			return tmp;
361 		}
362 	}
363 
364 	minor = vfio_alloc_group_minor(group);
365 	if (minor < 0) {
366 		vfio_group_unlock_and_free(group);
367 		return ERR_PTR(minor);
368 	}
369 
370 	dev = device_create(vfio.class, NULL,
371 			    MKDEV(MAJOR(vfio.group_devt), minor),
372 			    group, "%s%d", group->noiommu ? "noiommu-" : "",
373 			    iommu_group_id(iommu_group));
374 	if (IS_ERR(dev)) {
375 		vfio_free_group_minor(minor);
376 		vfio_group_unlock_and_free(group);
377 		return ERR_CAST(dev);
378 	}
379 
380 	group->minor = minor;
381 	group->dev = dev;
382 
383 	list_add(&group->vfio_next, &vfio.group_list);
384 
385 	mutex_unlock(&vfio.group_lock);
386 
387 	return group;
388 }
389 
390 /* called with vfio.group_lock held */
391 static void vfio_group_release(struct kref *kref)
392 {
393 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
394 	struct vfio_unbound_dev *unbound, *tmp;
395 	struct iommu_group *iommu_group = group->iommu_group;
396 
397 	WARN_ON(!list_empty(&group->device_list));
398 	WARN_ON(group->notifier.head);
399 
400 	list_for_each_entry_safe(unbound, tmp,
401 				 &group->unbound_list, unbound_next) {
402 		list_del(&unbound->unbound_next);
403 		kfree(unbound);
404 	}
405 
406 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
407 	list_del(&group->vfio_next);
408 	vfio_free_group_minor(group->minor);
409 	vfio_group_unlock_and_free(group);
410 	iommu_group_put(iommu_group);
411 }
412 
413 static void vfio_group_put(struct vfio_group *group)
414 {
415 	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
416 }
417 
418 struct vfio_group_put_work {
419 	struct work_struct work;
420 	struct vfio_group *group;
421 };
422 
423 static void vfio_group_put_bg(struct work_struct *work)
424 {
425 	struct vfio_group_put_work *do_work;
426 
427 	do_work = container_of(work, struct vfio_group_put_work, work);
428 
429 	vfio_group_put(do_work->group);
430 	kfree(do_work);
431 }
432 
433 static void vfio_group_schedule_put(struct vfio_group *group)
434 {
435 	struct vfio_group_put_work *do_work;
436 
437 	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
438 	if (WARN_ON(!do_work))
439 		return;
440 
441 	INIT_WORK(&do_work->work, vfio_group_put_bg);
442 	do_work->group = group;
443 	schedule_work(&do_work->work);
444 }
445 
446 /* Assume group_lock or group reference is held */
447 static void vfio_group_get(struct vfio_group *group)
448 {
449 	kref_get(&group->kref);
450 }
451 
452 /*
453  * Not really a try as we will sleep for mutex, but we need to make
454  * sure the group pointer is valid under lock and get a reference.
455  */
456 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
457 {
458 	struct vfio_group *target = group;
459 
460 	mutex_lock(&vfio.group_lock);
461 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
462 		if (group == target) {
463 			vfio_group_get(group);
464 			mutex_unlock(&vfio.group_lock);
465 			return group;
466 		}
467 	}
468 	mutex_unlock(&vfio.group_lock);
469 
470 	return NULL;
471 }
472 
473 static
474 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
475 {
476 	struct vfio_group *group;
477 
478 	mutex_lock(&vfio.group_lock);
479 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
480 		if (group->iommu_group == iommu_group) {
481 			vfio_group_get(group);
482 			mutex_unlock(&vfio.group_lock);
483 			return group;
484 		}
485 	}
486 	mutex_unlock(&vfio.group_lock);
487 
488 	return NULL;
489 }
490 
491 static struct vfio_group *vfio_group_get_from_minor(int minor)
492 {
493 	struct vfio_group *group;
494 
495 	mutex_lock(&vfio.group_lock);
496 	group = idr_find(&vfio.group_idr, minor);
497 	if (!group) {
498 		mutex_unlock(&vfio.group_lock);
499 		return NULL;
500 	}
501 	vfio_group_get(group);
502 	mutex_unlock(&vfio.group_lock);
503 
504 	return group;
505 }
506 
507 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
508 {
509 	struct iommu_group *iommu_group;
510 	struct vfio_group *group;
511 
512 	iommu_group = iommu_group_get(dev);
513 	if (!iommu_group)
514 		return NULL;
515 
516 	group = vfio_group_get_from_iommu(iommu_group);
517 	iommu_group_put(iommu_group);
518 
519 	return group;
520 }
521 
522 /**
523  * Device objects - create, release, get, put, search
524  */
525 /* Device reference always implies a group reference */
526 void vfio_device_put(struct vfio_device *device)
527 {
528 	if (refcount_dec_and_test(&device->refcount))
529 		complete(&device->comp);
530 }
531 EXPORT_SYMBOL_GPL(vfio_device_put);
532 
533 static bool vfio_device_try_get(struct vfio_device *device)
534 {
535 	return refcount_inc_not_zero(&device->refcount);
536 }
537 
538 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
539 						 struct device *dev)
540 {
541 	struct vfio_device *device;
542 
543 	mutex_lock(&group->device_lock);
544 	list_for_each_entry(device, &group->device_list, group_next) {
545 		if (device->dev == dev && vfio_device_try_get(device)) {
546 			mutex_unlock(&group->device_lock);
547 			return device;
548 		}
549 	}
550 	mutex_unlock(&group->device_lock);
551 	return NULL;
552 }
553 
554 /*
555  * Some drivers, like pci-stub, are only used to prevent other drivers from
556  * claiming a device and are therefore perfectly legitimate for a user owned
557  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
558  * of the device, but it does prevent the user from having direct access to
559  * the device, which is useful in some circumstances.
560  *
561  * We also assume that we can include PCI interconnect devices, ie. bridges.
562  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
563  * then all of the downstream devices will be part of the same IOMMU group as
564  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
565  * breaks anything, it only does so for user owned devices downstream.  Note
566  * that error notification via MSI can be affected for platforms that handle
567  * MSI within the same IOVA space as DMA.
568  */
569 static const char * const vfio_driver_allowed[] = { "pci-stub" };
570 
571 static bool vfio_dev_driver_allowed(struct device *dev,
572 				    struct device_driver *drv)
573 {
574 	if (dev_is_pci(dev)) {
575 		struct pci_dev *pdev = to_pci_dev(dev);
576 
577 		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
578 			return true;
579 	}
580 
581 	return match_string(vfio_driver_allowed,
582 			    ARRAY_SIZE(vfio_driver_allowed),
583 			    drv->name) >= 0;
584 }
585 
586 /*
587  * A vfio group is viable for use by userspace if all devices are in
588  * one of the following states:
589  *  - driver-less
590  *  - bound to a vfio driver
591  *  - bound to an otherwise allowed driver
592  *  - a PCI interconnect device
593  *
594  * We use two methods to determine whether a device is bound to a vfio
595  * driver.  The first is to test whether the device exists in the vfio
596  * group.  The second is to test if the device exists on the group
597  * unbound_list, indicating it's in the middle of transitioning from
598  * a vfio driver to driver-less.
599  */
600 static int vfio_dev_viable(struct device *dev, void *data)
601 {
602 	struct vfio_group *group = data;
603 	struct vfio_device *device;
604 	struct device_driver *drv = READ_ONCE(dev->driver);
605 	struct vfio_unbound_dev *unbound;
606 	int ret = -EINVAL;
607 
608 	mutex_lock(&group->unbound_lock);
609 	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
610 		if (dev == unbound->dev) {
611 			ret = 0;
612 			break;
613 		}
614 	}
615 	mutex_unlock(&group->unbound_lock);
616 
617 	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
618 		return 0;
619 
620 	device = vfio_group_get_device(group, dev);
621 	if (device) {
622 		vfio_device_put(device);
623 		return 0;
624 	}
625 
626 	return ret;
627 }
628 
629 /**
630  * Async device support
631  */
632 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
633 {
634 	struct vfio_device *device;
635 
636 	/* Do we already know about it?  We shouldn't */
637 	device = vfio_group_get_device(group, dev);
638 	if (WARN_ON_ONCE(device)) {
639 		vfio_device_put(device);
640 		return 0;
641 	}
642 
643 	/* Nothing to do for idle groups */
644 	if (!atomic_read(&group->container_users))
645 		return 0;
646 
647 	/* TODO Prevent device auto probing */
648 	dev_WARN(dev, "Device added to live group %d!\n",
649 		 iommu_group_id(group->iommu_group));
650 
651 	return 0;
652 }
653 
654 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
655 {
656 	/* We don't care what happens when the group isn't in use */
657 	if (!atomic_read(&group->container_users))
658 		return 0;
659 
660 	return vfio_dev_viable(dev, group);
661 }
662 
663 static int vfio_iommu_group_notifier(struct notifier_block *nb,
664 				     unsigned long action, void *data)
665 {
666 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
667 	struct device *dev = data;
668 	struct vfio_unbound_dev *unbound;
669 
670 	/*
671 	 * Need to go through a group_lock lookup to get a reference or we
672 	 * risk racing a group being removed.  Ignore spurious notifies.
673 	 */
674 	group = vfio_group_try_get(group);
675 	if (!group)
676 		return NOTIFY_OK;
677 
678 	switch (action) {
679 	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
680 		vfio_group_nb_add_dev(group, dev);
681 		break;
682 	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
683 		/*
684 		 * Nothing to do here.  If the device is in use, then the
685 		 * vfio sub-driver should block the remove callback until
686 		 * it is unused.  If the device is unused or attached to a
687 		 * stub driver, then it should be released and we don't
688 		 * care that it will be going away.
689 		 */
690 		break;
691 	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
692 		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
693 			iommu_group_id(group->iommu_group));
694 		break;
695 	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
696 		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
697 			iommu_group_id(group->iommu_group), dev->driver->name);
698 		BUG_ON(vfio_group_nb_verify(group, dev));
699 		break;
700 	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
701 		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
702 			__func__, iommu_group_id(group->iommu_group),
703 			dev->driver->name);
704 		break;
705 	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
706 		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
707 			iommu_group_id(group->iommu_group));
708 		/*
709 		 * XXX An unbound device in a live group is ok, but we'd
710 		 * really like to avoid the above BUG_ON by preventing other
711 		 * drivers from binding to it.  Once that occurs, we have to
712 		 * stop the system to maintain isolation.  At a minimum, we'd
713 		 * want a toggle to disable driver auto probe for this device.
714 		 */
715 
716 		mutex_lock(&group->unbound_lock);
717 		list_for_each_entry(unbound,
718 				    &group->unbound_list, unbound_next) {
719 			if (dev == unbound->dev) {
720 				list_del(&unbound->unbound_next);
721 				kfree(unbound);
722 				break;
723 			}
724 		}
725 		mutex_unlock(&group->unbound_lock);
726 		break;
727 	}
728 
729 	/*
730 	 * If we're the last reference to the group, the group will be
731 	 * released, which includes unregistering the iommu group notifier.
732 	 * We hold a read-lock on that notifier list, unregistering needs
733 	 * a write-lock... deadlock.  Release our reference asynchronously
734 	 * to avoid that situation.
735 	 */
736 	vfio_group_schedule_put(group);
737 	return NOTIFY_OK;
738 }
739 
740 /**
741  * VFIO driver API
742  */
743 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
744 			 const struct vfio_device_ops *ops)
745 {
746 	init_completion(&device->comp);
747 	device->dev = dev;
748 	device->ops = ops;
749 }
750 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
751 
752 int vfio_register_group_dev(struct vfio_device *device)
753 {
754 	struct vfio_device *existing_device;
755 	struct iommu_group *iommu_group;
756 	struct vfio_group *group;
757 
758 	iommu_group = iommu_group_get(device->dev);
759 	if (!iommu_group)
760 		return -EINVAL;
761 
762 	group = vfio_group_get_from_iommu(iommu_group);
763 	if (!group) {
764 		group = vfio_create_group(iommu_group);
765 		if (IS_ERR(group)) {
766 			iommu_group_put(iommu_group);
767 			return PTR_ERR(group);
768 		}
769 	} else {
770 		/*
771 		 * A found vfio_group already holds a reference to the
772 		 * iommu_group.  A created vfio_group keeps the reference.
773 		 */
774 		iommu_group_put(iommu_group);
775 	}
776 
777 	existing_device = vfio_group_get_device(group, device->dev);
778 	if (existing_device) {
779 		dev_WARN(device->dev, "Device already exists on group %d\n",
780 			 iommu_group_id(iommu_group));
781 		vfio_device_put(existing_device);
782 		vfio_group_put(group);
783 		return -EBUSY;
784 	}
785 
786 	/* Our reference on group is moved to the device */
787 	device->group = group;
788 
789 	/* Refcounting can't start until the driver calls register */
790 	refcount_set(&device->refcount, 1);
791 
792 	mutex_lock(&group->device_lock);
793 	list_add(&device->group_next, &group->device_list);
794 	group->dev_counter++;
795 	mutex_unlock(&group->device_lock);
796 
797 	return 0;
798 }
799 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
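
/*
 * Illustrative sketch (not part of this file): struct vfio_device is intended
 * to be embedded in the bus driver's own per-device state, with the
 * vfio_device_ops callbacks recovering that state via container_of().  All
 * names below are placeholders and only a subset of the ops is shown.
 *
 *	struct my_device {
 *		struct vfio_device	vdev;
 *		void __iomem		*regs;
 *	};
 *
 *	static int my_open(struct vfio_device *vdev)
 *	{
 *		struct my_device *mdev =
 *			container_of(vdev, struct my_device, vdev);
 *
 *		return my_hw_enable(mdev);
 *	}
 *
 *	static const struct vfio_device_ops my_vfio_ops = {
 *		.name		= "my-vfio-driver",
 *		.open		= my_open,
 *		.release	= my_release,
 *		.ioctl		= my_ioctl,
 *	};
 */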
800 
801 /**
802  * Get a reference to the vfio_device for a device.  Even if the
803  * caller thinks they own the device, they could be racing with a
804  * release call path, so we can't trust drvdata for the shortcut.
805  * Go the long way around, from the iommu_group to the vfio_group
806  * to the vfio_device.
807  */
808 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
809 {
810 	struct vfio_group *group;
811 	struct vfio_device *device;
812 
813 	group = vfio_group_get_from_dev(dev);
814 	if (!group)
815 		return NULL;
816 
817 	device = vfio_group_get_device(group, dev);
818 	vfio_group_put(group);
819 
820 	return device;
821 }
822 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
823 
824 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
825 						     char *buf)
826 {
827 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
828 
829 	mutex_lock(&group->device_lock);
830 	list_for_each_entry(it, &group->device_list, group_next) {
831 		int ret;
832 
833 		if (it->ops->match) {
834 			ret = it->ops->match(it, buf);
835 			if (ret < 0) {
836 				device = ERR_PTR(ret);
837 				break;
838 			}
839 		} else {
840 			ret = !strcmp(dev_name(it->dev), buf);
841 		}
842 
843 		if (ret && vfio_device_try_get(it)) {
844 			device = it;
845 			break;
846 		}
847 	}
848 	mutex_unlock(&group->device_lock);
849 
850 	return device;
851 }
852 
853 /*
854  * Decrement the device reference count and wait for the device to be
855  * removed.  Open file descriptors for the device... */
856 void vfio_unregister_group_dev(struct vfio_device *device)
857 {
858 	struct vfio_group *group = device->group;
859 	struct vfio_unbound_dev *unbound;
860 	unsigned int i = 0;
861 	bool interrupted = false;
862 	long rc;
863 
864 	/*
865 	 * When the device is removed from the group, the group suddenly
866 	 * becomes non-viable; the device has a driver (until the unbind
867 	 * completes), but it's not present in the group.  This is bad news
868 	 * for any external users that need to re-acquire a group reference
869 	 * in order to match and release their existing reference.  To
870 	 * solve this, we track such devices on the unbound_list to bridge
871 	 * the gap until they're fully unbound.
872 	 */
873 	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
874 	if (unbound) {
875 		unbound->dev = device->dev;
876 		mutex_lock(&group->unbound_lock);
877 		list_add(&unbound->unbound_next, &group->unbound_list);
878 		mutex_unlock(&group->unbound_lock);
879 	}
880 	WARN_ON(!unbound);
881 
882 	vfio_device_put(device);
883 	rc = try_wait_for_completion(&device->comp);
884 	while (rc <= 0) {
885 		if (device->ops->request)
886 			device->ops->request(device, i++);
887 
888 		if (interrupted) {
889 			rc = wait_for_completion_timeout(&device->comp,
890 							 HZ * 10);
891 		} else {
892 			rc = wait_for_completion_interruptible_timeout(
893 				&device->comp, HZ * 10);
894 			if (rc < 0) {
895 				interrupted = true;
896 				dev_warn(device->dev,
897 					 "Device is currently in use, task"
898 					 " \"%s\" (%d) "
899 					 "blocked until device is released",
900 					 current->comm, task_pid_nr(current));
901 			}
902 		}
903 	}
904 
905 	mutex_lock(&group->device_lock);
906 	list_del(&device->group_next);
907 	group->dev_counter--;
908 	mutex_unlock(&group->device_lock);
909 
910 	/*
911 	 * In order to support multiple devices per group, devices can be
912 	 * plucked from the group while other devices in the group are still
913 	 * in use.  The container persists with this group and those remaining
914 	 * devices still attached.  If the user creates an isolation violation
915 	 * by binding this device to another driver while the group is still in
916 	 * use, that's their fault.  However, in the case of removing the last,
917 	 * or potentially the only, device in the group there can be no other
918 	 * in-use devices in the group.  The user has done their due diligence
919 	 * and we should lay no claims to those devices.  In order to do that,
920 	 * we need to make sure the group is detached from the container.
921 	 * Without this stall, we're potentially racing with a user process
922 	 * that may attempt to immediately bind this device to another driver.
923 	 */
924 	if (list_empty(&group->device_list))
925 		wait_event(group->container_q, !group->container);
926 
927 	/* Matches the get in vfio_register_group_dev() */
928 	vfio_group_put(group);
929 }
930 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
931 
932 /**
933  * VFIO base fd, /dev/vfio/vfio
934  */
935 static long vfio_ioctl_check_extension(struct vfio_container *container,
936 				       unsigned long arg)
937 {
938 	struct vfio_iommu_driver *driver;
939 	long ret = 0;
940 
941 	down_read(&container->group_lock);
942 
943 	driver = container->iommu_driver;
944 
945 	switch (arg) {
946 		/* No base extensions yet */
947 	default:
948 		/*
949 		 * If no driver is set, poll all registered drivers for
950 		 * extensions and return the first positive result.  If
951 		 * a driver is already set, further queries will be passed
952 		 * only to that driver.
953 		 */
954 		if (!driver) {
955 			mutex_lock(&vfio.iommu_drivers_lock);
956 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
957 					    vfio_next) {
958 
959 #ifdef CONFIG_VFIO_NOIOMMU
960 				if (!list_empty(&container->group_list) &&
961 				    (container->noiommu !=
962 				     (driver->ops == &vfio_noiommu_ops)))
963 					continue;
964 #endif
965 
966 				if (!try_module_get(driver->ops->owner))
967 					continue;
968 
969 				ret = driver->ops->ioctl(NULL,
970 							 VFIO_CHECK_EXTENSION,
971 							 arg);
972 				module_put(driver->ops->owner);
973 				if (ret > 0)
974 					break;
975 			}
976 			mutex_unlock(&vfio.iommu_drivers_lock);
977 		} else
978 			ret = driver->ops->ioctl(container->iommu_data,
979 						 VFIO_CHECK_EXTENSION, arg);
980 	}
981 
982 	up_read(&container->group_lock);
983 
984 	return ret;
985 }
986 
987 /* hold write lock on container->group_lock */
988 static int __vfio_container_attach_groups(struct vfio_container *container,
989 					  struct vfio_iommu_driver *driver,
990 					  void *data)
991 {
992 	struct vfio_group *group;
993 	int ret = -ENODEV;
994 
995 	list_for_each_entry(group, &container->group_list, container_next) {
996 		ret = driver->ops->attach_group(data, group->iommu_group);
997 		if (ret)
998 			goto unwind;
999 	}
1000 
1001 	return ret;
1002 
1003 unwind:
1004 	list_for_each_entry_continue_reverse(group, &container->group_list,
1005 					     container_next) {
1006 		driver->ops->detach_group(data, group->iommu_group);
1007 	}
1008 
1009 	return ret;
1010 }
1011 
1012 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1013 				 unsigned long arg)
1014 {
1015 	struct vfio_iommu_driver *driver;
1016 	long ret = -ENODEV;
1017 
1018 	down_write(&container->group_lock);
1019 
1020 	/*
1021 	 * The container is designed to be an unprivileged interface while
1022 	 * the group can be assigned to specific users.  Therefore, only by
1023 	 * adding a group to a container does the user get the privilege of
1024 	 * enabling the iommu, which may allocate finite resources.  There
1025 	 * is no unset_iommu, but by removing all the groups from a container,
1026 	 * the container is deprivileged and returns to an unset state.
1027 	 */
1028 	if (list_empty(&container->group_list) || container->iommu_driver) {
1029 		up_write(&container->group_lock);
1030 		return -EINVAL;
1031 	}
1032 
1033 	mutex_lock(&vfio.iommu_drivers_lock);
1034 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1035 		void *data;
1036 
1037 #ifdef CONFIG_VFIO_NOIOMMU
1038 		/*
1039 		 * Only noiommu containers can use vfio-noiommu and noiommu
1040 		 * containers can only use vfio-noiommu.
1041 		 */
1042 		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1043 			continue;
1044 #endif
1045 
1046 		if (!try_module_get(driver->ops->owner))
1047 			continue;
1048 
1049 		/*
1050 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1051 		 * so test which iommu driver reported support for this
1052 		 * extension and call open on them.  We also pass them the
1053 		 * magic, allowing a single driver to support multiple
1054 		 * interfaces if they'd like.
1055 		 */
1056 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1057 			module_put(driver->ops->owner);
1058 			continue;
1059 		}
1060 
1061 		data = driver->ops->open(arg);
1062 		if (IS_ERR(data)) {
1063 			ret = PTR_ERR(data);
1064 			module_put(driver->ops->owner);
1065 			continue;
1066 		}
1067 
1068 		ret = __vfio_container_attach_groups(container, driver, data);
1069 		if (ret) {
1070 			driver->ops->release(data);
1071 			module_put(driver->ops->owner);
1072 			continue;
1073 		}
1074 
1075 		container->iommu_driver = driver;
1076 		container->iommu_data = data;
1077 		break;
1078 	}
1079 
1080 	mutex_unlock(&vfio.iommu_drivers_lock);
1081 	up_write(&container->group_lock);
1082 
1083 	return ret;
1084 }
1085 
1086 static long vfio_fops_unl_ioctl(struct file *filep,
1087 				unsigned int cmd, unsigned long arg)
1088 {
1089 	struct vfio_container *container = filep->private_data;
1090 	struct vfio_iommu_driver *driver;
1091 	void *data;
1092 	long ret = -EINVAL;
1093 
1094 	if (!container)
1095 		return ret;
1096 
1097 	switch (cmd) {
1098 	case VFIO_GET_API_VERSION:
1099 		ret = VFIO_API_VERSION;
1100 		break;
1101 	case VFIO_CHECK_EXTENSION:
1102 		ret = vfio_ioctl_check_extension(container, arg);
1103 		break;
1104 	case VFIO_SET_IOMMU:
1105 		ret = vfio_ioctl_set_iommu(container, arg);
1106 		break;
1107 	default:
1108 		driver = container->iommu_driver;
1109 		data = container->iommu_data;
1110 
1111 		if (driver) /* passthrough all unrecognized ioctls */
1112 			ret = driver->ops->ioctl(data, cmd, arg);
1113 	}
1114 
1115 	return ret;
1116 }
1117 
1118 static int vfio_fops_open(struct inode *inode, struct file *filep)
1119 {
1120 	struct vfio_container *container;
1121 
1122 	container = kzalloc(sizeof(*container), GFP_KERNEL);
1123 	if (!container)
1124 		return -ENOMEM;
1125 
1126 	INIT_LIST_HEAD(&container->group_list);
1127 	init_rwsem(&container->group_lock);
1128 	kref_init(&container->kref);
1129 
1130 	filep->private_data = container;
1131 
1132 	return 0;
1133 }
1134 
1135 static int vfio_fops_release(struct inode *inode, struct file *filep)
1136 {
1137 	struct vfio_container *container = filep->private_data;
1138 	struct vfio_iommu_driver *driver = container->iommu_driver;
1139 
1140 	if (driver && driver->ops->notify)
1141 		driver->ops->notify(container->iommu_data,
1142 				    VFIO_IOMMU_CONTAINER_CLOSE);
1143 
1144 	filep->private_data = NULL;
1145 
1146 	vfio_container_put(container);
1147 
1148 	return 0;
1149 }
1150 
1151 /*
1152  * Once an iommu driver is set, we optionally pass read/write/mmap
1153  * on to the driver, allowing management interfaces beyond ioctl.
1154  */
1155 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1156 			      size_t count, loff_t *ppos)
1157 {
1158 	struct vfio_container *container = filep->private_data;
1159 	struct vfio_iommu_driver *driver;
1160 	ssize_t ret = -EINVAL;
1161 
1162 	driver = container->iommu_driver;
1163 	if (likely(driver && driver->ops->read))
1164 		ret = driver->ops->read(container->iommu_data,
1165 					buf, count, ppos);
1166 
1167 	return ret;
1168 }
1169 
1170 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1171 			       size_t count, loff_t *ppos)
1172 {
1173 	struct vfio_container *container = filep->private_data;
1174 	struct vfio_iommu_driver *driver;
1175 	ssize_t ret = -EINVAL;
1176 
1177 	driver = container->iommu_driver;
1178 	if (likely(driver && driver->ops->write))
1179 		ret = driver->ops->write(container->iommu_data,
1180 					 buf, count, ppos);
1181 
1182 	return ret;
1183 }
1184 
1185 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1186 {
1187 	struct vfio_container *container = filep->private_data;
1188 	struct vfio_iommu_driver *driver;
1189 	int ret = -EINVAL;
1190 
1191 	driver = container->iommu_driver;
1192 	if (likely(driver && driver->ops->mmap))
1193 		ret = driver->ops->mmap(container->iommu_data, vma);
1194 
1195 	return ret;
1196 }
1197 
1198 static const struct file_operations vfio_fops = {
1199 	.owner		= THIS_MODULE,
1200 	.open		= vfio_fops_open,
1201 	.release	= vfio_fops_release,
1202 	.read		= vfio_fops_read,
1203 	.write		= vfio_fops_write,
1204 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1205 	.compat_ioctl	= compat_ptr_ioctl,
1206 	.mmap		= vfio_fops_mmap,
1207 };
1208 
1209 /**
1210  * VFIO Group fd, /dev/vfio/$GROUP
1211  */
1212 static void __vfio_group_unset_container(struct vfio_group *group)
1213 {
1214 	struct vfio_container *container = group->container;
1215 	struct vfio_iommu_driver *driver;
1216 
1217 	down_write(&container->group_lock);
1218 
1219 	driver = container->iommu_driver;
1220 	if (driver)
1221 		driver->ops->detach_group(container->iommu_data,
1222 					  group->iommu_group);
1223 
1224 	group->container = NULL;
1225 	wake_up(&group->container_q);
1226 	list_del(&group->container_next);
1227 
1228 	/* Detaching the last group deprivileges a container, remove iommu */
1229 	if (driver && list_empty(&container->group_list)) {
1230 		driver->ops->release(container->iommu_data);
1231 		module_put(driver->ops->owner);
1232 		container->iommu_driver = NULL;
1233 		container->iommu_data = NULL;
1234 	}
1235 
1236 	up_write(&container->group_lock);
1237 
1238 	vfio_container_put(container);
1239 }
1240 
1241 /*
1242  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1243  * if there was no container to unset.  Since the ioctl is called on
1244  * the group, we know that still exists, therefore the only valid
1245  * transition here is 1->0.
1246  */
1247 static int vfio_group_unset_container(struct vfio_group *group)
1248 {
1249 	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1250 
1251 	if (!users)
1252 		return -EINVAL;
1253 	if (users != 1)
1254 		return -EBUSY;
1255 
1256 	__vfio_group_unset_container(group);
1257 
1258 	return 0;
1259 }
1260 
1261 /*
1262  * When removing container users, anything that removes the last user
1263  * implicitly removes the group from the container.  That is, if the
1264  * group file descriptor is closed, as well as any device file descriptors,
1265  * the group is free.
1266  */
1267 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1268 {
1269 	if (0 == atomic_dec_if_positive(&group->container_users))
1270 		__vfio_group_unset_container(group);
1271 }
1272 
1273 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1274 {
1275 	struct fd f;
1276 	struct vfio_container *container;
1277 	struct vfio_iommu_driver *driver;
1278 	int ret = 0;
1279 
1280 	if (atomic_read(&group->container_users))
1281 		return -EINVAL;
1282 
1283 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1284 		return -EPERM;
1285 
1286 	f = fdget(container_fd);
1287 	if (!f.file)
1288 		return -EBADF;
1289 
1290 	/* Sanity check, is this really our fd? */
1291 	if (f.file->f_op != &vfio_fops) {
1292 		fdput(f);
1293 		return -EINVAL;
1294 	}
1295 
1296 	container = f.file->private_data;
1297 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1298 
1299 	down_write(&container->group_lock);
1300 
1301 	/* Real groups and fake groups cannot mix */
1302 	if (!list_empty(&container->group_list) &&
1303 	    container->noiommu != group->noiommu) {
1304 		ret = -EPERM;
1305 		goto unlock_out;
1306 	}
1307 
1308 	driver = container->iommu_driver;
1309 	if (driver) {
1310 		ret = driver->ops->attach_group(container->iommu_data,
1311 						group->iommu_group);
1312 		if (ret)
1313 			goto unlock_out;
1314 	}
1315 
1316 	group->container = container;
1317 	container->noiommu = group->noiommu;
1318 	list_add(&group->container_next, &container->group_list);
1319 
1320 	/* Get a reference on the container and mark a user within the group */
1321 	vfio_container_get(container);
1322 	atomic_inc(&group->container_users);
1323 
1324 unlock_out:
1325 	up_write(&container->group_lock);
1326 	fdput(f);
1327 	return ret;
1328 }
1329 
1330 static bool vfio_group_viable(struct vfio_group *group)
1331 {
1332 	return (iommu_group_for_each_dev(group->iommu_group,
1333 					 group, vfio_dev_viable) == 0);
1334 }
1335 
1336 static int vfio_group_add_container_user(struct vfio_group *group)
1337 {
1338 	if (!atomic_inc_not_zero(&group->container_users))
1339 		return -EINVAL;
1340 
1341 	if (group->noiommu) {
1342 		atomic_dec(&group->container_users);
1343 		return -EPERM;
1344 	}
1345 	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1346 		atomic_dec(&group->container_users);
1347 		return -EINVAL;
1348 	}
1349 
1350 	return 0;
1351 }
1352 
1353 static const struct file_operations vfio_device_fops;
1354 
1355 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1356 {
1357 	struct vfio_device *device;
1358 	struct file *filep;
1359 	int ret;
1360 
1361 	if (0 == atomic_read(&group->container_users) ||
1362 	    !group->container->iommu_driver || !vfio_group_viable(group))
1363 		return -EINVAL;
1364 
1365 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1366 		return -EPERM;
1367 
1368 	device = vfio_device_get_from_name(group, buf);
1369 	if (IS_ERR(device))
1370 		return PTR_ERR(device);
1371 
1372 	ret = device->ops->open(device);
1373 	if (ret) {
1374 		vfio_device_put(device);
1375 		return ret;
1376 	}
1377 
1378 	/*
1379 	 * We can't use anon_inode_getfd() because we need to modify
1380 	 * the f_mode flags directly to allow more than just ioctls
1381 	 */
1382 	ret = get_unused_fd_flags(O_CLOEXEC);
1383 	if (ret < 0) {
1384 		device->ops->release(device);
1385 		vfio_device_put(device);
1386 		return ret;
1387 	}
1388 
1389 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1390 				   device, O_RDWR);
1391 	if (IS_ERR(filep)) {
1392 		put_unused_fd(ret);
1393 		ret = PTR_ERR(filep);
1394 		device->ops->release(device);
1395 		vfio_device_put(device);
1396 		return ret;
1397 	}
1398 
1399 	/*
1400 	 * TODO: add an anon_inode interface to do this.
1401 	 * Appears to be missing by lack of need rather than
1402 	 * explicitly prevented.  Now there's need.
1403 	 */
1404 	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1405 
1406 	atomic_inc(&group->container_users);
1407 
1408 	fd_install(ret, filep);
1409 
1410 	if (group->noiommu)
1411 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1412 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1413 
1414 	return ret;
1415 }
1416 
1417 static long vfio_group_fops_unl_ioctl(struct file *filep,
1418 				      unsigned int cmd, unsigned long arg)
1419 {
1420 	struct vfio_group *group = filep->private_data;
1421 	long ret = -ENOTTY;
1422 
1423 	switch (cmd) {
1424 	case VFIO_GROUP_GET_STATUS:
1425 	{
1426 		struct vfio_group_status status;
1427 		unsigned long minsz;
1428 
1429 		minsz = offsetofend(struct vfio_group_status, flags);
1430 
1431 		if (copy_from_user(&status, (void __user *)arg, minsz))
1432 			return -EFAULT;
1433 
1434 		if (status.argsz < minsz)
1435 			return -EINVAL;
1436 
1437 		status.flags = 0;
1438 
1439 		if (vfio_group_viable(group))
1440 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1441 
1442 		if (group->container)
1443 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1444 
1445 		if (copy_to_user((void __user *)arg, &status, minsz))
1446 			return -EFAULT;
1447 
1448 		ret = 0;
1449 		break;
1450 	}
1451 	case VFIO_GROUP_SET_CONTAINER:
1452 	{
1453 		int fd;
1454 
1455 		if (get_user(fd, (int __user *)arg))
1456 			return -EFAULT;
1457 
1458 		if (fd < 0)
1459 			return -EINVAL;
1460 
1461 		ret = vfio_group_set_container(group, fd);
1462 		break;
1463 	}
1464 	case VFIO_GROUP_UNSET_CONTAINER:
1465 		ret = vfio_group_unset_container(group);
1466 		break;
1467 	case VFIO_GROUP_GET_DEVICE_FD:
1468 	{
1469 		char *buf;
1470 
1471 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1472 		if (IS_ERR(buf))
1473 			return PTR_ERR(buf);
1474 
1475 		ret = vfio_group_get_device_fd(group, buf);
1476 		kfree(buf);
1477 		break;
1478 	}
1479 	}
1480 
1481 	return ret;
1482 }
1483 
1484 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1485 {
1486 	struct vfio_group *group;
1487 	int opened;
1488 
1489 	group = vfio_group_get_from_minor(iminor(inode));
1490 	if (!group)
1491 		return -ENODEV;
1492 
1493 	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1494 		vfio_group_put(group);
1495 		return -EPERM;
1496 	}
1497 
1498 	/* Do we need multiple instances of the group open?  Seems not. */
1499 	opened = atomic_cmpxchg(&group->opened, 0, 1);
1500 	if (opened) {
1501 		vfio_group_put(group);
1502 		return -EBUSY;
1503 	}
1504 
1505 	/* Is something still in use from a previous open? */
1506 	if (group->container) {
1507 		atomic_dec(&group->opened);
1508 		vfio_group_put(group);
1509 		return -EBUSY;
1510 	}
1511 
1512 	/* Warn if previous user didn't cleanup and re-init to drop them */
1513 	if (WARN_ON(group->notifier.head))
1514 		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1515 
1516 	filep->private_data = group;
1517 
1518 	return 0;
1519 }
1520 
1521 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1522 {
1523 	struct vfio_group *group = filep->private_data;
1524 
1525 	filep->private_data = NULL;
1526 
1527 	vfio_group_try_dissolve_container(group);
1528 
1529 	atomic_dec(&group->opened);
1530 
1531 	vfio_group_put(group);
1532 
1533 	return 0;
1534 }
1535 
1536 static const struct file_operations vfio_group_fops = {
1537 	.owner		= THIS_MODULE,
1538 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1539 	.compat_ioctl	= compat_ptr_ioctl,
1540 	.open		= vfio_group_fops_open,
1541 	.release	= vfio_group_fops_release,
1542 };
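
/*
 * Illustrative userspace sketch of the container/group flow served by the
 * ioctls above, following Documentation/driver-api/vfio.rst.  The group
 * number and device name are example values and error handling is omitted.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	int device;
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		errx(1, "group not viable: a device is bound elsewhere");
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */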
1543 
1544 /**
1545  * VFIO Device fd
1546  */
1547 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1548 {
1549 	struct vfio_device *device = filep->private_data;
1550 
1551 	device->ops->release(device);
1552 
1553 	vfio_group_try_dissolve_container(device->group);
1554 
1555 	vfio_device_put(device);
1556 
1557 	return 0;
1558 }
1559 
1560 static long vfio_device_fops_unl_ioctl(struct file *filep,
1561 				       unsigned int cmd, unsigned long arg)
1562 {
1563 	struct vfio_device *device = filep->private_data;
1564 
1565 	if (unlikely(!device->ops->ioctl))
1566 		return -EINVAL;
1567 
1568 	return device->ops->ioctl(device, cmd, arg);
1569 }
1570 
1571 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1572 				     size_t count, loff_t *ppos)
1573 {
1574 	struct vfio_device *device = filep->private_data;
1575 
1576 	if (unlikely(!device->ops->read))
1577 		return -EINVAL;
1578 
1579 	return device->ops->read(device, buf, count, ppos);
1580 }
1581 
1582 static ssize_t vfio_device_fops_write(struct file *filep,
1583 				      const char __user *buf,
1584 				      size_t count, loff_t *ppos)
1585 {
1586 	struct vfio_device *device = filep->private_data;
1587 
1588 	if (unlikely(!device->ops->write))
1589 		return -EINVAL;
1590 
1591 	return device->ops->write(device, buf, count, ppos);
1592 }
1593 
1594 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1595 {
1596 	struct vfio_device *device = filep->private_data;
1597 
1598 	if (unlikely(!device->ops->mmap))
1599 		return -EINVAL;
1600 
1601 	return device->ops->mmap(device, vma);
1602 }
1603 
1604 static const struct file_operations vfio_device_fops = {
1605 	.owner		= THIS_MODULE,
1606 	.release	= vfio_device_fops_release,
1607 	.read		= vfio_device_fops_read,
1608 	.write		= vfio_device_fops_write,
1609 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1610 	.compat_ioctl	= compat_ptr_ioctl,
1611 	.mmap		= vfio_device_fops_mmap,
1612 };
1613 
1614 /**
1615  * External user API, exported by symbols to be linked dynamically.
1616  *
1617  * The protocol includes:
1618  *  1. do normal VFIO init operation:
1619  *	- opening a new container;
1620  *	- attaching group(s) to it;
1621  *	- setting an IOMMU driver for a container.
1622  * When IOMMU is set for a container, all groups in it are
1623  * considered ready to use by an external user.
1624  *
1625  * 2. User space passes a group fd to an external user.
1626  * The external user calls vfio_group_get_external_user()
1627  * to verify that:
1628  *	- the group is initialized;
1629  *	- IOMMU is set for it.
1630  * If both checks passed, vfio_group_get_external_user()
1631  * increments the container user counter to prevent
1632  * the VFIO group from disposal before KVM exits.
1633  *
1634  * 3. The external user calls vfio_external_user_iommu_id()
1635  * to know an IOMMU ID.
1636  *
1637  * 4. When the external KVM finishes, it calls
1638  * vfio_group_put_external_user() to release the VFIO group.
1639  * This call decrements the container user counter.
1640  */
1641 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1642 {
1643 	struct vfio_group *group = filep->private_data;
1644 	int ret;
1645 
1646 	if (filep->f_op != &vfio_group_fops)
1647 		return ERR_PTR(-EINVAL);
1648 
1649 	ret = vfio_group_add_container_user(group);
1650 	if (ret)
1651 		return ERR_PTR(ret);
1652 
1653 	vfio_group_get(group);
1654 
1655 	return group;
1656 }
1657 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
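
/*
 * Illustrative sketch of an external user (a KVM-style consumer) following
 * the protocol above, given a group file descriptor handed over from
 * userspace.  The consumer-side bookkeeping is elided.
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *grp;
 *	int iommu_id;
 *
 *	if (!f.file)
 *		return -EBADF;
 *
 *	grp = vfio_group_get_external_user(f.file);
 *	if (IS_ERR(grp)) {
 *		fdput(f);
 *		return PTR_ERR(grp);
 *	}
 *	fdput(f);
 *
 *	iommu_id = vfio_external_user_iommu_id(grp);
 *	(track iommu_id and hold grp while the group remains in use)
 *
 *	vfio_group_put_external_user(grp);
 */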
1658 
1659 /**
1660  * External user API, exported by symbols to be linked dynamically.
1661  * The external user passes in a device pointer
1662  * to verify that:
1663  *	- A VFIO group is associated with the device;
1664  *	- IOMMU is set for the group.
1665  * If both checks passed, vfio_group_get_external_user_from_dev()
1666  * increments the container user counter to prevent the VFIO group
1667  * from disposal before external user exits and returns the pointer
1668  * to the VFIO group.
1669  *
1670  * When the external user finishes using the VFIO group, it calls
1671  * vfio_group_put_external_user() to release the VFIO group and
1672  * decrement the container user counter.
1673  *
1674  * @dev [in]	: device
1675  * Return error PTR or pointer to VFIO group.
1676  */
1677 
1678 struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1679 {
1680 	struct vfio_group *group;
1681 	int ret;
1682 
1683 	group = vfio_group_get_from_dev(dev);
1684 	if (!group)
1685 		return ERR_PTR(-ENODEV);
1686 
1687 	ret = vfio_group_add_container_user(group);
1688 	if (ret) {
1689 		vfio_group_put(group);
1690 		return ERR_PTR(ret);
1691 	}
1692 
1693 	return group;
1694 }
1695 EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1696 
1697 void vfio_group_put_external_user(struct vfio_group *group)
1698 {
1699 	vfio_group_try_dissolve_container(group);
1700 	vfio_group_put(group);
1701 }
1702 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1703 
1704 bool vfio_external_group_match_file(struct vfio_group *test_group,
1705 				    struct file *filep)
1706 {
1707 	struct vfio_group *group = filep->private_data;
1708 
1709 	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1710 }
1711 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1712 
1713 int vfio_external_user_iommu_id(struct vfio_group *group)
1714 {
1715 	return iommu_group_id(group->iommu_group);
1716 }
1717 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1718 
1719 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1720 {
1721 	return vfio_ioctl_check_extension(group->container, arg);
1722 }
1723 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1724 
1725 /**
1726  * Sub-module support
1727  */
1728 /*
1729  * Helper for managing a buffer of info chain capabilities: allocate or
1730  * reallocate a buffer with additional @size, filling in @id and @version
1731  * of the capability.  A pointer to the new capability is returned.
1732  *
1733  * NB. The chain is based at the head of the buffer, so new entries are
1734  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1735  * next offsets prior to copying to the user buffer.
1736  */
1737 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1738 					       size_t size, u16 id, u16 version)
1739 {
1740 	void *buf;
1741 	struct vfio_info_cap_header *header, *tmp;
1742 
1743 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1744 	if (!buf) {
1745 		kfree(caps->buf);
1746 		caps->size = 0;
1747 		return ERR_PTR(-ENOMEM);
1748 	}
1749 
1750 	caps->buf = buf;
1751 	header = buf + caps->size;
1752 
1753 	/* Eventually copied to user buffer, zero */
1754 	memset(header, 0, size);
1755 
1756 	header->id = id;
1757 	header->version = version;
1758 
1759 	/* Add to the end of the capability chain */
1760 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1761 		; /* nothing */
1762 
1763 	tmp->next = caps->size;
1764 	caps->size += size;
1765 
1766 	return header;
1767 }
1768 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1769 
1770 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1771 {
1772 	struct vfio_info_cap_header *tmp;
1773 	void *buf = (void *)caps->buf;
1774 
1775 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1776 		tmp->next += offset;
1777 }
1778 EXPORT_SYMBOL(vfio_info_cap_shift);
1779 
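/*
 * Editorial sketch, not part of the original file: the usual pattern for
 * building a capability chain in a *_GET_INFO ioctl handler.  "info" stands
 * for a caller-defined info structure with argsz/flags/cap_offset fields and
 * "arg" for the ioctl's user pointer; both are assumptions for illustration.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(*header), cap_id, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	// ... fill in the capability payload following the header ...
 *
 *	if (caps.size) {
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			// user buffer too small, only report the needed size
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			// chain offsets are buffer-relative; rebase them to
 *			// the start of the user buffer before copying out
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */
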
1780 int vfio_info_add_capability(struct vfio_info_cap *caps,
1781 			     struct vfio_info_cap_header *cap, size_t size)
1782 {
1783 	struct vfio_info_cap_header *header;
1784 
1785 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1786 	if (IS_ERR(header))
1787 		return PTR_ERR(header);
1788 
1789 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1790 
1791 	return 0;
1792 }
1793 EXPORT_SYMBOL(vfio_info_add_capability);
1794 
1795 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1796 				       int max_irq_type, size_t *data_size)
1797 {
1798 	unsigned long minsz;
1799 	size_t size;
1800 
1801 	minsz = offsetofend(struct vfio_irq_set, count);
1802 
1803 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1804 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1805 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1806 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1807 		return -EINVAL;
1808 
1809 	if (data_size)
1810 		*data_size = 0;
1811 
1812 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1813 		return -EINVAL;
1814 
1815 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1816 	case VFIO_IRQ_SET_DATA_NONE:
1817 		size = 0;
1818 		break;
1819 	case VFIO_IRQ_SET_DATA_BOOL:
1820 		size = sizeof(uint8_t);
1821 		break;
1822 	case VFIO_IRQ_SET_DATA_EVENTFD:
1823 		size = sizeof(int32_t);
1824 		break;
1825 	default:
1826 		return -EINVAL;
1827 	}
1828 
1829 	if (size) {
1830 		if (hdr->argsz - minsz < hdr->count * size)
1831 			return -EINVAL;
1832 
1833 		if (!data_size)
1834 			return -EINVAL;
1835 
1836 		*data_size = hdr->count * size;
1837 	}
1838 
1839 	return 0;
1840 }
1841 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1842 
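/*
 * Editorial sketch, not part of the original file: how a bus driver's
 * VFIO_DEVICE_SET_IRQS ioctl handler might use the helper above.  "arg" is
 * the ioctl's user pointer and "num_irqs"/"max_irq_type" are driver-specific
 * values; all three are assumptions for illustration.
 *
 *	struct vfio_irq_set hdr;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 max_irq_type, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		// hdr.count entries of the requested data type follow the header
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *
 *	// ... program the interrupts, then kfree(data) ...
 */
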
1843 /*
1844  * Pin a set of guest PFNs and return their associated host PFNs for the local
1845  * domain only.
1846  * @dev [in]     : device
1847  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1848  * @npage [in]   : count of elements in user_pfn array.  This count should not
1849  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1850  * @prot [in]    : protection flags
1851  * @phys_pfn[out]: array of host PFNs
1852  * Return error or number of pages pinned.
1853  */
1854 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1855 		   int prot, unsigned long *phys_pfn)
1856 {
1857 	struct vfio_container *container;
1858 	struct vfio_group *group;
1859 	struct vfio_iommu_driver *driver;
1860 	int ret;
1861 
1862 	if (!dev || !user_pfn || !phys_pfn || !npage)
1863 		return -EINVAL;
1864 
1865 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1866 		return -E2BIG;
1867 
1868 	group = vfio_group_get_from_dev(dev);
1869 	if (!group)
1870 		return -ENODEV;
1871 
1872 	if (group->dev_counter > 1) {
1873 		ret = -EINVAL;
1874 		goto err_pin_pages;
1875 	}
1876 
1877 	ret = vfio_group_add_container_user(group);
1878 	if (ret)
1879 		goto err_pin_pages;
1880 
1881 	container = group->container;
1882 	driver = container->iommu_driver;
1883 	if (likely(driver && driver->ops->pin_pages))
1884 		ret = driver->ops->pin_pages(container->iommu_data,
1885 					     group->iommu_group, user_pfn,
1886 					     npage, prot, phys_pfn);
1887 	else
1888 		ret = -ENOTTY;
1889 
1890 	vfio_group_try_dissolve_container(group);
1891 
1892 err_pin_pages:
1893 	vfio_group_put(group);
1894 	return ret;
1895 }
1896 EXPORT_SYMBOL(vfio_pin_pages);
1897 
1898 /*
1899  * Unpin a set of host PFNs for the local domain only.
1900  * @dev [in]     : device
1901  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1902  *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1903  * @npage [in]   : count of elements in user_pfn array.  This count should not
1904  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1905  * Return error or number of pages unpinned.
1906  */
1907 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1908 {
1909 	struct vfio_container *container;
1910 	struct vfio_group *group;
1911 	struct vfio_iommu_driver *driver;
1912 	int ret;
1913 
1914 	if (!dev || !user_pfn || !npage)
1915 		return -EINVAL;
1916 
1917 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1918 		return -E2BIG;
1919 
1920 	group = vfio_group_get_from_dev(dev);
1921 	if (!group)
1922 		return -ENODEV;
1923 
1924 	ret = vfio_group_add_container_user(group);
1925 	if (ret)
1926 		goto err_unpin_pages;
1927 
1928 	container = group->container;
1929 	driver = container->iommu_driver;
1930 	if (likely(driver && driver->ops->unpin_pages))
1931 		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1932 					       npage);
1933 	else
1934 		ret = -ENOTTY;
1935 
1936 	vfio_group_try_dissolve_container(group);
1937 
1938 err_unpin_pages:
1939 	vfio_group_put(group);
1940 	return ret;
1941 }
1942 EXPORT_SYMBOL(vfio_unpin_pages);
1943 
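/*
 * Editorial sketch, not part of the original file: a mediated device driver
 * translating a single guest PFN around an access might pair the two
 * helpers above roughly as follows.  "mdev_dev" and "gfn" are illustrative
 * placeholders; IOMMU_READ/IOMMU_WRITE come from <linux/iommu.h>.
 *
 *	unsigned long user_pfn = gfn;
 *	unsigned long phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev, &user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	// ... access pfn_to_page(phys_pfn) or set up device DMA to it ...
 *
 *	vfio_unpin_pages(mdev_dev, &user_pfn, 1);
 */
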
1944 /*
1945  * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
1946  * VFIO group.
1947  *
1948  * The caller needs to call vfio_group_get_external_user() or
1949  * vfio_group_get_external_user_from_dev() prior to calling this interface,
1950  * so as to prevent the VFIO group from disposal in the middle of the call.
1951  * The caller may keep the reference to the VFIO group across several
1952  * calls into this interface.
1953  * When it has finished using the VFIO group, the caller must release it
1954  * by calling vfio_group_put_external_user().
1955  *
1956  * @group [in]		: VFIO group
1957  * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
1958  * @npage [in]		: count of elements in user_iova_pfn array.
1959  *			  This count should not be greater than
1960  *			  VFIO_PIN_PAGES_MAX_ENTRIES.
1961  * @prot [in]		: protection flags
1962  * @phys_pfn [out]	: array of host PFNs
1963  * Return error or number of pages pinned.
1964  */
1965 int vfio_group_pin_pages(struct vfio_group *group,
1966 			 unsigned long *user_iova_pfn, int npage,
1967 			 int prot, unsigned long *phys_pfn)
1968 {
1969 	struct vfio_container *container;
1970 	struct vfio_iommu_driver *driver;
1971 	int ret;
1972 
1973 	if (!group || !user_iova_pfn || !phys_pfn || !npage)
1974 		return -EINVAL;
1975 
1976 	if (group->dev_counter > 1)
1977 		return -EINVAL;
1978 
1979 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1980 		return -E2BIG;
1981 
1982 	container = group->container;
1983 	driver = container->iommu_driver;
1984 	if (likely(driver && driver->ops->pin_pages))
1985 		ret = driver->ops->pin_pages(container->iommu_data,
1986 					     group->iommu_group, user_iova_pfn,
1987 					     npage, prot, phys_pfn);
1988 	else
1989 		ret = -ENOTTY;
1990 
1991 	return ret;
1992 }
1993 EXPORT_SYMBOL(vfio_group_pin_pages);
1994 
1995 /*
1996  * Unpin a set of guest IOVA PFNs for a VFIO group.
1997  *
1998  * The caller needs to call vfio_group_get_external_user() or
1999  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2000  * so as to prevent the VFIO group from disposal in the middle of the call.
2001  * The caller may keep the reference to the VFIO group across several
2002  * calls into this interface.
2003  * When it has finished using the VFIO group, the caller must release it
2004  * by calling vfio_group_put_external_user().
2005  *
2006  * @group [in]		: vfio group
2007  * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
2008  * @npage [in]		: count of elements in user_iova_pfn array.
2009  *			  This count should not be greater than
2010  *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2011  * Return error or number of pages unpinned.
2012  */
2013 int vfio_group_unpin_pages(struct vfio_group *group,
2014 			   unsigned long *user_iova_pfn, int npage)
2015 {
2016 	struct vfio_container *container;
2017 	struct vfio_iommu_driver *driver;
2018 	int ret;
2019 
2020 	if (!group || !user_iova_pfn || !npage)
2021 		return -EINVAL;
2022 
2023 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2024 		return -E2BIG;
2025 
2026 	container = group->container;
2027 	driver = container->iommu_driver;
2028 	if (likely(driver && driver->ops->unpin_pages))
2029 		ret = driver->ops->unpin_pages(container->iommu_data,
2030 					       user_iova_pfn, npage);
2031 	else
2032 		ret = -ENOTTY;
2033 
2034 	return ret;
2035 }
2036 EXPORT_SYMBOL(vfio_group_unpin_pages);
2037 
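/*
 * Editorial sketch, not part of the original file: the group-based variants
 * assume the caller already holds an external user reference, e.g.:
 *
 *	struct vfio_group *group;
 *	unsigned long iova_pfn = gfn;	// hypothetical guest IOVA PFN
 *	unsigned long phys_pfn;
 *	int ret;
 *
 *	group = vfio_group_get_external_user_from_dev(dev);
 *	if (IS_ERR(group))
 *		return PTR_ERR(group);
 *
 *	ret = vfio_group_pin_pages(group, &iova_pfn, 1,
 *				   IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret == 1) {
 *		// ... use the pinned page ...
 *		vfio_group_unpin_pages(group, &iova_pfn, 1);
 *	}
 *
 *	vfio_group_put_external_user(group);
 */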
2038 
2039 /*
2040  * This interface allows the CPUs to perform some sort of virtual DMA on
2041  * behalf of the device.
2042  *
2043  * CPUs read from or write to a range of IOVAs that map user space memory,
2044  * copying the data into or out of a kernel buffer.
2045  *
2046  * As the read/write of user space memory is conducted via the CPUs and is
2047  * not a real device DMA, it is not necessary to pin the user space memory.
2048  *
2049  * The caller needs to call vfio_group_get_external_user() or
2050  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2051  * so as to prevent the VFIO group from disposal in the middle of the call.
2052  * The caller may keep the reference to the VFIO group across several
2053  * calls into this interface.
2054  * When it has finished using the VFIO group, the caller must release it
2055  * by calling vfio_group_put_external_user().
2056  *
2057  * @group [in]		: VFIO group
2058  * @user_iova [in]	: base IOVA of a user space buffer
2059  * @data [in]		: pointer to kernel buffer
2060  * @len [in]		: kernel buffer length
2061  * @write [in]		: true to copy @data into the IOVA range, false to read from it
2062  * Return error code on failure or 0 on success.
2063  */
2064 int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2065 		void *data, size_t len, bool write)
2066 {
2067 	struct vfio_container *container;
2068 	struct vfio_iommu_driver *driver;
2069 	int ret = 0;
2070 
2071 	if (!group || !data || len <= 0)
2072 		return -EINVAL;
2073 
2074 	container = group->container;
2075 	driver = container->iommu_driver;
2076 
2077 	if (likely(driver && driver->ops->dma_rw))
2078 		ret = driver->ops->dma_rw(container->iommu_data,
2079 					  user_iova, data, len, write);
2080 	else
2081 		ret = -ENOTTY;
2082 
2083 	return ret;
2084 }
2085 EXPORT_SYMBOL(vfio_dma_rw);
2086 
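/*
 * Editorial sketch, not part of the original file: copying a small
 * guest-visible structure through the container's IOVA space.  "group" is
 * assumed to carry an external user reference as described above, and
 * "struct some_desc"/"iova" are made-up names for illustration.
 *
 *	struct some_desc desc;
 *	int ret;
 *
 *	// read sizeof(desc) bytes at guest IOVA 'iova' into 'desc'
 *	ret = vfio_dma_rw(group, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 *	desc.status = 1;
 *
 *	// write the updated structure back to the same IOVA
 *	ret = vfio_dma_rw(group, iova, &desc, sizeof(desc), true);
 */
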
2087 static int vfio_register_iommu_notifier(struct vfio_group *group,
2088 					unsigned long *events,
2089 					struct notifier_block *nb)
2090 {
2091 	struct vfio_container *container;
2092 	struct vfio_iommu_driver *driver;
2093 	int ret;
2094 
2095 	ret = vfio_group_add_container_user(group);
2096 	if (ret)
2097 		return -EINVAL;
2098 
2099 	container = group->container;
2100 	driver = container->iommu_driver;
2101 	if (likely(driver && driver->ops->register_notifier))
2102 		ret = driver->ops->register_notifier(container->iommu_data,
2103 						     events, nb);
2104 	else
2105 		ret = -ENOTTY;
2106 
2107 	vfio_group_try_dissolve_container(group);
2108 
2109 	return ret;
2110 }
2111 
2112 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2113 					  struct notifier_block *nb)
2114 {
2115 	struct vfio_container *container;
2116 	struct vfio_iommu_driver *driver;
2117 	int ret;
2118 
2119 	ret = vfio_group_add_container_user(group);
2120 	if (ret)
2121 		return -EINVAL;
2122 
2123 	container = group->container;
2124 	driver = container->iommu_driver;
2125 	if (likely(driver && driver->ops->unregister_notifier))
2126 		ret = driver->ops->unregister_notifier(container->iommu_data,
2127 						       nb);
2128 	else
2129 		ret = -ENOTTY;
2130 
2131 	vfio_group_try_dissolve_container(group);
2132 
2133 	return ret;
2134 }
2135 
2136 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2137 {
2138 	group->kvm = kvm;
2139 	blocking_notifier_call_chain(&group->notifier,
2140 				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2141 }
2142 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2143 
2144 static int vfio_register_group_notifier(struct vfio_group *group,
2145 					unsigned long *events,
2146 					struct notifier_block *nb)
2147 {
2148 	int ret;
2149 	bool set_kvm = false;
2150 
2151 	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2152 		set_kvm = true;
2153 
2154 	/* clear known events */
2155 	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2156 
2157 	/* refuse to continue if any unknown events remain */
2158 	if (*events)
2159 		return -EINVAL;
2160 
2161 	ret = vfio_group_add_container_user(group);
2162 	if (ret)
2163 		return -EINVAL;
2164 
2165 	ret = blocking_notifier_chain_register(&group->notifier, nb);
2166 
2167 	/*
2168 	 * The attaching of the kvm to the vfio_group may already have
2169 	 * happened, so replay the event once upon registration.
2170 	 */
2171 	if (!ret && set_kvm && group->kvm)
2172 		blocking_notifier_call_chain(&group->notifier,
2173 					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2174 
2175 	vfio_group_try_dissolve_container(group);
2176 
2177 	return ret;
2178 }
2179 
2180 static int vfio_unregister_group_notifier(struct vfio_group *group,
2181 					 struct notifier_block *nb)
2182 {
2183 	int ret;
2184 
2185 	ret = vfio_group_add_container_user(group);
2186 	if (ret)
2187 		return -EINVAL;
2188 
2189 	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2190 
2191 	vfio_group_try_dissolve_container(group);
2192 
2193 	return ret;
2194 }
2195 
2196 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2197 			   unsigned long *events, struct notifier_block *nb)
2198 {
2199 	struct vfio_group *group;
2200 	int ret;
2201 
2202 	if (!dev || !nb || !events || (*events == 0))
2203 		return -EINVAL;
2204 
2205 	group = vfio_group_get_from_dev(dev);
2206 	if (!group)
2207 		return -ENODEV;
2208 
2209 	switch (type) {
2210 	case VFIO_IOMMU_NOTIFY:
2211 		ret = vfio_register_iommu_notifier(group, events, nb);
2212 		break;
2213 	case VFIO_GROUP_NOTIFY:
2214 		ret = vfio_register_group_notifier(group, events, nb);
2215 		break;
2216 	default:
2217 		ret = -EINVAL;
2218 	}
2219 
2220 	vfio_group_put(group);
2221 	return ret;
2222 }
2223 EXPORT_SYMBOL(vfio_register_notifier);
2224 
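/*
 * Editorial sketch, not part of the original file: registering for the
 * group KVM notification from a vendor driver.  The callback body is
 * illustrative; note the event mask is passed by reference and the known
 * bits are cleared by the registration path above.
 *
 *	static int example_group_notify(struct notifier_block *nb,
 *					unsigned long action, void *data)
 *	{
 *		if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
 *			struct kvm *kvm = data;	// NULL when KVM detaches
 *			// ... cache or drop the kvm pointer ...
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_group_notify,
 *	};
 *
 *	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
 *	int ret = vfio_register_notifier(dev, VFIO_GROUP_NOTIFY,
 *					 &events, &example_nb);
 *	// ... later, on teardown ...
 *	vfio_unregister_notifier(dev, VFIO_GROUP_NOTIFY, &example_nb);
 */
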
2225 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2226 			     struct notifier_block *nb)
2227 {
2228 	struct vfio_group *group;
2229 	int ret;
2230 
2231 	if (!dev || !nb)
2232 		return -EINVAL;
2233 
2234 	group = vfio_group_get_from_dev(dev);
2235 	if (!group)
2236 		return -ENODEV;
2237 
2238 	switch (type) {
2239 	case VFIO_IOMMU_NOTIFY:
2240 		ret = vfio_unregister_iommu_notifier(group, nb);
2241 		break;
2242 	case VFIO_GROUP_NOTIFY:
2243 		ret = vfio_unregister_group_notifier(group, nb);
2244 		break;
2245 	default:
2246 		ret = -EINVAL;
2247 	}
2248 
2249 	vfio_group_put(group);
2250 	return ret;
2251 }
2252 EXPORT_SYMBOL(vfio_unregister_notifier);
2253 
2254 struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2255 {
2256 	struct vfio_container *container;
2257 	struct vfio_iommu_driver *driver;
2258 
2259 	if (!group)
2260 		return ERR_PTR(-EINVAL);
2261 
2262 	container = group->container;
2263 	driver = container->iommu_driver;
2264 	if (likely(driver && driver->ops->group_iommu_domain))
2265 		return driver->ops->group_iommu_domain(container->iommu_data,
2266 						       group->iommu_group);
2267 
2268 	return ERR_PTR(-ENOTTY);
2269 }
2270 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2271 
2272 /**
2273  * Module/class support
2274  */
2275 static char *vfio_devnode(struct device *dev, umode_t *mode)
2276 {
2277 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2278 }
2279 
2280 static struct miscdevice vfio_dev = {
2281 	.minor = VFIO_MINOR,
2282 	.name = "vfio",
2283 	.fops = &vfio_fops,
2284 	.nodename = "vfio/vfio",
2285 	.mode = S_IRUGO | S_IWUGO,
2286 };
2287 
2288 static int __init vfio_init(void)
2289 {
2290 	int ret;
2291 
2292 	idr_init(&vfio.group_idr);
2293 	mutex_init(&vfio.group_lock);
2294 	mutex_init(&vfio.iommu_drivers_lock);
2295 	INIT_LIST_HEAD(&vfio.group_list);
2296 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2297 
2298 	ret = misc_register(&vfio_dev);
2299 	if (ret) {
2300 		pr_err("vfio: misc device register failed\n");
2301 		return ret;
2302 	}
2303 
2304 	/* /dev/vfio/$GROUP */
2305 	vfio.class = class_create(THIS_MODULE, "vfio");
2306 	if (IS_ERR(vfio.class)) {
2307 		ret = PTR_ERR(vfio.class);
2308 		goto err_class;
2309 	}
2310 
2311 	vfio.class->devnode = vfio_devnode;
2312 
2313 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2314 	if (ret)
2315 		goto err_alloc_chrdev;
2316 
2317 	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2318 	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2319 	if (ret)
2320 		goto err_cdev_add;
2321 
2322 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2323 
2324 #ifdef CONFIG_VFIO_NOIOMMU
2325 	vfio_register_iommu_driver(&vfio_noiommu_ops);
2326 #endif
2327 	return 0;
2328 
2329 err_cdev_add:
2330 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2331 err_alloc_chrdev:
2332 	class_destroy(vfio.class);
2333 	vfio.class = NULL;
2334 err_class:
2335 	misc_deregister(&vfio_dev);
2336 	return ret;
2337 }
2338 
2339 static void __exit vfio_cleanup(void)
2340 {
2341 	WARN_ON(!list_empty(&vfio.group_list));
2342 
2343 #ifdef CONFIG_VFIO_NOIOMMU
2344 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2345 #endif
2346 	idr_destroy(&vfio.group_idr);
2347 	cdev_del(&vfio.group_cdev);
2348 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2349 	class_destroy(vfio.class);
2350 	vfio.class = NULL;
2351 	misc_deregister(&vfio_dev);
2352 }
2353 
2354 module_init(vfio_init);
2355 module_exit(vfio_cleanup);
2356 
2357 MODULE_VERSION(DRIVER_VERSION);
2358 MODULE_LICENSE("GPL v2");
2359 MODULE_AUTHOR(DRIVER_AUTHOR);
2360 MODULE_DESCRIPTION(DRIVER_DESC);
2361 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2362 MODULE_ALIAS("devname:vfio/vfio");
2363 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2364