xref: /linux/drivers/vfio/vfio_main.c (revision e91c37f1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION	"0.3"
43 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC	"VFIO - User Level meta-driver"
45 
46 static struct vfio {
47 	struct class			*device_class;
48 	struct ida			device_ida;
49 } vfio;
50 
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56 #endif
57 
58 static DEFINE_XARRAY(vfio_device_set_xa);
59 
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62 	unsigned long idx = (unsigned long)set_id;
63 	struct vfio_device_set *new_dev_set;
64 	struct vfio_device_set *dev_set;
65 
66 	if (WARN_ON(!set_id))
67 		return -EINVAL;
68 
69 	/*
70 	 * Atomically acquire a singleton object in the xarray for this set_id
71 	 */
72 	xa_lock(&vfio_device_set_xa);
73 	dev_set = xa_load(&vfio_device_set_xa, idx);
74 	if (dev_set)
75 		goto found_get_ref;
76 	xa_unlock(&vfio_device_set_xa);
77 
78 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 	if (!new_dev_set)
80 		return -ENOMEM;
81 	mutex_init(&new_dev_set->lock);
82 	INIT_LIST_HEAD(&new_dev_set->device_list);
83 	new_dev_set->set_id = set_id;
84 
85 	xa_lock(&vfio_device_set_xa);
86 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 			       GFP_KERNEL);
88 	if (!dev_set) {
89 		dev_set = new_dev_set;
90 		goto found_get_ref;
91 	}
92 
93 	kfree(new_dev_set);
94 	if (xa_is_err(dev_set)) {
95 		xa_unlock(&vfio_device_set_xa);
96 		return xa_err(dev_set);
97 	}
98 
99 found_get_ref:
100 	dev_set->device_count++;
101 	xa_unlock(&vfio_device_set_xa);
102 	mutex_lock(&dev_set->lock);
103 	device->dev_set = dev_set;
104 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 	mutex_unlock(&dev_set->lock);
106 	return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109 
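/*
 * Illustrative sketch (hypothetical driver code, not from this file): a
 * driver whose devices can only be reset together passes the same set_id
 * for all of them, typically the shared parent object the reset acts on:
 *
 *	ret = vfio_assign_device_set(&my->vdev, my_reset_domain(pdev));
 *	if (ret)
 *		return ret;
 *
 * vfio-pci, for example, keys the set on the PCI slot or bus when only a
 * slot/bus reset is available.  my_reset_domain() above is a hypothetical
 * helper standing in for that choice.
 */
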
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112 	struct vfio_device_set *dev_set = device->dev_set;
113 
114 	if (!dev_set)
115 		return;
116 
117 	mutex_lock(&dev_set->lock);
118 	list_del(&device->dev_set_list);
119 	mutex_unlock(&dev_set->lock);
120 
121 	xa_lock(&vfio_device_set_xa);
122 	if (!--dev_set->device_count) {
123 		__xa_erase(&vfio_device_set_xa,
124 			   (unsigned long)dev_set->set_id);
125 		mutex_destroy(&dev_set->lock);
126 		kfree(dev_set);
127 	}
128 	xa_unlock(&vfio_device_set_xa);
129 }
130 
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133 	struct vfio_device *cur;
134 	unsigned int open_count = 0;
135 
136 	lockdep_assert_held(&dev_set->lock);
137 
138 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 		open_count += cur->open_count;
140 	return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143 
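/*
 * Illustrative sketch (hypothetical, not from this file): with
 * dev_set->lock held, a driver can check whether the device being closed
 * is the last one open in the whole set before doing a set-wide reset:
 *
 *	lockdep_assert_held(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) == 1)
 *		my_reset_device_set(vdev->dev_set);	// hypothetical helper
 */
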
144 struct vfio_device *
145 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146 			   struct device *dev)
147 {
148 	struct vfio_device *cur;
149 
150 	lockdep_assert_held(&dev_set->lock);
151 
152 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
153 		if (cur->dev == dev)
154 			return cur;
155 	return NULL;
156 }
157 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
158 
159 /*
160  * Device objects - create, release, get, put, search
161  */
162 /* Device reference always implies a group reference */
163 void vfio_device_put_registration(struct vfio_device *device)
164 {
165 	if (refcount_dec_and_test(&device->refcount))
166 		complete(&device->comp);
167 }
168 
169 bool vfio_device_try_get_registration(struct vfio_device *device)
170 {
171 	return refcount_inc_not_zero(&device->refcount);
172 }
173 
174 /*
175  * VFIO driver API
176  */
177 /* Release helper called by vfio_put_device() */
178 static void vfio_device_release(struct device *dev)
179 {
180 	struct vfio_device *device =
181 			container_of(dev, struct vfio_device, device);
182 
183 	vfio_release_device_set(device);
184 	ida_free(&vfio.device_ida, device->index);
185 
186 	if (device->ops->release)
187 		device->ops->release(device);
188 
189 	kvfree(device);
190 }
191 
192 static int vfio_init_device(struct vfio_device *device, struct device *dev,
193 			    const struct vfio_device_ops *ops);
194 
195 /*
196  * Allocate and initialize vfio_device so it can be registered to vfio
197  * core.
198  *
199  * Drivers should use the wrapper vfio_alloc_device() for allocation.
200  * @size is the size of the structure to be allocated, including any
201  * private data used by the driver.
202  *
203  * Drivers may provide an @init callback to initialize device private data.
204  *
205  * Use vfio_put_device() to release the structure after a successful return.
206  */
207 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208 				       const struct vfio_device_ops *ops)
209 {
210 	struct vfio_device *device;
211 	int ret;
212 
213 	if (WARN_ON(size < sizeof(struct vfio_device)))
214 		return ERR_PTR(-EINVAL);
215 
216 	device = kvzalloc(size, GFP_KERNEL);
217 	if (!device)
218 		return ERR_PTR(-ENOMEM);
219 
220 	ret = vfio_init_device(device, dev, ops);
221 	if (ret)
222 		goto out_free;
223 	return device;
224 
225 out_free:
226 	kvfree(device);
227 	return ERR_PTR(ret);
228 }
229 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
230 
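/*
 * Illustrative sketch (hypothetical driver code): the usual pattern is to
 * embed struct vfio_device as the first member of the driver's own state
 * and allocate through the vfio_alloc_device() wrapper, which fills in
 * @size for _vfio_alloc_device():
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;	// must be at offset 0
 *		void __iomem *regs;		// driver-private data
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);		// drop the reference when done
 */
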
231 /*
232  * Initialize a vfio_device so it can be registered to vfio core.
233  */
234 static int vfio_init_device(struct vfio_device *device, struct device *dev,
235 			    const struct vfio_device_ops *ops)
236 {
237 	int ret;
238 
239 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
240 	if (ret < 0) {
241 		dev_dbg(dev, "Failed to allocate device index\n");
242 		return ret;
243 	}
244 
245 	device->index = ret;
246 	init_completion(&device->comp);
247 	device->dev = dev;
248 	device->ops = ops;
249 
250 	if (ops->init) {
251 		ret = ops->init(device);
252 		if (ret)
253 			goto out_uninit;
254 	}
255 
256 	device_initialize(&device->device);
257 	device->device.release = vfio_device_release;
258 	device->device.class = vfio.device_class;
259 	device->device.parent = device->dev;
260 	return 0;
261 
262 out_uninit:
263 	vfio_release_device_set(device);
264 	ida_free(&vfio.device_ida, device->index);
265 	return ret;
266 }
267 
268 static int __vfio_register_dev(struct vfio_device *device,
269 			       enum vfio_group_type type)
270 {
271 	int ret;
272 
273 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
274 		    (!device->ops->bind_iommufd ||
275 		     !device->ops->unbind_iommufd ||
276 		     !device->ops->attach_ioas ||
277 		     !device->ops->detach_ioas)))
278 		return -EINVAL;
279 
280 	/*
281 	 * If the driver doesn't specify a set then the device is added to a
282 	 * singleton set just for itself.
283 	 */
284 	if (!device->dev_set)
285 		vfio_assign_device_set(device, device);
286 
287 	ret = dev_set_name(&device->device, "vfio%d", device->index);
288 	if (ret)
289 		return ret;
290 
291 	ret = vfio_device_set_group(device, type);
292 	if (ret)
293 		return ret;
294 
295 	/*
296 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
297 	 * restore cache coherency. It has to be checked here because it is only
298 	 * valid for cases where we are using iommu groups.
299 	 */
300 	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
301 	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
302 		ret = -EINVAL;
303 		goto err_out;
304 	}
305 
306 	ret = vfio_device_add(device);
307 	if (ret)
308 		goto err_out;
309 
310 	/* Refcounting can't start until the driver calls register */
311 	refcount_set(&device->refcount, 1);
312 
313 	vfio_device_group_register(device);
314 	vfio_device_debugfs_init(device);
315 
316 	return 0;
317 err_out:
318 	vfio_device_remove_group(device);
319 	return ret;
320 }
321 
322 int vfio_register_group_dev(struct vfio_device *device)
323 {
324 	return __vfio_register_dev(device, VFIO_IOMMU);
325 }
326 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
327 
328 /*
329  * Register a virtual device without IOMMU backing.  The user of this
330  * device must not be able to directly trigger unmediated DMA.
331  */
332 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
333 {
334 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
335 }
336 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
337 
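/*
 * Illustrative probe/remove sketch (hypothetical driver code): once the
 * device is allocated and its private state initialized, the driver
 * registers it, and on the error/remove path unregisters it and drops
 * its reference:
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	...
 *	// remove path:
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 */
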
338 /*
339  * Decrement the device reference count and wait for the device to be
340  * removed.  Open file descriptors for the device... */
341 void vfio_unregister_group_dev(struct vfio_device *device)
342 {
343 	unsigned int i = 0;
344 	bool interrupted = false;
345 	long rc;
346 
347 	/*
348 	 * Prevent the device from being newly opened by userspace via
349 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
350 	 */
351 	vfio_device_group_unregister(device);
352 
353 	/*
354 	 * Balances vfio_device_add() in the register path and also prevents
355 	 * the device from being newly opened by userspace in the cdev path.
356 	 */
357 	vfio_device_del(device);
358 
359 	vfio_device_put_registration(device);
360 	rc = try_wait_for_completion(&device->comp);
361 	while (rc <= 0) {
362 		if (device->ops->request)
363 			device->ops->request(device, i++);
364 
365 		if (interrupted) {
366 			rc = wait_for_completion_timeout(&device->comp,
367 							 HZ * 10);
368 		} else {
369 			rc = wait_for_completion_interruptible_timeout(
370 				&device->comp, HZ * 10);
371 			if (rc < 0) {
372 				interrupted = true;
373 				dev_warn(device->dev,
374 					 "Device is currently in use, task"
375 					 " \"%s\" (%d) "
376 					 "blocked until device is released",
377 					 current->comm, task_pid_nr(current));
378 			}
379 		}
380 	}
381 
382 	vfio_device_debugfs_exit(device);
383 	/* Balances vfio_device_set_group in register path */
384 	vfio_device_remove_group(device);
385 }
386 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
387 
388 #ifdef CONFIG_HAVE_KVM
389 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
390 {
391 	void (*pfn)(struct kvm *kvm);
392 	bool (*fn)(struct kvm *kvm);
393 	bool ret;
394 
395 	lockdep_assert_held(&device->dev_set->lock);
396 
397 	if (!kvm)
398 		return;
399 
400 	pfn = symbol_get(kvm_put_kvm);
401 	if (WARN_ON(!pfn))
402 		return;
403 
404 	fn = symbol_get(kvm_get_kvm_safe);
405 	if (WARN_ON(!fn)) {
406 		symbol_put(kvm_put_kvm);
407 		return;
408 	}
409 
410 	ret = fn(kvm);
411 	symbol_put(kvm_get_kvm_safe);
412 	if (!ret) {
413 		symbol_put(kvm_put_kvm);
414 		return;
415 	}
416 
417 	device->put_kvm = pfn;
418 	device->kvm = kvm;
419 }
420 
421 void vfio_device_put_kvm(struct vfio_device *device)
422 {
423 	lockdep_assert_held(&device->dev_set->lock);
424 
425 	if (!device->kvm)
426 		return;
427 
428 	if (WARN_ON(!device->put_kvm))
429 		goto clear;
430 
431 	device->put_kvm(device->kvm);
432 	device->put_kvm = NULL;
433 	symbol_put(kvm_put_kvm);
434 
435 clear:
436 	device->kvm = NULL;
437 }
438 #endif
439 
440 /* true if the vfio_device has open_device() called but not close_device() */
441 static bool vfio_assert_device_open(struct vfio_device *device)
442 {
443 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
444 }
445 
446 struct vfio_device_file *
447 vfio_allocate_device_file(struct vfio_device *device)
448 {
449 	struct vfio_device_file *df;
450 
451 	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
452 	if (!df)
453 		return ERR_PTR(-ENOMEM);
454 
455 	df->device = device;
456 	spin_lock_init(&df->kvm_ref_lock);
457 
458 	return df;
459 }
460 
461 static int vfio_df_device_first_open(struct vfio_device_file *df)
462 {
463 	struct vfio_device *device = df->device;
464 	struct iommufd_ctx *iommufd = df->iommufd;
465 	int ret;
466 
467 	lockdep_assert_held(&device->dev_set->lock);
468 
469 	if (!try_module_get(device->dev->driver->owner))
470 		return -ENODEV;
471 
472 	if (iommufd)
473 		ret = vfio_df_iommufd_bind(df);
474 	else
475 		ret = vfio_device_group_use_iommu(device);
476 	if (ret)
477 		goto err_module_put;
478 
479 	if (device->ops->open_device) {
480 		ret = device->ops->open_device(device);
481 		if (ret)
482 			goto err_unuse_iommu;
483 	}
484 	return 0;
485 
486 err_unuse_iommu:
487 	if (iommufd)
488 		vfio_df_iommufd_unbind(df);
489 	else
490 		vfio_device_group_unuse_iommu(device);
491 err_module_put:
492 	module_put(device->dev->driver->owner);
493 	return ret;
494 }
495 
496 static void vfio_df_device_last_close(struct vfio_device_file *df)
497 {
498 	struct vfio_device *device = df->device;
499 	struct iommufd_ctx *iommufd = df->iommufd;
500 
501 	lockdep_assert_held(&device->dev_set->lock);
502 
503 	if (device->ops->close_device)
504 		device->ops->close_device(device);
505 	if (iommufd)
506 		vfio_df_iommufd_unbind(df);
507 	else
508 		vfio_device_group_unuse_iommu(device);
509 	module_put(device->dev->driver->owner);
510 }
511 
512 int vfio_df_open(struct vfio_device_file *df)
513 {
514 	struct vfio_device *device = df->device;
515 	int ret = 0;
516 
517 	lockdep_assert_held(&device->dev_set->lock);
518 
519 	/*
520 	 * Only the group path allows the device to be opened multiple
521 	 * times.  The device cdev path doesn't have a secure way to support it.
522 	 */
523 	if (device->open_count != 0 && !df->group)
524 		return -EINVAL;
525 
526 	device->open_count++;
527 	if (device->open_count == 1) {
528 		ret = vfio_df_device_first_open(df);
529 		if (ret)
530 			device->open_count--;
531 	}
532 
533 	return ret;
534 }
535 
536 void vfio_df_close(struct vfio_device_file *df)
537 {
538 	struct vfio_device *device = df->device;
539 
540 	lockdep_assert_held(&device->dev_set->lock);
541 
542 	vfio_assert_device_open(device);
543 	if (device->open_count == 1)
544 		vfio_df_device_last_close(df);
545 	device->open_count--;
546 }
547 
548 /*
549  * Wrapper around pm_runtime_resume_and_get().
550  * Return error code on failure or 0 on success.
551  */
552 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
553 {
554 	struct device *dev = device->dev;
555 
556 	if (dev->driver && dev->driver->pm) {
557 		int ret;
558 
559 		ret = pm_runtime_resume_and_get(dev);
560 		if (ret) {
561 			dev_info_ratelimited(dev,
562 				"vfio: runtime resume failed %d\n", ret);
563 			return -EIO;
564 		}
565 	}
566 
567 	return 0;
568 }
569 
570 /*
571  * Wrapper around pm_runtime_put().
572  */
573 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
574 {
575 	struct device *dev = device->dev;
576 
577 	if (dev->driver && dev->driver->pm)
578 		pm_runtime_put(dev);
579 }
580 
581 /*
582  * VFIO Device fd
583  */
584 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
585 {
586 	struct vfio_device_file *df = filep->private_data;
587 	struct vfio_device *device = df->device;
588 
589 	if (df->group)
590 		vfio_df_group_close(df);
591 	else
592 		vfio_df_unbind_iommufd(df);
593 
594 	vfio_device_put_registration(device);
595 
596 	kfree(df);
597 
598 	return 0;
599 }
600 
601 /*
602  * vfio_mig_get_next_state - Compute the next step in the FSM
603  * @cur_fsm - The current state the device is in
604  * @new_fsm - The target state to reach
605  * @next_fsm - Pointer to the next step to get to new_fsm
606  *
607  * Return 0 upon success, otherwise -errno
608  * Upon success the next step in the state progression between cur_fsm and
609  * new_fsm will be set in next_fsm.
610  *
611  * This breaks down requests for combination transitions into smaller steps and
612  * returns the next step to get to new_fsm. The function may need to be called
613  * multiple times before reaching new_fsm.
614  *
615  */
616 int vfio_mig_get_next_state(struct vfio_device *device,
617 			    enum vfio_device_mig_state cur_fsm,
618 			    enum vfio_device_mig_state new_fsm,
619 			    enum vfio_device_mig_state *next_fsm)
620 {
621 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
622 	/*
623 	 * The coding in this table requires the driver to implement the
624 	 * following FSM arcs:
625 	 *         RESUMING -> STOP
626 	 *         STOP -> RESUMING
627 	 *         STOP -> STOP_COPY
628 	 *         STOP_COPY -> STOP
629 	 *
630 	 * If P2P is supported then the driver must also implement these FSM
631 	 * arcs:
632 	 *         RUNNING -> RUNNING_P2P
633 	 *         RUNNING_P2P -> RUNNING
634 	 *         RUNNING_P2P -> STOP
635 	 *         STOP -> RUNNING_P2P
636 	 *
637 	 * If precopy is supported then the driver must support these additional
638 	 * FSM arcs:
639 	 *         RUNNING -> PRE_COPY
640 	 *         PRE_COPY -> RUNNING
641 	 *         PRE_COPY -> STOP_COPY
642 	 * However, if precopy and P2P are supported together then the driver
643 	 * must support these additional arcs beyond the P2P arcs above:
644 	 *         PRE_COPY -> RUNNING
645 	 *         PRE_COPY -> PRE_COPY_P2P
646 	 *         PRE_COPY_P2P -> PRE_COPY
647 	 *         PRE_COPY_P2P -> RUNNING_P2P
648 	 *         PRE_COPY_P2P -> STOP_COPY
649 	 *         RUNNING -> PRE_COPY
650 	 *         RUNNING_P2P -> PRE_COPY_P2P
651 	 *
652 	 * Without P2P and precopy the driver must implement:
653 	 *         RUNNING -> STOP
654 	 *         STOP -> RUNNING
655 	 *
656 	 * The coding will step through multiple states for some combination
657 	 * transitions; if all optional features are supported, this means the
658 	 * following ones:
659 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
660 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
661 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
662 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
663 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
664 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
665 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
666 	 *         RESUMING -> STOP -> RUNNING_P2P
667 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
668 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
669 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
670 	 *         RESUMING -> STOP -> STOP_COPY
671 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
672 	 *         RUNNING -> RUNNING_P2P -> STOP
673 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
674 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
675 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
676 	 *         RUNNING_P2P -> STOP -> RESUMING
677 	 *         RUNNING_P2P -> STOP -> STOP_COPY
678 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
679 	 *         STOP -> RUNNING_P2P -> RUNNING
680 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
681 	 *         STOP_COPY -> STOP -> RESUMING
682 	 *         STOP_COPY -> STOP -> RUNNING_P2P
683 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
684 	 *
685 	 *  The following transitions are blocked:
686 	 *         STOP_COPY -> PRE_COPY
687 	 *         STOP_COPY -> PRE_COPY_P2P
688 	 */
689 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
690 		[VFIO_DEVICE_STATE_STOP] = {
691 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
692 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
693 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
694 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
695 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
696 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
697 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
698 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
699 		},
700 		[VFIO_DEVICE_STATE_RUNNING] = {
701 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
702 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
703 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
704 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
705 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
706 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
707 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
708 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
709 		},
710 		[VFIO_DEVICE_STATE_PRE_COPY] = {
711 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
712 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
713 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
714 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
715 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
716 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
717 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
718 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
719 		},
720 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
721 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
722 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
723 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
724 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
725 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
726 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
727 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
728 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
729 		},
730 		[VFIO_DEVICE_STATE_STOP_COPY] = {
731 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
732 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
733 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
734 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
735 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
736 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
737 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
738 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
739 		},
740 		[VFIO_DEVICE_STATE_RESUMING] = {
741 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
742 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
743 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
744 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
745 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
746 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
747 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
748 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
749 		},
750 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
751 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
752 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
753 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
754 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
755 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
756 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
757 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
758 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
759 		},
760 		[VFIO_DEVICE_STATE_ERROR] = {
761 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
762 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
763 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
764 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
765 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
766 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
767 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
768 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
769 		},
770 	};
771 
772 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
773 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
774 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
775 		[VFIO_DEVICE_STATE_PRE_COPY] =
776 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
777 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
778 						   VFIO_MIGRATION_P2P |
779 						   VFIO_MIGRATION_PRE_COPY,
780 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
781 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
782 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
783 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
784 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
785 	};
786 
787 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
788 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
789 			state_flags_table[cur_fsm]))
790 		return -EINVAL;
791 
792 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
793 	   (state_flags_table[new_fsm] & device->migration_flags) !=
794 			state_flags_table[new_fsm])
795 		return -EINVAL;
796 
797 	/*
798 	 * Arcs touching optional and unsupported states are skipped over. The
799 	 * driver will instead see an arc from the original state to the next
800 	 * logical state, as per the above comment.
801 	 */
802 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
803 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
804 			state_flags_table[*next_fsm])
805 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
806 
807 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
808 }
809 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
810 
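/*
 * Illustrative sketch of how a migration driver is expected to consume
 * this helper (hypothetical code, modeled on in-tree users): the driver's
 * migration_set_state() op walks the FSM one arc at a time until the
 * requested state is reached:
 *
 *	while (cur != new) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		filp = my_step_device_state(vdev, next);	// hypothetical per-arc handler
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 */
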
811 /*
812  * Convert the driver's struct file into an FD number and return it to userspace
813  */
814 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
815 				   struct vfio_device_feature_mig_state *mig)
816 {
817 	int ret;
818 	int fd;
819 
820 	fd = get_unused_fd_flags(O_CLOEXEC);
821 	if (fd < 0) {
822 		ret = fd;
823 		goto out_fput;
824 	}
825 
826 	mig->data_fd = fd;
827 	if (copy_to_user(arg, mig, sizeof(*mig))) {
828 		ret = -EFAULT;
829 		goto out_put_unused;
830 	}
831 	fd_install(fd, filp);
832 	return 0;
833 
834 out_put_unused:
835 	put_unused_fd(fd);
836 out_fput:
837 	fput(filp);
838 	return ret;
839 }
840 
841 static int
842 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
843 					   u32 flags, void __user *arg,
844 					   size_t argsz)
845 {
846 	size_t minsz =
847 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
848 	struct vfio_device_feature_mig_state mig;
849 	struct file *filp = NULL;
850 	int ret;
851 
852 	if (!device->mig_ops)
853 		return -ENOTTY;
854 
855 	ret = vfio_check_feature(flags, argsz,
856 				 VFIO_DEVICE_FEATURE_SET |
857 				 VFIO_DEVICE_FEATURE_GET,
858 				 sizeof(mig));
859 	if (ret != 1)
860 		return ret;
861 
862 	if (copy_from_user(&mig, arg, minsz))
863 		return -EFAULT;
864 
865 	if (flags & VFIO_DEVICE_FEATURE_GET) {
866 		enum vfio_device_mig_state curr_state;
867 
868 		ret = device->mig_ops->migration_get_state(device,
869 							   &curr_state);
870 		if (ret)
871 			return ret;
872 		mig.device_state = curr_state;
873 		goto out_copy;
874 	}
875 
876 	/* Handle the VFIO_DEVICE_FEATURE_SET */
877 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
878 	if (IS_ERR(filp) || !filp)
879 		goto out_copy;
880 
881 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
882 out_copy:
883 	mig.data_fd = -1;
884 	if (copy_to_user(arg, &mig, sizeof(mig)))
885 		return -EFAULT;
886 	if (IS_ERR(filp))
887 		return PTR_ERR(filp);
888 	return 0;
889 }
890 
891 static int
892 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
893 					      u32 flags, void __user *arg,
894 					      size_t argsz)
895 {
896 	struct vfio_device_feature_mig_data_size data_size = {};
897 	unsigned long stop_copy_length;
898 	int ret;
899 
900 	if (!device->mig_ops)
901 		return -ENOTTY;
902 
903 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
904 				 sizeof(data_size));
905 	if (ret != 1)
906 		return ret;
907 
908 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
909 	if (ret)
910 		return ret;
911 
912 	data_size.stop_copy_length = stop_copy_length;
913 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
914 		return -EFAULT;
915 
916 	return 0;
917 }
918 
919 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
920 					       u32 flags, void __user *arg,
921 					       size_t argsz)
922 {
923 	struct vfio_device_feature_migration mig = {
924 		.flags = device->migration_flags,
925 	};
926 	int ret;
927 
928 	if (!device->mig_ops)
929 		return -ENOTTY;
930 
931 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
932 				 sizeof(mig));
933 	if (ret != 1)
934 		return ret;
935 	if (copy_to_user(arg, &mig, sizeof(mig)))
936 		return -EFAULT;
937 	return 0;
938 }
939 
940 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
941 			      u32 req_nodes)
942 {
943 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
944 	unsigned long min_gap, curr_gap;
945 
946 	/* Special shortcut when a single range is required */
947 	if (req_nodes == 1) {
948 		unsigned long last;
949 
950 		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
951 
952 		/* Empty list */
953 		if (WARN_ON_ONCE(!comb_start))
954 			return;
955 
956 		curr = comb_start;
957 		while (curr) {
958 			last = curr->last;
959 			prev = curr;
960 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
961 			if (prev != comb_start)
962 				interval_tree_remove(prev, root);
963 		}
964 		comb_start->last = last;
965 		return;
966 	}
967 
968 	/* Combine ranges which have the smallest gap */
969 	while (cur_nodes > req_nodes) {
970 		prev = NULL;
971 		min_gap = ULONG_MAX;
972 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
973 		while (curr) {
974 			if (prev) {
975 				curr_gap = curr->start - prev->last;
976 				if (curr_gap < min_gap) {
977 					min_gap = curr_gap;
978 					comb_start = prev;
979 					comb_end = curr;
980 				}
981 			}
982 			prev = curr;
983 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
984 		}
985 
986 		/* Empty list or no nodes to combine */
987 		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
988 			break;
989 
990 		comb_start->last = comb_end->last;
991 		interval_tree_remove(comb_end, root);
992 		cur_nodes--;
993 	}
994 }
995 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
996 
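/*
 * Illustrative sketch (hypothetical): a dirty-tracking driver whose
 * hardware supports only a limited number of range registers can shrink
 * the interval tree built by the core before programming it, e.g. in its
 * log_start() op:
 *
 *	if (nnodes > my_hw_max_ranges)
 *		vfio_combine_iova_ranges(ranges, nnodes, my_hw_max_ranges);
 */
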
997 /* Ranges should fit into a single kernel page */
998 #define LOG_MAX_RANGES \
999 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1000 
1001 static int
1002 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1003 					u32 flags, void __user *arg,
1004 					size_t argsz)
1005 {
1006 	size_t minsz =
1007 		offsetofend(struct vfio_device_feature_dma_logging_control,
1008 			    ranges);
1009 	struct vfio_device_feature_dma_logging_range __user *ranges;
1010 	struct vfio_device_feature_dma_logging_control control;
1011 	struct vfio_device_feature_dma_logging_range range;
1012 	struct rb_root_cached root = RB_ROOT_CACHED;
1013 	struct interval_tree_node *nodes;
1014 	u64 iova_end;
1015 	u32 nnodes;
1016 	int i, ret;
1017 
1018 	if (!device->log_ops)
1019 		return -ENOTTY;
1020 
1021 	ret = vfio_check_feature(flags, argsz,
1022 				 VFIO_DEVICE_FEATURE_SET,
1023 				 sizeof(control));
1024 	if (ret != 1)
1025 		return ret;
1026 
1027 	if (copy_from_user(&control, arg, minsz))
1028 		return -EFAULT;
1029 
1030 	nnodes = control.num_ranges;
1031 	if (!nnodes)
1032 		return -EINVAL;
1033 
1034 	if (nnodes > LOG_MAX_RANGES)
1035 		return -E2BIG;
1036 
1037 	ranges = u64_to_user_ptr(control.ranges);
1038 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1039 			      GFP_KERNEL);
1040 	if (!nodes)
1041 		return -ENOMEM;
1042 
1043 	for (i = 0; i < nnodes; i++) {
1044 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1045 			ret = -EFAULT;
1046 			goto end;
1047 		}
1048 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1049 		    !IS_ALIGNED(range.length, control.page_size)) {
1050 			ret = -EINVAL;
1051 			goto end;
1052 		}
1053 
1054 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1055 		    iova_end > ULONG_MAX) {
1056 			ret = -EOVERFLOW;
1057 			goto end;
1058 		}
1059 
1060 		nodes[i].start = range.iova;
1061 		nodes[i].last = range.iova + range.length - 1;
1062 		if (interval_tree_iter_first(&root, nodes[i].start,
1063 					     nodes[i].last)) {
1064 			/* Range overlapping */
1065 			ret = -EINVAL;
1066 			goto end;
1067 		}
1068 		interval_tree_insert(nodes + i, &root);
1069 	}
1070 
1071 	ret = device->log_ops->log_start(device, &root, nnodes,
1072 					 &control.page_size);
1073 	if (ret)
1074 		goto end;
1075 
1076 	if (copy_to_user(arg, &control, sizeof(control))) {
1077 		ret = -EFAULT;
1078 		device->log_ops->log_stop(device);
1079 	}
1080 
1081 end:
1082 	kfree(nodes);
1083 	return ret;
1084 }
1085 
1086 static int
1087 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1088 				       u32 flags, void __user *arg,
1089 				       size_t argsz)
1090 {
1091 	int ret;
1092 
1093 	if (!device->log_ops)
1094 		return -ENOTTY;
1095 
1096 	ret = vfio_check_feature(flags, argsz,
1097 				 VFIO_DEVICE_FEATURE_SET, 0);
1098 	if (ret != 1)
1099 		return ret;
1100 
1101 	return device->log_ops->log_stop(device);
1102 }
1103 
1104 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1105 					  unsigned long iova, size_t length,
1106 					  void *opaque)
1107 {
1108 	struct vfio_device *device = opaque;
1109 
1110 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1111 }
1112 
1113 static int
1114 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1115 					 u32 flags, void __user *arg,
1116 					 size_t argsz)
1117 {
1118 	size_t minsz =
1119 		offsetofend(struct vfio_device_feature_dma_logging_report,
1120 			    bitmap);
1121 	struct vfio_device_feature_dma_logging_report report;
1122 	struct iova_bitmap *iter;
1123 	u64 iova_end;
1124 	int ret;
1125 
1126 	if (!device->log_ops)
1127 		return -ENOTTY;
1128 
1129 	ret = vfio_check_feature(flags, argsz,
1130 				 VFIO_DEVICE_FEATURE_GET,
1131 				 sizeof(report));
1132 	if (ret != 1)
1133 		return ret;
1134 
1135 	if (copy_from_user(&report, arg, minsz))
1136 		return -EFAULT;
1137 
1138 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1139 		return -EINVAL;
1140 
1141 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1142 	    iova_end > ULONG_MAX)
1143 		return -EOVERFLOW;
1144 
1145 	iter = iova_bitmap_alloc(report.iova, report.length,
1146 				 report.page_size,
1147 				 u64_to_user_ptr(report.bitmap));
1148 	if (IS_ERR(iter))
1149 		return PTR_ERR(iter);
1150 
1151 	ret = iova_bitmap_for_each(iter, device,
1152 				   vfio_device_log_read_and_clear);
1153 
1154 	iova_bitmap_free(iter);
1155 	return ret;
1156 }
1157 
1158 static int vfio_ioctl_device_feature(struct vfio_device *device,
1159 				     struct vfio_device_feature __user *arg)
1160 {
1161 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1162 	struct vfio_device_feature feature;
1163 
1164 	if (copy_from_user(&feature, arg, minsz))
1165 		return -EFAULT;
1166 
1167 	if (feature.argsz < minsz)
1168 		return -EINVAL;
1169 
1170 	/* Check unknown flags */
1171 	if (feature.flags &
1172 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1173 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1174 		return -EINVAL;
1175 
1176 	/* GET & SET are mutually exclusive except with PROBE */
1177 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1178 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1179 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1180 		return -EINVAL;
1181 
1182 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1183 	case VFIO_DEVICE_FEATURE_MIGRATION:
1184 		return vfio_ioctl_device_feature_migration(
1185 			device, feature.flags, arg->data,
1186 			feature.argsz - minsz);
1187 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1188 		return vfio_ioctl_device_feature_mig_device_state(
1189 			device, feature.flags, arg->data,
1190 			feature.argsz - minsz);
1191 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1192 		return vfio_ioctl_device_feature_logging_start(
1193 			device, feature.flags, arg->data,
1194 			feature.argsz - minsz);
1195 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1196 		return vfio_ioctl_device_feature_logging_stop(
1197 			device, feature.flags, arg->data,
1198 			feature.argsz - minsz);
1199 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1200 		return vfio_ioctl_device_feature_logging_report(
1201 			device, feature.flags, arg->data,
1202 			feature.argsz - minsz);
1203 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1204 		return vfio_ioctl_device_feature_migration_data_size(
1205 			device, feature.flags, arg->data,
1206 			feature.argsz - minsz);
1207 	default:
1208 		if (unlikely(!device->ops->device_feature))
1209 			return -EINVAL;
1210 		return device->ops->device_feature(device, feature.flags,
1211 						   arg->data,
1212 						   feature.argsz - minsz);
1213 	}
1214 }
1215 
1216 static long vfio_device_fops_unl_ioctl(struct file *filep,
1217 				       unsigned int cmd, unsigned long arg)
1218 {
1219 	struct vfio_device_file *df = filep->private_data;
1220 	struct vfio_device *device = df->device;
1221 	void __user *uptr = (void __user *)arg;
1222 	int ret;
1223 
1224 	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1225 		return vfio_df_ioctl_bind_iommufd(df, uptr);
1226 
1227 	/* Paired with smp_store_release() following vfio_df_open() */
1228 	if (!smp_load_acquire(&df->access_granted))
1229 		return -EINVAL;
1230 
1231 	ret = vfio_device_pm_runtime_get(device);
1232 	if (ret)
1233 		return ret;
1234 
1235 	/* cdev only ioctls */
1236 	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1237 		switch (cmd) {
1238 		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1239 			ret = vfio_df_ioctl_attach_pt(df, uptr);
1240 			goto out;
1241 
1242 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1243 			ret = vfio_df_ioctl_detach_pt(df, uptr);
1244 			goto out;
1245 		}
1246 	}
1247 
1248 	switch (cmd) {
1249 	case VFIO_DEVICE_FEATURE:
1250 		ret = vfio_ioctl_device_feature(device, uptr);
1251 		break;
1252 
1253 	default:
1254 		if (unlikely(!device->ops->ioctl))
1255 			ret = -EINVAL;
1256 		else
1257 			ret = device->ops->ioctl(device, cmd, arg);
1258 		break;
1259 	}
1260 out:
1261 	vfio_device_pm_runtime_put(device);
1262 	return ret;
1263 }
1264 
1265 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1266 				     size_t count, loff_t *ppos)
1267 {
1268 	struct vfio_device_file *df = filep->private_data;
1269 	struct vfio_device *device = df->device;
1270 
1271 	/* Paired with smp_store_release() following vfio_df_open() */
1272 	if (!smp_load_acquire(&df->access_granted))
1273 		return -EINVAL;
1274 
1275 	if (unlikely(!device->ops->read))
1276 		return -EINVAL;
1277 
1278 	return device->ops->read(device, buf, count, ppos);
1279 }
1280 
1281 static ssize_t vfio_device_fops_write(struct file *filep,
1282 				      const char __user *buf,
1283 				      size_t count, loff_t *ppos)
1284 {
1285 	struct vfio_device_file *df = filep->private_data;
1286 	struct vfio_device *device = df->device;
1287 
1288 	/* Paired with smp_store_release() following vfio_df_open() */
1289 	if (!smp_load_acquire(&df->access_granted))
1290 		return -EINVAL;
1291 
1292 	if (unlikely(!device->ops->write))
1293 		return -EINVAL;
1294 
1295 	return device->ops->write(device, buf, count, ppos);
1296 }
1297 
1298 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1299 {
1300 	struct vfio_device_file *df = filep->private_data;
1301 	struct vfio_device *device = df->device;
1302 
1303 	/* Paired with smp_store_release() following vfio_df_open() */
1304 	if (!smp_load_acquire(&df->access_granted))
1305 		return -EINVAL;
1306 
1307 	if (unlikely(!device->ops->mmap))
1308 		return -EINVAL;
1309 
1310 	return device->ops->mmap(device, vma);
1311 }
1312 
1313 const struct file_operations vfio_device_fops = {
1314 	.owner		= THIS_MODULE,
1315 	.open		= vfio_device_fops_cdev_open,
1316 	.release	= vfio_device_fops_release,
1317 	.read		= vfio_device_fops_read,
1318 	.write		= vfio_device_fops_write,
1319 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1320 	.compat_ioctl	= compat_ptr_ioctl,
1321 	.mmap		= vfio_device_fops_mmap,
1322 };
1323 
1324 static struct vfio_device *vfio_device_from_file(struct file *file)
1325 {
1326 	struct vfio_device_file *df = file->private_data;
1327 
1328 	if (file->f_op != &vfio_device_fops)
1329 		return NULL;
1330 	return df->device;
1331 }
1332 
1333 /**
1334  * vfio_file_is_valid - True if the file is valid vfio file
1335  * @file: VFIO group file or VFIO device file
1336  */
1337 bool vfio_file_is_valid(struct file *file)
1338 {
1339 	return vfio_group_from_file(file) ||
1340 	       vfio_device_from_file(file);
1341 }
1342 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1343 
1344 /**
1345  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1346  *        is always CPU cache coherent
1347  * @file: VFIO group file or VFIO device file
1348  *
1349  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1350  * bit in DMA transactions. A return of false indicates that the user has
1351  * rights to access additional instructions such as wbinvd on x86.
1352  */
1353 bool vfio_file_enforced_coherent(struct file *file)
1354 {
1355 	struct vfio_device *device;
1356 	struct vfio_group *group;
1357 
1358 	group = vfio_group_from_file(file);
1359 	if (group)
1360 		return vfio_group_enforced_coherent(group);
1361 
1362 	device = vfio_device_from_file(file);
1363 	if (device)
1364 		return device_iommu_capable(device->dev,
1365 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1366 
1367 	return true;
1368 }
1369 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1370 
1371 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1372 {
1373 	struct vfio_device_file *df = file->private_data;
1374 
1375 	/*
1376 	 * The kvm is first recorded in the vfio_device_file, and will
1377 	 * be propagated to vfio_device::kvm when the file is bound to
1378 	 * iommufd successfully in the vfio device cdev path.
1379 	 */
1380 	spin_lock(&df->kvm_ref_lock);
1381 	df->kvm = kvm;
1382 	spin_unlock(&df->kvm_ref_lock);
1383 }
1384 
1385 /**
1386  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1387  * @file: VFIO group file or VFIO device file
1388  * @kvm: KVM to link
1389  *
1390  * When a VFIO device is first opened the KVM will be available in
1391  * device->kvm if one was associated with the file.
1392  */
1393 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1394 {
1395 	struct vfio_group *group;
1396 
1397 	group = vfio_group_from_file(file);
1398 	if (group)
1399 		vfio_group_set_kvm(group, kvm);
1400 
1401 	if (vfio_device_from_file(file))
1402 		vfio_device_file_set_kvm(file, kvm);
1403 }
1404 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1405 
1406 /*
1407  * Sub-module support
1408  */
1409 /*
1410  * Helper for managing a buffer of info chain capabilities, allocate or
1411  * reallocate a buffer with additional @size, filling in @id and @version
1412  * of the capability.  A pointer to the new capability is returned.
1413  *
1414  * NB. The chain is based at the head of the buffer, so new entries are
1415  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1416  * next offsets prior to copying to the user buffer.
1417  */
1418 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1419 					       size_t size, u16 id, u16 version)
1420 {
1421 	void *buf;
1422 	struct vfio_info_cap_header *header, *tmp;
1423 
1424 	/* Ensure that the next capability struct will be aligned */
1425 	size = ALIGN(size, sizeof(u64));
1426 
1427 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1428 	if (!buf) {
1429 		kfree(caps->buf);
1430 		caps->buf = NULL;
1431 		caps->size = 0;
1432 		return ERR_PTR(-ENOMEM);
1433 	}
1434 
1435 	caps->buf = buf;
1436 	header = buf + caps->size;
1437 
1438 	/* Eventually copied to user buffer, zero */
1439 	memset(header, 0, size);
1440 
1441 	header->id = id;
1442 	header->version = version;
1443 
1444 	/* Add to the end of the capability chain */
1445 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1446 		; /* nothing */
1447 
1448 	tmp->next = caps->size;
1449 	caps->size += size;
1450 
1451 	return header;
1452 }
1453 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1454 
1455 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1456 {
1457 	struct vfio_info_cap_header *tmp;
1458 	void *buf = (void *)caps->buf;
1459 
1460 	/* Capability structs should start with proper alignment */
1461 	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1462 
1463 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1464 		tmp->next += offset;
1465 }
1466 EXPORT_SYMBOL(vfio_info_cap_shift);
1467 
1468 int vfio_info_add_capability(struct vfio_info_cap *caps,
1469 			     struct vfio_info_cap_header *cap, size_t size)
1470 {
1471 	struct vfio_info_cap_header *header;
1472 
1473 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1474 	if (IS_ERR(header))
1475 		return PTR_ERR(header);
1476 
1477 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1478 
1479 	return 0;
1480 }
1481 EXPORT_SYMBOL(vfio_info_add_capability);
1482 
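/*
 * Illustrative sketch (hypothetical, mirrors how in-tree drivers build
 * *_GET_INFO replies; variable names are the caller's own): capabilities
 * are accumulated in @caps, then the chain offsets are shifted to their
 * final location in the user buffer before the copy-out:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *			ret = -EFAULT;
 *	}
 *	kfree(caps.buf);
 */
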
1483 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1484 				       int max_irq_type, size_t *data_size)
1485 {
1486 	unsigned long minsz;
1487 	size_t size;
1488 
1489 	minsz = offsetofend(struct vfio_irq_set, count);
1490 
1491 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1492 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1493 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1494 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1495 		return -EINVAL;
1496 
1497 	if (data_size)
1498 		*data_size = 0;
1499 
1500 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1501 		return -EINVAL;
1502 
1503 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1504 	case VFIO_IRQ_SET_DATA_NONE:
1505 		size = 0;
1506 		break;
1507 	case VFIO_IRQ_SET_DATA_BOOL:
1508 		size = sizeof(uint8_t);
1509 		break;
1510 	case VFIO_IRQ_SET_DATA_EVENTFD:
1511 		size = sizeof(int32_t);
1512 		break;
1513 	default:
1514 		return -EINVAL;
1515 	}
1516 
1517 	if (size) {
1518 		if (hdr->argsz - minsz < hdr->count * size)
1519 			return -EINVAL;
1520 
1521 		if (!data_size)
1522 			return -EINVAL;
1523 
1524 		*data_size = hdr->count * size;
1525 	}
1526 
1527 	return 0;
1528 }
1529 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1530 
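/*
 * Illustrative sketch (hypothetical, modeled on vfio-pci): the caller
 * validates the header against its own IRQ index limits, then copies in
 * the variable-length payload of the size this helper computed.  @uhdr
 * below is the hypothetical user pointer to the struct vfio_irq_set:
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs_for_index,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user(&uhdr->data, data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */
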
1531 /*
1532  * Pin contiguous user pages and return their associated host pages for local
1533  * domain only.
1534  * @device [in]  : device
1535  * @iova [in]    : starting IOVA of user pages to be pinned.
1536  * @npage [in]   : count of pages to be pinned.  This count should not
1537  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1538  * @prot [in]    : protection flags
1539  * @pages[out]   : array of host pages
1540  * Return error or number of pages pinned.
1541  *
1542  * A driver may only call this function if the vfio_device was created
1543  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1544  */
1545 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1546 		   int npage, int prot, struct page **pages)
1547 {
1548 	/* group->container cannot change while a vfio device is open */
1549 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1550 		return -EINVAL;
1551 	if (!device->ops->dma_unmap)
1552 		return -EINVAL;
1553 	if (vfio_device_has_container(device))
1554 		return vfio_device_container_pin_pages(device, iova,
1555 						       npage, prot, pages);
1556 	if (device->iommufd_access) {
1557 		int ret;
1558 
1559 		if (iova > ULONG_MAX)
1560 			return -EINVAL;
1561 		/*
1562 		 * VFIO ignores the sub page offset; npages is from the start of
1563 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1564 		 * the sub page offset by doing:
1565 		 *     pages[0] + (iova % PAGE_SIZE)
1566 		 */
1567 		ret = iommufd_access_pin_pages(
1568 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1569 			npage * PAGE_SIZE, pages,
1570 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1571 		if (ret)
1572 			return ret;
1573 		return npage;
1574 	}
1575 	return -EINVAL;
1576 }
1577 EXPORT_SYMBOL(vfio_pin_pages);
1578 
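/*
 * Illustrative sketch (hypothetical mdev-style driver code): pin one page
 * and recover the sub-page offset as described in the comment above, then
 * unpin when done:
 *
 *	struct page *pg;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EINVAL;
 *
 *	va = kmap_local_page(pg);
 *	...					// data at va + (iova & ~PAGE_MASK)
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */
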
1579 /*
1580  * Unpin contiguous host pages for local domain only.
1581  * @device [in]  : device
1582  * @iova [in]    : starting address of user pages to be unpinned.
1583  * @npage [in]   : count of pages to be unpinned.  This count should not
1584  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1585  */
1586 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1587 {
1588 	if (WARN_ON(!vfio_assert_device_open(device)))
1589 		return;
1590 	if (WARN_ON(!device->ops->dma_unmap))
1591 		return;
1592 
1593 	if (vfio_device_has_container(device)) {
1594 		vfio_device_container_unpin_pages(device, iova, npage);
1595 		return;
1596 	}
1597 	if (device->iommufd_access) {
1598 		if (WARN_ON(iova > ULONG_MAX))
1599 			return;
1600 		iommufd_access_unpin_pages(device->iommufd_access,
1601 					   ALIGN_DOWN(iova, PAGE_SIZE),
1602 					   npage * PAGE_SIZE);
1603 		return;
1604 	}
1605 }
1606 EXPORT_SYMBOL(vfio_unpin_pages);
1607 
1608 /*
1609  * This interface allows the CPUs to perform some sort of virtual DMA on
1610  * behalf of the device.
1611  *
1612  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1613  * into/from a kernel buffer.
1614  *
1615  * As the read/write of user space memory is conducted via the CPUs and is
1616  * not a real device DMA, it is not necessary to pin the user space memory.
1617  *
1618  * @device [in]		: VFIO device
1619  * @iova [in]		: base IOVA of a user space buffer
1620  * @data [in]		: pointer to kernel buffer
1621  * @len [in]		: kernel buffer length
1622  * @write		: indicate read or write
1623  * Return error code on failure or 0 on success.
1624  */
1625 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1626 		size_t len, bool write)
1627 {
1628 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1629 		return -EINVAL;
1630 
1631 	if (vfio_device_has_container(device))
1632 		return vfio_device_container_dma_rw(device, iova,
1633 						    data, len, write);
1634 
1635 	if (device->iommufd_access) {
1636 		unsigned int flags = 0;
1637 
1638 		if (iova > ULONG_MAX)
1639 			return -EINVAL;
1640 
1641 		/* VFIO historically tries to auto-detect a kthread */
1642 		if (!current->mm)
1643 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1644 		if (write)
1645 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1646 		return iommufd_access_rw(device->iommufd_access, iova, data,
1647 					 len, flags);
1648 	}
1649 	return -EINVAL;
1650 }
1651 EXPORT_SYMBOL(vfio_dma_rw);
1652 
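/*
 * Illustrative sketch (hypothetical mdev-style driver code): fetch a
 * descriptor that the guest placed at @desc_iova into a kernel buffer,
 * i.e. a CPU-side "DMA read" on behalf of the device:
 *
 *	struct my_desc desc;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */
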
1653 /*
1654  * Module/class support
1655  */
1656 static int __init vfio_init(void)
1657 {
1658 	int ret;
1659 
1660 	ida_init(&vfio.device_ida);
1661 
1662 	ret = vfio_group_init();
1663 	if (ret)
1664 		return ret;
1665 
1666 	ret = vfio_virqfd_init();
1667 	if (ret)
1668 		goto err_virqfd;
1669 
1670 	/* /sys/class/vfio-dev/vfioX */
1671 	vfio.device_class = class_create("vfio-dev");
1672 	if (IS_ERR(vfio.device_class)) {
1673 		ret = PTR_ERR(vfio.device_class);
1674 		goto err_dev_class;
1675 	}
1676 
1677 	ret = vfio_cdev_init(vfio.device_class);
1678 	if (ret)
1679 		goto err_alloc_dev_chrdev;
1680 
1681 	vfio_debugfs_create_root();
1682 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1683 	return 0;
1684 
1685 err_alloc_dev_chrdev:
1686 	class_destroy(vfio.device_class);
1687 	vfio.device_class = NULL;
1688 err_dev_class:
1689 	vfio_virqfd_exit();
1690 err_virqfd:
1691 	vfio_group_cleanup();
1692 	return ret;
1693 }
1694 
1695 static void __exit vfio_cleanup(void)
1696 {
1697 	vfio_debugfs_remove_root();
1698 	ida_destroy(&vfio.device_ida);
1699 	vfio_cdev_cleanup();
1700 	class_destroy(vfio.device_class);
1701 	vfio.device_class = NULL;
1702 	vfio_virqfd_exit();
1703 	vfio_group_cleanup();
1704 	xa_destroy(&vfio_device_set_xa);
1705 }
1706 
1707 module_init(vfio_init);
1708 module_exit(vfio_cleanup);
1709 
1710 MODULE_IMPORT_NS(IOMMUFD);
1711 MODULE_VERSION(DRIVER_VERSION);
1712 MODULE_LICENSE("GPL v2");
1713 MODULE_AUTHOR(DRIVER_AUTHOR);
1714 MODULE_DESCRIPTION(DRIVER_DESC);
1715 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1716