1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
14 */
15
16 #include <kvm/iodev.h>
17
18 #include <linux/kvm_host.h>
19 #include <linux/kvm.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/percpu.h>
23 #include <linux/mm.h>
24 #include <linux/miscdevice.h>
25 #include <linux/vmalloc.h>
26 #include <linux/reboot.h>
27 #include <linux/debugfs.h>
28 #include <linux/highmem.h>
29 #include <linux/file.h>
30 #include <linux/syscore_ops.h>
31 #include <linux/cpu.h>
32 #include <linux/sched/signal.h>
33 #include <linux/sched/mm.h>
34 #include <linux/sched/stat.h>
35 #include <linux/cpumask.h>
36 #include <linux/smp.h>
37 #include <linux/anon_inodes.h>
38 #include <linux/profile.h>
39 #include <linux/kvm_para.h>
40 #include <linux/pagemap.h>
41 #include <linux/mman.h>
42 #include <linux/swap.h>
43 #include <linux/bitops.h>
44 #include <linux/spinlock.h>
45 #include <linux/compat.h>
46 #include <linux/srcu.h>
47 #include <linux/hugetlb.h>
48 #include <linux/slab.h>
49 #include <linux/sort.h>
50 #include <linux/bsearch.h>
51 #include <linux/io.h>
52 #include <linux/lockdep.h>
53 #include <linux/kthread.h>
54 #include <linux/suspend.h>
55
56 #include <asm/processor.h>
57 #include <asm/ioctl.h>
58 #include <linux/uaccess.h>
59
60 #include "coalesced_mmio.h"
61 #include "async_pf.h"
62 #include "kvm_mm.h"
63 #include "vfio.h"
64
65 #include <trace/events/ipi.h>
66
67 #define CREATE_TRACE_POINTS
68 #include <trace/events/kvm.h>
69
70 #include <linux/kvm_dirty_ring.h>
71
72
73 /* Worst case buffer size needed for holding an integer. */
74 #define ITOA_MAX_LEN 12
75
76 MODULE_AUTHOR("Qumranet");
77 MODULE_LICENSE("GPL");
78
79 /* Architectures should define their poll value according to the halt latency */
80 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
81 module_param(halt_poll_ns, uint, 0644);
82 EXPORT_SYMBOL_GPL(halt_poll_ns);
83
84 /* Default doubles per-vcpu halt_poll_ns. */
85 unsigned int halt_poll_ns_grow = 2;
86 module_param(halt_poll_ns_grow, uint, 0644);
87 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
88
89 /* The start value to grow halt_poll_ns from */
90 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
91 module_param(halt_poll_ns_grow_start, uint, 0644);
92 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
93
94 /* The default (0) resets per-vcpu halt_poll_ns instead of shrinking it. */
95 unsigned int halt_poll_ns_shrink;
96 module_param(halt_poll_ns_shrink, uint, 0644);
97 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
98
99 /*
100 * Ordering of locks:
101 *
102 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
103 */
104
105 DEFINE_MUTEX(kvm_lock);
106 LIST_HEAD(vm_list);
107
108 static struct kmem_cache *kvm_vcpu_cache;
109
110 static __read_mostly struct preempt_ops kvm_preempt_ops;
111 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
112
113 struct dentry *kvm_debugfs_dir;
114 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
115
116 static const struct file_operations stat_fops_per_vm;
117
118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
119 unsigned long arg);
120 #ifdef CONFIG_KVM_COMPAT
121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
122 unsigned long arg);
123 #define KVM_COMPAT(c) .compat_ioctl = (c)
124 #else
125 /*
126 * For architectures that don't implement a compat infrastructure,
127 * adopt a double line of defense:
128 * - Prevent a compat task from opening /dev/kvm
129 * - If the open has been done by a 64-bit task, and the KVM fd
130 * passed to a compat task, let the ioctls fail.
131 */
132 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
133 unsigned long arg) { return -EINVAL; }
134
135 static int kvm_no_compat_open(struct inode *inode, struct file *file)
136 {
137 return is_compat_task() ? -ENODEV : 0;
138 }
139 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
140 .open = kvm_no_compat_open
141 #endif
142 static int hardware_enable_all(void);
143 static void hardware_disable_all(void);
144
145 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
146
147 #define KVM_EVENT_CREATE_VM 0
148 #define KVM_EVENT_DESTROY_VM 1
149 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
150 static unsigned long long kvm_createvm_count;
151 static unsigned long long kvm_active_vms;
152
153 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
154
155 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
156 {
157 }
158
159 bool kvm_is_zone_device_page(struct page *page)
160 {
161 /*
162 * The metadata used by is_zone_device_page() to determine whether or
163 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
164 * the device has been pinned, e.g. by get_user_pages(). WARN if the
165 * page_count() is zero to help detect bad usage of this helper.
166 */
167 if (WARN_ON_ONCE(!page_count(page)))
168 return false;
169
170 return is_zone_device_page(page);
171 }
172
173 /*
174 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
175 * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
176 * is likely incomplete; it has been compiled purely through people wanting to
177 * back guests with a certain type of memory and encountering issues.
178 */
179 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
180 {
181 struct page *page;
182
183 if (!pfn_valid(pfn))
184 return NULL;
185
186 page = pfn_to_page(pfn);
187 if (!PageReserved(page))
188 return page;
189
190 /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
191 if (is_zero_pfn(pfn))
192 return page;
193
194 /*
195 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
196 * perspective they are "normal" pages, albeit with slightly different
197 * usage rules.
198 */
199 if (kvm_is_zone_device_page(page))
200 return page;
201
202 return NULL;
203 }
204
205 /*
206 * Switches to the specified vcpu, until a matching vcpu_put().
207 */
208 void vcpu_load(struct kvm_vcpu *vcpu)
209 {
210 int cpu = get_cpu();
211
212 __this_cpu_write(kvm_running_vcpu, vcpu);
213 preempt_notifier_register(&vcpu->preempt_notifier);
214 kvm_arch_vcpu_load(vcpu, cpu);
215 put_cpu();
216 }
217 EXPORT_SYMBOL_GPL(vcpu_load);
218
219 void vcpu_put(struct kvm_vcpu *vcpu)
220 {
221 preempt_disable();
222 kvm_arch_vcpu_put(vcpu);
223 preempt_notifier_unregister(&vcpu->preempt_notifier);
224 __this_cpu_write(kvm_running_vcpu, NULL);
225 preempt_enable();
226 }
227 EXPORT_SYMBOL_GPL(vcpu_put);
228
229 /* TODO: merge with kvm_arch_vcpu_should_kick */
230 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
231 {
232 int mode = kvm_vcpu_exiting_guest_mode(vcpu);
233
234 /*
235 * We need to wait for the VCPU to reenable interrupts and get out of
236 * READING_SHADOW_PAGE_TABLES mode.
237 */
238 if (req & KVM_REQUEST_WAIT)
239 return mode != OUTSIDE_GUEST_MODE;
240
241 /*
242 * Need to kick a running VCPU, but otherwise there is nothing to do.
243 */
244 return mode == IN_GUEST_MODE;
245 }
246
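/*
 * Empty IPI handler: the kick itself is the point.  Interrupting the target
 * CPU forces it out of guest mode (or out of its lockless SPTE walk); the
 * ack/wait semantics are provided by smp_call_function_many() below.
 */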
247 static void ack_kick(void *_completed)
248 {
249 }
250
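/*
 * Send the (empty) kick IPI to every CPU in @cpus.  If @wait is true, i.e. the
 * request has KVM_REQUEST_WAIT set, block until all targeted CPUs have acked
 * the IPI.  Returns true if at least one CPU was actually kicked.
 */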
251 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
252 {
253 if (cpumask_empty(cpus))
254 return false;
255
256 smp_call_function_many(cpus, ack_kick, NULL, wait);
257 return true;
258 }
259
260 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
261 struct cpumask *tmp, int current_cpu)
262 {
263 int cpu;
264
265 if (likely(!(req & KVM_REQUEST_NO_ACTION)))
266 __kvm_make_request(req, vcpu);
267
268 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
269 return;
270
271 /*
272 * Note, the vCPU could get migrated to a different pCPU at any point
273 * after kvm_request_needs_ipi(), which could result in sending an IPI
274 * to the previous pCPU. But, that's OK because the purpose of the IPI
275 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
276 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
277 * after this point is also OK, as the requirement is only that KVM wait
278 * for vCPUs that were reading SPTEs _before_ any changes were
279 * finalized. See kvm_vcpu_kick() for more details on handling requests.
280 */
281 if (kvm_request_needs_ipi(vcpu, req)) {
282 cpu = READ_ONCE(vcpu->cpu);
283 if (cpu != -1 && cpu != current_cpu)
284 __cpumask_set_cpu(cpu, tmp);
285 }
286 }
287
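/*
 * Make a request on every vCPU whose index is set in @vcpu_bitmap and kick
 * (send an IPI to) any such vCPU that is currently in guest mode.  Returns
 * true if at least one IPI was sent.
 */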
288 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
289 unsigned long *vcpu_bitmap)
290 {
291 struct kvm_vcpu *vcpu;
292 struct cpumask *cpus;
293 int i, me;
294 bool called;
295
296 me = get_cpu();
297
298 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
299 cpumask_clear(cpus);
300
301 for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
302 vcpu = kvm_get_vcpu(kvm, i);
303 if (!vcpu)
304 continue;
305 kvm_make_vcpu_request(vcpu, req, cpus, me);
306 }
307
308 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
309 put_cpu();
310
311 return called;
312 }
313
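/*
 * Like kvm_make_vcpus_request_mask(), but targets every vCPU in the VM, e.g.
 * kvm_flush_remote_tlbs() below uses this with KVM_REQ_TLB_FLUSH to force a
 * TLB flush on all vCPUs.
 */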
314 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
315 {
316 struct kvm_vcpu *vcpu;
317 struct cpumask *cpus;
318 unsigned long i;
319 bool called;
320 int me;
321
322 me = get_cpu();
323
324 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
325 cpumask_clear(cpus);
326
327 kvm_for_each_vcpu(i, vcpu, kvm)
328 kvm_make_vcpu_request(vcpu, req, cpus, me);
329
330 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
331 put_cpu();
332
333 return called;
334 }
335 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
336
337 void kvm_flush_remote_tlbs(struct kvm *kvm)
338 {
339 ++kvm->stat.generic.remote_tlb_flush_requests;
340
341 /*
342 * We want to publish modifications to the page tables before reading
343 * mode. Pairs with a memory barrier in arch-specific code.
344 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
345 * and smp_mb in walk_shadow_page_lockless_begin/end.
346 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
347 *
348 * There is already an smp_mb__after_atomic() before
349 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
350 * barrier here.
351 */
352 if (!kvm_arch_flush_remote_tlbs(kvm)
353 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
354 ++kvm->stat.generic.remote_tlb_flush;
355 }
356 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
357
358 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
359 {
360 if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
361 return;
362
363 /*
364 * Fall back to flushing the entire TLB if the architecture's range-based
365 * TLB invalidation is unsupported or can't be performed for whatever
366 * reason.
367 */
368 kvm_flush_remote_tlbs(kvm);
369 }
370
371 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
372 const struct kvm_memory_slot *memslot)
373 {
374 /*
375 * All current use cases for flushing the TLBs for a specific memslot
376 * are related to dirty logging, and many do the TLB flush out of
377 * mmu_lock. The interactions between the various operations on a memslot
378 * must be serialized by slots_lock to ensure the TLB flush from one
379 * operation is observed by any other operation on the same memslot.
380 */
381 lockdep_assert_held(&kvm->slots_lock);
382 kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
383 }
384
385 static void kvm_flush_shadow_all(struct kvm *kvm)
386 {
387 kvm_arch_flush_shadow_all(kvm);
388 kvm_arch_guest_memory_reclaimed(kvm);
389 }
390
391 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
392 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
393 gfp_t gfp_flags)
394 {
395 void *page;
396
397 gfp_flags |= mc->gfp_zero;
398
399 if (mc->kmem_cache)
400 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
401
402 page = (void *)__get_free_page(gfp_flags);
403 if (page && mc->init_value)
404 memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
405 return page;
406 }
407
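/*
 * Top up @mc so that at least @min objects are pre-allocated, allocating the
 * backing objects array (sized for @capacity) on first use.  Returns 0 once
 * @min objects are available, -ENOMEM if allocation fails before reaching
 * @min, or -EIO on API misuse.  Typical (illustrative) usage is to top up the
 * cache before taking mmu_lock, then pull objects with
 * kvm_mmu_memory_cache_alloc() while holding it.
 */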
408 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
409 {
410 gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
411 void *obj;
412
413 if (mc->nobjs >= min)
414 return 0;
415
416 if (unlikely(!mc->objects)) {
417 if (WARN_ON_ONCE(!capacity))
418 return -EIO;
419
420 /*
421 * Custom init values can be used only for page allocations,
422 * and obviously conflict with __GFP_ZERO.
423 */
424 if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
425 return -EIO;
426
427 mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
428 if (!mc->objects)
429 return -ENOMEM;
430
431 mc->capacity = capacity;
432 }
433
434 /* It is illegal to request a different capacity across topups. */
435 if (WARN_ON_ONCE(mc->capacity != capacity))
436 return -EIO;
437
438 while (mc->nobjs < mc->capacity) {
439 obj = mmu_memory_cache_alloc_obj(mc, gfp);
440 if (!obj)
441 return mc->nobjs >= min ? 0 : -ENOMEM;
442 mc->objects[mc->nobjs++] = obj;
443 }
444 return 0;
445 }
446
447 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
448 {
449 return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
450 }
451
452 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
453 {
454 return mc->nobjs;
455 }
456
457 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
458 {
459 while (mc->nobjs) {
460 if (mc->kmem_cache)
461 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
462 else
463 free_page((unsigned long)mc->objects[--mc->nobjs]);
464 }
465
466 kvfree(mc->objects);
467
468 mc->objects = NULL;
469 mc->capacity = 0;
470 }
471
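/*
 * Pop a pre-allocated object from @mc.  The cache is expected to have been
 * topped up beforehand; if it is empty, warn and fall back to an atomic
 * allocation, and BUG if even that fails.
 */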
472 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
473 {
474 void *p;
475
476 if (WARN_ON(!mc->nobjs))
477 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
478 else
479 p = mc->objects[--mc->nobjs];
480 BUG_ON(!p);
481 return p;
482 }
483 #endif
484
485 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
486 {
487 mutex_init(&vcpu->mutex);
488 vcpu->cpu = -1;
489 vcpu->kvm = kvm;
490 vcpu->vcpu_id = id;
491 vcpu->pid = NULL;
492 #ifndef __KVM_HAVE_ARCH_WQP
493 rcuwait_init(&vcpu->wait);
494 #endif
495 kvm_async_pf_vcpu_init(vcpu);
496
497 kvm_vcpu_set_in_spin_loop(vcpu, false);
498 kvm_vcpu_set_dy_eligible(vcpu, false);
499 vcpu->preempted = false;
500 vcpu->ready = false;
501 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
502 vcpu->last_used_slot = NULL;
503
504 /* Fill the stats id string for the vcpu */
505 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
506 task_pid_nr(current), id);
507 }
508
509 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
510 {
511 kvm_arch_vcpu_destroy(vcpu);
512 kvm_dirty_ring_free(&vcpu->dirty_ring);
513
514 /*
515 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
516 * the vcpu->pid pointer, and at destruction time all file descriptors
517 * are already gone.
518 */
519 put_pid(rcu_dereference_protected(vcpu->pid, 1));
520
521 free_page((unsigned long)vcpu->run);
522 kmem_cache_free(kvm_vcpu_cache, vcpu);
523 }
524
525 void kvm_destroy_vcpus(struct kvm *kvm)
526 {
527 unsigned long i;
528 struct kvm_vcpu *vcpu;
529
530 kvm_for_each_vcpu(i, vcpu, kvm) {
531 kvm_vcpu_destroy(vcpu);
532 xa_erase(&kvm->vcpu_array, i);
533 }
534
535 atomic_set(&kvm->online_vcpus, 0);
536 }
537 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
538
539 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
540 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
541 {
542 return container_of(mn, struct kvm, mmu_notifier);
543 }
544
545 typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
546
547 typedef void (*on_lock_fn_t)(struct kvm *kvm);
548
549 struct kvm_mmu_notifier_range {
550 /*
551 * 64-bit addresses, as KVM notifiers can operate on host virtual
552 * addresses (unsigned long) and guest physical addresses (64-bit).
553 */
554 u64 start;
555 u64 end;
556 union kvm_mmu_notifier_arg arg;
557 gfn_handler_t handler;
558 on_lock_fn_t on_lock;
559 bool flush_on_ret;
560 bool may_block;
561 };
562
563 /*
564 * The inner-most helper returns a tuple containing the return value from the
565 * arch- and action-specific handler, plus a flag indicating whether or not at
566 * least one memslot was found, i.e. if the handler found guest memory.
567 *
568 * Note, most notifiers are averse to booleans, so even though KVM tracks the
569 * return from arch code as a bool, outer helpers will cast it to an int. :-(
570 */
571 typedef struct kvm_mmu_notifier_return {
572 bool ret;
573 bool found_memslot;
574 } kvm_mn_ret_t;
575
576 /*
577 * Use a dedicated stub instead of NULL to indicate that there is no callback
578 * function/handler. The compiler technically can't guarantee that a real
579 * function will have a non-zero address, and so it will generate code to
580 * check for !NULL, whereas comparing against a stub will be elided at compile
581 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
582 */
583 static void kvm_null_fn(void)
584 {
585
586 }
587 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
588
589 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
590 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
591 for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
592 node; \
593 node = interval_tree_iter_next(node, start, last)) \
594
595 static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
596 const struct kvm_mmu_notifier_range *range)
597 {
598 struct kvm_mmu_notifier_return r = {
599 .ret = false,
600 .found_memslot = false,
601 };
602 struct kvm_gfn_range gfn_range;
603 struct kvm_memory_slot *slot;
604 struct kvm_memslots *slots;
605 int i, idx;
606
607 if (WARN_ON_ONCE(range->end <= range->start))
608 return r;
609
610 /* A null handler is allowed if and only if on_lock() is provided. */
611 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
612 IS_KVM_NULL_FN(range->handler)))
613 return r;
614
615 idx = srcu_read_lock(&kvm->srcu);
616
617 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
618 struct interval_tree_node *node;
619
620 slots = __kvm_memslots(kvm, i);
621 kvm_for_each_memslot_in_hva_range(node, slots,
622 range->start, range->end - 1) {
623 unsigned long hva_start, hva_end;
624
625 slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
626 hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
627 hva_end = min_t(unsigned long, range->end,
628 slot->userspace_addr + (slot->npages << PAGE_SHIFT));
629
630 /*
631 * To optimize for the likely case where the address
632 * range is covered by zero or one memslots, don't
633 * bother making these conditional (to avoid writes on
634 * the second or later invocation of the handler).
635 */
636 gfn_range.arg = range->arg;
637 gfn_range.may_block = range->may_block;
638
639 /*
640 * {gfn(page) | page intersects with [hva_start, hva_end)} =
641 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
642 */
643 gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
644 gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
645 gfn_range.slot = slot;
646
647 if (!r.found_memslot) {
648 r.found_memslot = true;
649 KVM_MMU_LOCK(kvm);
650 if (!IS_KVM_NULL_FN(range->on_lock))
651 range->on_lock(kvm);
652
653 if (IS_KVM_NULL_FN(range->handler))
654 break;
655 }
656 r.ret |= range->handler(kvm, &gfn_range);
657 }
658 }
659
660 if (range->flush_on_ret && r.ret)
661 kvm_flush_remote_tlbs(kvm);
662
663 if (r.found_memslot)
664 KVM_MMU_UNLOCK(kvm);
665
666 srcu_read_unlock(&kvm->srcu, idx);
667
668 return r;
669 }
670
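/*
 * Wrapper for notifier callbacks: run @handler on every memslot intersecting
 * the [start, end) HVA range and flush remote TLBs if the handler says so.
 */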
671 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
672 unsigned long start,
673 unsigned long end,
674 gfn_handler_t handler)
675 {
676 struct kvm *kvm = mmu_notifier_to_kvm(mn);
677 const struct kvm_mmu_notifier_range range = {
678 .start = start,
679 .end = end,
680 .handler = handler,
681 .on_lock = (void *)kvm_null_fn,
682 .flush_on_ret = true,
683 .may_block = false,
684 };
685
686 return __kvm_handle_hva_range(kvm, &range).ret;
687 }
688
689 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
690 unsigned long start,
691 unsigned long end,
692 gfn_handler_t handler)
693 {
694 struct kvm *kvm = mmu_notifier_to_kvm(mn);
695 const struct kvm_mmu_notifier_range range = {
696 .start = start,
697 .end = end,
698 .handler = handler,
699 .on_lock = (void *)kvm_null_fn,
700 .flush_on_ret = false,
701 .may_block = false,
702 };
703
704 return __kvm_handle_hva_range(kvm, &range).ret;
705 }
706
707 void kvm_mmu_invalidate_begin(struct kvm *kvm)
708 {
709 lockdep_assert_held_write(&kvm->mmu_lock);
710 /*
711 * The count increase must become visible at unlock time as no
712 * spte can be established without taking the mmu_lock and
713 * the count is also read inside the mmu_lock critical section.
714 */
715 kvm->mmu_invalidate_in_progress++;
716
717 if (likely(kvm->mmu_invalidate_in_progress == 1)) {
718 kvm->mmu_invalidate_range_start = INVALID_GPA;
719 kvm->mmu_invalidate_range_end = INVALID_GPA;
720 }
721 }
722
723 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
724 {
725 lockdep_assert_held_write(&kvm->mmu_lock);
726
727 WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
728
729 if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
730 kvm->mmu_invalidate_range_start = start;
731 kvm->mmu_invalidate_range_end = end;
732 } else {
733 /*
734 * Fully tracking multiple concurrent ranges has diminishing
735 * returns. Keep things simple and just find the minimal range
736 * which includes the current and new ranges. As there won't be
737 * enough information to subtract a range after its invalidate
738 * completes, any ranges invalidated concurrently will
739 * accumulate and persist until all outstanding invalidates
740 * complete.
741 */
742 kvm->mmu_invalidate_range_start =
743 min(kvm->mmu_invalidate_range_start, start);
744 kvm->mmu_invalidate_range_end =
745 max(kvm->mmu_invalidate_range_end, end);
746 }
747 }
748
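/*
 * Handler used by invalidate_range_start(): extend the in-progress
 * invalidation window to cover @range, then unmap it from the secondary MMU.
 * Runs under mmu_lock via __kvm_handle_hva_range().
 */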
749 bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
750 {
751 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
752 return kvm_unmap_gfn_range(kvm, range);
753 }
754
755 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
756 const struct mmu_notifier_range *range)
757 {
758 struct kvm *kvm = mmu_notifier_to_kvm(mn);
759 const struct kvm_mmu_notifier_range hva_range = {
760 .start = range->start,
761 .end = range->end,
762 .handler = kvm_mmu_unmap_gfn_range,
763 .on_lock = kvm_mmu_invalidate_begin,
764 .flush_on_ret = true,
765 .may_block = mmu_notifier_range_blockable(range),
766 };
767
768 trace_kvm_unmap_hva_range(range->start, range->end);
769
770 /*
771 * Prevent memslot modification between range_start() and range_end()
772 * so that conditionally locking provides the same result in both
773 * functions. Without that guarantee, the mmu_invalidate_in_progress
774 * adjustments will be imbalanced.
775 *
776 * Pairs with the decrement in range_end().
777 */
778 spin_lock(&kvm->mn_invalidate_lock);
779 kvm->mn_active_invalidate_count++;
780 spin_unlock(&kvm->mn_invalidate_lock);
781
782 /*
783 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
784 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
785 * each cache's lock. There are relatively few caches in existence at
786 * any given time, and the caches themselves can check for hva overlap,
787 * i.e. don't need to rely on memslot overlap checks for performance.
788 * Because this runs without holding mmu_lock, the pfn caches must use
789 * mn_active_invalidate_count (see above) instead of
790 * mmu_invalidate_in_progress.
791 */
792 gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
793
794 /*
795 * If one or more memslots were found and thus zapped, notify arch code
796 * that guest memory has been reclaimed. This needs to be done *after*
797 * dropping mmu_lock, as x86's reclaim path is slooooow.
798 */
799 if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
800 kvm_arch_guest_memory_reclaimed(kvm);
801
802 return 0;
803 }
804
805 void kvm_mmu_invalidate_end(struct kvm *kvm)
806 {
807 lockdep_assert_held_write(&kvm->mmu_lock);
808
809 /*
810 * This sequence increase notifies the KVM page fault handler that
811 * the page that is about to be mapped in the spte could have
812 * been freed.
813 */
814 kvm->mmu_invalidate_seq++;
815 smp_wmb();
816 /*
817 * The above sequence increase must be visible before the
818 * below count decrease, which is ensured by the smp_wmb above
819 * in conjunction with the smp_rmb in mmu_invalidate_retry().
820 */
821 kvm->mmu_invalidate_in_progress--;
822 KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
823
824 /*
825 * Assert that at least one range was added between start() and end().
826 * Not adding a range isn't fatal, but it is a KVM bug.
827 */
828 WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
829 }
830
831 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
832 const struct mmu_notifier_range *range)
833 {
834 struct kvm *kvm = mmu_notifier_to_kvm(mn);
835 const struct kvm_mmu_notifier_range hva_range = {
836 .start = range->start,
837 .end = range->end,
838 .handler = (void *)kvm_null_fn,
839 .on_lock = kvm_mmu_invalidate_end,
840 .flush_on_ret = false,
841 .may_block = mmu_notifier_range_blockable(range),
842 };
843 bool wake;
844
845 __kvm_handle_hva_range(kvm, &hva_range);
846
847 /* Pairs with the increment in range_start(). */
848 spin_lock(&kvm->mn_invalidate_lock);
849 if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
850 --kvm->mn_active_invalidate_count;
851 wake = !kvm->mn_active_invalidate_count;
852 spin_unlock(&kvm->mn_invalidate_lock);
853
854 /*
855 * There can only be one waiter, since the wait happens under
856 * slots_lock.
857 */
858 if (wake)
859 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
860 }
861
862 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
863 struct mm_struct *mm,
864 unsigned long start,
865 unsigned long end)
866 {
867 trace_kvm_age_hva(start, end);
868
869 return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
870 }
871
872 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
873 struct mm_struct *mm,
874 unsigned long start,
875 unsigned long end)
876 {
877 trace_kvm_age_hva(start, end);
878
879 /*
880 * Even though we do not flush TLB, this will still adversely
881 * affect performance on pre-Haswell Intel EPT, where there is
882 * no EPT Access Bit to clear, so we have to tear down EPT
883 * tables instead. If we find this unacceptable, we can always
884 * add a parameter to kvm_age_hva so that it effectively doesn't
885 * do anything on clear_young.
886 *
887 * Also note that currently we never issue secondary TLB flushes
888 * from clear_young, leaving this job up to the regular system
889 * cadence. If we find this inaccurate, we might come up with a
890 * more sophisticated heuristic later.
891 */
892 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
893 }
894
895 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
896 struct mm_struct *mm,
897 unsigned long address)
898 {
899 trace_kvm_test_age_hva(address);
900
901 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
902 kvm_test_age_gfn);
903 }
904
905 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
906 struct mm_struct *mm)
907 {
908 struct kvm *kvm = mmu_notifier_to_kvm(mn);
909 int idx;
910
911 idx = srcu_read_lock(&kvm->srcu);
912 kvm_flush_shadow_all(kvm);
913 srcu_read_unlock(&kvm->srcu, idx);
914 }
915
916 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
917 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
918 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
919 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
920 .clear_young = kvm_mmu_notifier_clear_young,
921 .test_young = kvm_mmu_notifier_test_young,
922 .release = kvm_mmu_notifier_release,
923 };
924
925 static int kvm_init_mmu_notifier(struct kvm *kvm)
926 {
927 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
928 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
929 }
930
931 #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
932
933 static int kvm_init_mmu_notifier(struct kvm *kvm)
934 {
935 return 0;
936 }
937
938 #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
939
940 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
941 static int kvm_pm_notifier_call(struct notifier_block *bl,
942 unsigned long state,
943 void *unused)
944 {
945 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
946
947 return kvm_arch_pm_notifier(kvm, state);
948 }
949
950 static void kvm_init_pm_notifier(struct kvm *kvm)
951 {
952 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
953 /* Suspend KVM before we suspend ftrace, RCU, etc. */
954 kvm->pm_notifier.priority = INT_MAX;
955 register_pm_notifier(&kvm->pm_notifier);
956 }
957
958 static void kvm_destroy_pm_notifier(struct kvm *kvm)
959 {
960 unregister_pm_notifier(&kvm->pm_notifier);
961 }
962 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
963 static void kvm_init_pm_notifier(struct kvm *kvm)
964 {
965 }
966
967 static void kvm_destroy_pm_notifier(struct kvm *kvm)
968 {
969 }
970 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
971
972 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
973 {
974 if (!memslot->dirty_bitmap)
975 return;
976
977 vfree(memslot->dirty_bitmap);
978 memslot->dirty_bitmap = NULL;
979 }
980
981 /* This does not remove the slot from struct kvm_memslots data structures */
982 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
983 {
984 if (slot->flags & KVM_MEM_GUEST_MEMFD)
985 kvm_gmem_unbind(slot);
986
987 kvm_destroy_dirty_bitmap(slot);
988
989 kvm_arch_free_memslot(kvm, slot);
990
991 kfree(slot);
992 }
993
994 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
995 {
996 struct hlist_node *idnode;
997 struct kvm_memory_slot *memslot;
998 int bkt;
999
1000 /*
1001 * The same memslot objects live in both active and inactive sets,
1002 * arbitrarily free using index '1' so the second invocation of this
1003 * function isn't operating over a structure with dangling pointers
1004 * (even though this function isn't actually touching them).
1005 */
1006 if (!slots->node_idx)
1007 return;
1008
1009 hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1010 kvm_free_memslot(kvm, memslot);
1011 }
1012
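/*
 * Instantaneous stats are read-only; cumulative and peak stats may be cleared
 * by writing to the debugfs file, hence the writable mode.
 */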
1013 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1014 {
1015 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1016 case KVM_STATS_TYPE_INSTANT:
1017 return 0444;
1018 case KVM_STATS_TYPE_CUMULATIVE:
1019 case KVM_STATS_TYPE_PEAK:
1020 default:
1021 return 0644;
1022 }
1023 }
1024
1025
1026 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1027 {
1028 int i;
1029 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1030 kvm_vcpu_stats_header.num_desc;
1031
1032 if (IS_ERR(kvm->debugfs_dentry))
1033 return;
1034
1035 debugfs_remove_recursive(kvm->debugfs_dentry);
1036
1037 if (kvm->debugfs_stat_data) {
1038 for (i = 0; i < kvm_debugfs_num_entries; i++)
1039 kfree(kvm->debugfs_stat_data[i]);
1040 kfree(kvm->debugfs_stat_data);
1041 }
1042 }
1043
1044 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1045 {
1046 static DEFINE_MUTEX(kvm_debugfs_lock);
1047 struct dentry *dent;
1048 char dir_name[ITOA_MAX_LEN * 2];
1049 struct kvm_stat_data *stat_data;
1050 const struct _kvm_stats_desc *pdesc;
1051 int i, ret = -ENOMEM;
1052 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1053 kvm_vcpu_stats_header.num_desc;
1054
1055 if (!debugfs_initialized())
1056 return 0;
1057
1058 snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1059 mutex_lock(&kvm_debugfs_lock);
1060 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1061 if (dent) {
1062 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1063 dput(dent);
1064 mutex_unlock(&kvm_debugfs_lock);
1065 return 0;
1066 }
1067 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1068 mutex_unlock(&kvm_debugfs_lock);
1069 if (IS_ERR(dent))
1070 return 0;
1071
1072 kvm->debugfs_dentry = dent;
1073 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1074 sizeof(*kvm->debugfs_stat_data),
1075 GFP_KERNEL_ACCOUNT);
1076 if (!kvm->debugfs_stat_data)
1077 goto out_err;
1078
1079 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1080 pdesc = &kvm_vm_stats_desc[i];
1081 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1082 if (!stat_data)
1083 goto out_err;
1084
1085 stat_data->kvm = kvm;
1086 stat_data->desc = pdesc;
1087 stat_data->kind = KVM_STAT_VM;
1088 kvm->debugfs_stat_data[i] = stat_data;
1089 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1090 kvm->debugfs_dentry, stat_data,
1091 &stat_fops_per_vm);
1092 }
1093
1094 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1095 pdesc = &kvm_vcpu_stats_desc[i];
1096 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1097 if (!stat_data)
1098 goto out_err;
1099
1100 stat_data->kvm = kvm;
1101 stat_data->desc = pdesc;
1102 stat_data->kind = KVM_STAT_VCPU;
1103 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1104 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1105 kvm->debugfs_dentry, stat_data,
1106 &stat_fops_per_vm);
1107 }
1108
1109 kvm_arch_create_vm_debugfs(kvm);
1110 return 0;
1111 out_err:
1112 kvm_destroy_vm_debugfs(kvm);
1113 return ret;
1114 }
1115
1116 /*
1117 * Called after the VM is otherwise initialized, but just before adding it to
1118 * the vm_list.
1119 */
1120 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1121 {
1122 return 0;
1123 }
1124
1125 /*
1126 * Called just after removing the VM from the vm_list, but before doing any
1127 * other destruction.
1128 */
1129 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1130 {
1131 }
1132
1133 /*
1134 * Called after the per-VM debugfs directory has been created. At that point,
1135 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
1136 * be created under it. Cleanup is done automatically and recursively by
1137 * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
1138 */
1139 void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1140 {
1141 }
1142
1143 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1144 {
1145 struct kvm *kvm = kvm_arch_alloc_vm();
1146 struct kvm_memslots *slots;
1147 int r = -ENOMEM;
1148 int i, j;
1149
1150 if (!kvm)
1151 return ERR_PTR(-ENOMEM);
1152
1153 KVM_MMU_LOCK_INIT(kvm);
1154 mmgrab(current->mm);
1155 kvm->mm = current->mm;
1156 kvm_eventfd_init(kvm);
1157 mutex_init(&kvm->lock);
1158 mutex_init(&kvm->irq_lock);
1159 mutex_init(&kvm->slots_lock);
1160 mutex_init(&kvm->slots_arch_lock);
1161 spin_lock_init(&kvm->mn_invalidate_lock);
1162 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1163 xa_init(&kvm->vcpu_array);
1164 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1165 xa_init(&kvm->mem_attr_array);
1166 #endif
1167
1168 INIT_LIST_HEAD(&kvm->gpc_list);
1169 spin_lock_init(&kvm->gpc_lock);
1170
1171 INIT_LIST_HEAD(&kvm->devices);
1172 kvm->max_vcpus = KVM_MAX_VCPUS;
1173
1174 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1175
1176 /*
1177 * Force subsequent debugfs file creations to fail if the VM directory
1178 * is not created (by kvm_create_vm_debugfs()).
1179 */
1180 kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1181
1182 snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1183 task_pid_nr(current));
1184
1185 if (init_srcu_struct(&kvm->srcu))
1186 goto out_err_no_srcu;
1187 if (init_srcu_struct(&kvm->irq_srcu))
1188 goto out_err_no_irq_srcu;
1189
1190 refcount_set(&kvm->users_count, 1);
1191 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1192 for (j = 0; j < 2; j++) {
1193 slots = &kvm->__memslots[i][j];
1194
1195 atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1196 slots->hva_tree = RB_ROOT_CACHED;
1197 slots->gfn_tree = RB_ROOT;
1198 hash_init(slots->id_hash);
1199 slots->node_idx = j;
1200
1201 /* Generations must be different for each address space. */
1202 slots->generation = i;
1203 }
1204
1205 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1206 }
1207
1208 for (i = 0; i < KVM_NR_BUSES; i++) {
1209 rcu_assign_pointer(kvm->buses[i],
1210 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1211 if (!kvm->buses[i])
1212 goto out_err_no_arch_destroy_vm;
1213 }
1214
1215 r = kvm_arch_init_vm(kvm, type);
1216 if (r)
1217 goto out_err_no_arch_destroy_vm;
1218
1219 r = hardware_enable_all();
1220 if (r)
1221 goto out_err_no_disable;
1222
1223 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1224 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1225 #endif
1226
1227 r = kvm_init_mmu_notifier(kvm);
1228 if (r)
1229 goto out_err_no_mmu_notifier;
1230
1231 r = kvm_coalesced_mmio_init(kvm);
1232 if (r < 0)
1233 goto out_no_coalesced_mmio;
1234
1235 r = kvm_create_vm_debugfs(kvm, fdname);
1236 if (r)
1237 goto out_err_no_debugfs;
1238
1239 r = kvm_arch_post_init_vm(kvm);
1240 if (r)
1241 goto out_err;
1242
1243 mutex_lock(&kvm_lock);
1244 list_add(&kvm->vm_list, &vm_list);
1245 mutex_unlock(&kvm_lock);
1246
1247 preempt_notifier_inc();
1248 kvm_init_pm_notifier(kvm);
1249
1250 return kvm;
1251
1252 out_err:
1253 kvm_destroy_vm_debugfs(kvm);
1254 out_err_no_debugfs:
1255 kvm_coalesced_mmio_free(kvm);
1256 out_no_coalesced_mmio:
1257 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1258 if (kvm->mmu_notifier.ops)
1259 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1260 #endif
1261 out_err_no_mmu_notifier:
1262 hardware_disable_all();
1263 out_err_no_disable:
1264 kvm_arch_destroy_vm(kvm);
1265 out_err_no_arch_destroy_vm:
1266 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1267 for (i = 0; i < KVM_NR_BUSES; i++)
1268 kfree(kvm_get_bus(kvm, i));
1269 cleanup_srcu_struct(&kvm->irq_srcu);
1270 out_err_no_irq_srcu:
1271 cleanup_srcu_struct(&kvm->srcu);
1272 out_err_no_srcu:
1273 kvm_arch_free_vm(kvm);
1274 mmdrop(current->mm);
1275 return ERR_PTR(r);
1276 }
1277
1278 static void kvm_destroy_devices(struct kvm *kvm)
1279 {
1280 struct kvm_device *dev, *tmp;
1281
1282 /*
1283 * We do not need to take the kvm->lock here, because nobody else
1284 * has a reference to the struct kvm at this point and therefore
1285 * cannot access the devices list anyhow.
1286 *
1287 * The device list is generally managed as an rculist, but list_del()
1288 * is used intentionally here. If a bug in KVM introduced a reader that
1289 * was not backed by a reference on the kvm struct, the hope is that
1290 * it'd consume the poisoned forward pointer instead of suffering a
1291 * use-after-free, even though this cannot be guaranteed.
1292 */
1293 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1294 list_del(&dev->vm_node);
1295 dev->ops->destroy(dev);
1296 }
1297 }
1298
1299 static void kvm_destroy_vm(struct kvm *kvm)
1300 {
1301 int i;
1302 struct mm_struct *mm = kvm->mm;
1303
1304 kvm_destroy_pm_notifier(kvm);
1305 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1306 kvm_destroy_vm_debugfs(kvm);
1307 kvm_arch_sync_events(kvm);
1308 mutex_lock(&kvm_lock);
1309 list_del(&kvm->vm_list);
1310 mutex_unlock(&kvm_lock);
1311 kvm_arch_pre_destroy_vm(kvm);
1312
1313 kvm_free_irq_routing(kvm);
1314 for (i = 0; i < KVM_NR_BUSES; i++) {
1315 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1316
1317 if (bus)
1318 kvm_io_bus_destroy(bus);
1319 kvm->buses[i] = NULL;
1320 }
1321 kvm_coalesced_mmio_free(kvm);
1322 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1323 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1324 /*
1325 * At this point, pending calls to invalidate_range_start()
1326 * have completed but no more MMU notifiers will run, so
1327 * mn_active_invalidate_count may remain unbalanced.
1328 * No threads can be waiting in kvm_swap_active_memslots() as the
1329 * last reference on KVM has been dropped, but freeing
1330 * memslots would deadlock without this manual intervention.
1331 *
1332 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1333 * notifier between a start() and end(), then there shouldn't be any
1334 * in-progress invalidations.
1335 */
1336 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1337 if (kvm->mn_active_invalidate_count)
1338 kvm->mn_active_invalidate_count = 0;
1339 else
1340 WARN_ON(kvm->mmu_invalidate_in_progress);
1341 #else
1342 kvm_flush_shadow_all(kvm);
1343 #endif
1344 kvm_arch_destroy_vm(kvm);
1345 kvm_destroy_devices(kvm);
1346 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1347 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1348 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1349 }
1350 cleanup_srcu_struct(&kvm->irq_srcu);
1351 cleanup_srcu_struct(&kvm->srcu);
1352 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1353 xa_destroy(&kvm->mem_attr_array);
1354 #endif
1355 kvm_arch_free_vm(kvm);
1356 preempt_notifier_dec();
1357 hardware_disable_all();
1358 mmdrop(mm);
1359 }
1360
1361 void kvm_get_kvm(struct kvm *kvm)
1362 {
1363 refcount_inc(&kvm->users_count);
1364 }
1365 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1366
1367 /*
1368 * Grab a reference only if the VM is not already being destroyed; this is a
1369 * safe version of kvm_get_kvm(). Returns true if kvm was referenced successfully, false otherwise.
1370 */
1371 bool kvm_get_kvm_safe(struct kvm *kvm)
1372 {
1373 return refcount_inc_not_zero(&kvm->users_count);
1374 }
1375 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1376
1377 void kvm_put_kvm(struct kvm *kvm)
1378 {
1379 if (refcount_dec_and_test(&kvm->users_count))
1380 kvm_destroy_vm(kvm);
1381 }
1382 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1383
1384 /*
1385 * Used to put a reference that was taken on behalf of an object associated
1386 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1387 * of the new file descriptor fails and the reference cannot be transferred to
1388 * its final owner. In such cases, the caller is still actively using @kvm and
1389 * will fail miserably if the refcount unexpectedly hits zero.
1390 */
1391 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1392 {
1393 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1394 }
1395 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1396
1397 static int kvm_vm_release(struct inode *inode, struct file *filp)
1398 {
1399 struct kvm *kvm = filp->private_data;
1400
1401 kvm_irqfd_release(kvm);
1402
1403 kvm_put_kvm(kvm);
1404 return 0;
1405 }
1406
1407 /*
1408 * Allocation size is twice as large as the actual dirty bitmap size.
1409 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1410 */
1411 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1412 {
1413 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1414
1415 memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1416 if (!memslot->dirty_bitmap)
1417 return -ENOMEM;
1418
1419 return 0;
1420 }
1421
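/* Return the inactive memslot set, i.e. the one not currently published via RCU. */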
1422 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1423 {
1424 struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1425 int node_idx_inactive = active->node_idx ^ 1;
1426
1427 return &kvm->__memslots[as_id][node_idx_inactive];
1428 }
1429
1430 /*
1431 * Helper to get the address space ID when one of the memslot pointers may be NULL.
1432 * This also serves as a sanity check that at least one of the pointers is non-NULL,
1433 * and that their address space IDs don't diverge.
1434 */
1435 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1436 struct kvm_memory_slot *b)
1437 {
1438 if (WARN_ON_ONCE(!a && !b))
1439 return 0;
1440
1441 if (!a)
1442 return b->as_id;
1443 if (!b)
1444 return a->as_id;
1445
1446 WARN_ON_ONCE(a->as_id != b->as_id);
1447 return a->as_id;
1448 }
1449
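/*
 * Insert @slot into the gfn tree, keyed by base_gfn.  Memslots must not
 * overlap, so hitting an existing node with the same base_gfn is a bug.
 */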
1450 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1451 struct kvm_memory_slot *slot)
1452 {
1453 struct rb_root *gfn_tree = &slots->gfn_tree;
1454 struct rb_node **node, *parent;
1455 int idx = slots->node_idx;
1456
1457 parent = NULL;
1458 for (node = &gfn_tree->rb_node; *node; ) {
1459 struct kvm_memory_slot *tmp;
1460
1461 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1462 parent = *node;
1463 if (slot->base_gfn < tmp->base_gfn)
1464 node = &(*node)->rb_left;
1465 else if (slot->base_gfn > tmp->base_gfn)
1466 node = &(*node)->rb_right;
1467 else
1468 BUG();
1469 }
1470
1471 rb_link_node(&slot->gfn_node[idx], parent, node);
1472 rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1473 }
1474
1475 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1476 struct kvm_memory_slot *slot)
1477 {
1478 rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1479 }
1480
1481 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1482 struct kvm_memory_slot *old,
1483 struct kvm_memory_slot *new)
1484 {
1485 int idx = slots->node_idx;
1486
1487 WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1488
1489 rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1490 &slots->gfn_tree);
1491 }
1492
1493 /*
1494 * Replace @old with @new in the inactive memslots.
1495 *
1496 * With NULL @old this simply adds @new.
1497 * With NULL @new this simply removes @old.
1498 *
1499 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1500 * appropriately.
1501 */
1502 static void kvm_replace_memslot(struct kvm *kvm,
1503 struct kvm_memory_slot *old,
1504 struct kvm_memory_slot *new)
1505 {
1506 int as_id = kvm_memslots_get_as_id(old, new);
1507 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1508 int idx = slots->node_idx;
1509
1510 if (old) {
1511 hash_del(&old->id_node[idx]);
1512 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1513
1514 if ((long)old == atomic_long_read(&slots->last_used_slot))
1515 atomic_long_set(&slots->last_used_slot, (long)new);
1516
1517 if (!new) {
1518 kvm_erase_gfn_node(slots, old);
1519 return;
1520 }
1521 }
1522
1523 /*
1524 * Initialize @new's hva range. Do this even when replacing an @old
1525 * slot, as kvm_copy_memslot() deliberately does not touch node data.
1526 */
1527 new->hva_node[idx].start = new->userspace_addr;
1528 new->hva_node[idx].last = new->userspace_addr +
1529 (new->npages << PAGE_SHIFT) - 1;
1530
1531 /*
1532 * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), so
1533 * the hva_node needs to be swapped with remove+insert even though the hva
1534 * can't change when replacing an existing slot.
1535 */
1536 hash_add(slots->id_hash, &new->id_node[idx], new->id);
1537 interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1538
1539 /*
1540 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1541 * switch the node in the gfn tree instead of removing the old and
1542 * inserting the new as two separate operations. Replacement is a
1543 * single O(1) operation versus two O(log(n)) operations for
1544 * remove+insert.
1545 */
1546 if (old && old->base_gfn == new->base_gfn) {
1547 kvm_replace_gfn_node(slots, old, new);
1548 } else {
1549 if (old)
1550 kvm_erase_gfn_node(slots, old);
1551 kvm_insert_gfn_node(slots, new);
1552 }
1553 }
1554
1555 /*
1556 * Flags that do not access any of the extra space of struct
1557 * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1558 * only allows these.
1559 */
1560 #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1561 (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1562
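/* Validate mem->flags against the flags supported for this VM/architecture. */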
1563 static int check_memory_region_flags(struct kvm *kvm,
1564 const struct kvm_userspace_memory_region2 *mem)
1565 {
1566 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1567
1568 if (kvm_arch_has_private_mem(kvm))
1569 valid_flags |= KVM_MEM_GUEST_MEMFD;
1570
1571 /* Dirty logging private memory is not currently supported. */
1572 if (mem->flags & KVM_MEM_GUEST_MEMFD)
1573 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1574
1575 #ifdef CONFIG_HAVE_KVM_READONLY_MEM
1576 /*
1577 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
1578 * read-only memslots have emulated MMIO, not page fault, semantics,
1579 * and KVM doesn't allow emulated MMIO for private memory.
1580 */
1581 if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
1582 valid_flags |= KVM_MEM_READONLY;
1583 #endif
1584
1585 if (mem->flags & ~valid_flags)
1586 return -EINVAL;
1587
1588 return 0;
1589 }
1590
1591 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1592 {
1593 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1594
1595 /* Grab the generation from the currently active memslots. */
1596 u64 gen = __kvm_memslots(kvm, as_id)->generation;
1597
1598 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1599 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1600
1601 /*
1602 * Do not store the new memslots while there are invalidations in
1603 * progress, otherwise the locking in invalidate_range_start and
1604 * invalidate_range_end will be unbalanced.
1605 */
1606 spin_lock(&kvm->mn_invalidate_lock);
1607 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1608 while (kvm->mn_active_invalidate_count) {
1609 set_current_state(TASK_UNINTERRUPTIBLE);
1610 spin_unlock(&kvm->mn_invalidate_lock);
1611 schedule();
1612 spin_lock(&kvm->mn_invalidate_lock);
1613 }
1614 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1615 rcu_assign_pointer(kvm->memslots[as_id], slots);
1616 spin_unlock(&kvm->mn_invalidate_lock);
1617
1618 /*
1619 * Acquired in kvm_set_memslot. Must be released before the SRCU
1620 * synchronization below in order to avoid deadlocking with another thread
1621 * acquiring slots_arch_lock in an SRCU critical section.
1622 */
1623 mutex_unlock(&kvm->slots_arch_lock);
1624
1625 synchronize_srcu_expedited(&kvm->srcu);
1626
1627 /*
1628 * Increment the new memslot generation a second time, dropping the
1629 * update in-progress flag and incrementing the generation based on
1630 * the number of address spaces. This provides a unique and easily
1631 * identifiable generation number while the memslots are in flux.
1632 */
1633 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1634
1635 /*
1636 * Generations must be unique even across address spaces. We do not need
1637 * a global counter for that; instead, the generation space is evenly split
1638 * across address spaces. For example, with two address spaces, address
1639 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1640 * use generations 1, 3, 5, ...
1641 */
1642 gen += kvm_arch_nr_memslot_as_ids(kvm);
1643
1644 kvm_arch_memslots_updated(kvm, gen);
1645
1646 slots->generation = gen;
1647 }
1648
1649 static int kvm_prepare_memory_region(struct kvm *kvm,
1650 const struct kvm_memory_slot *old,
1651 struct kvm_memory_slot *new,
1652 enum kvm_mr_change change)
1653 {
1654 int r;
1655
1656 /*
1657 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1658 * will be freed on "commit". If logging is enabled in both old and
1659 * new, reuse the existing bitmap. If logging is enabled only in the
1660 * new and KVM isn't using a ring buffer, allocate and initialize a
1661 * new bitmap.
1662 */
1663 if (change != KVM_MR_DELETE) {
1664 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1665 new->dirty_bitmap = NULL;
1666 else if (old && old->dirty_bitmap)
1667 new->dirty_bitmap = old->dirty_bitmap;
1668 else if (kvm_use_dirty_bitmap(kvm)) {
1669 r = kvm_alloc_dirty_bitmap(new);
1670 if (r)
1671 return r;
1672
1673 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1674 bitmap_set(new->dirty_bitmap, 0, new->npages);
1675 }
1676 }
1677
1678 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1679
1680 /* Free the bitmap on failure if it was allocated above. */
1681 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1682 kvm_destroy_dirty_bitmap(new);
1683
1684 return r;
1685 }
1686
1687 static void kvm_commit_memory_region(struct kvm *kvm,
1688 struct kvm_memory_slot *old,
1689 const struct kvm_memory_slot *new,
1690 enum kvm_mr_change change)
1691 {
1692 int old_flags = old ? old->flags : 0;
1693 int new_flags = new ? new->flags : 0;
1694 /*
1695 * Update the total number of memslot pages before calling the arch
1696 * hook so that architectures can consume the result directly.
1697 */
1698 if (change == KVM_MR_DELETE)
1699 kvm->nr_memslot_pages -= old->npages;
1700 else if (change == KVM_MR_CREATE)
1701 kvm->nr_memslot_pages += new->npages;
1702
1703 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1704 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1705 atomic_set(&kvm->nr_memslots_dirty_logging,
1706 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1707 }
1708
1709 kvm_arch_commit_memory_region(kvm, old, new, change);
1710
1711 switch (change) {
1712 case KVM_MR_CREATE:
1713 /* Nothing more to do. */
1714 break;
1715 case KVM_MR_DELETE:
1716 /* Free the old memslot and all its metadata. */
1717 kvm_free_memslot(kvm, old);
1718 break;
1719 case KVM_MR_MOVE:
1720 case KVM_MR_FLAGS_ONLY:
1721 /*
1722 * Free the dirty bitmap as needed; the below check encompasses
1723	 * both the flags and whether a ring buffer is being used.
1724 */
1725 if (old->dirty_bitmap && !new->dirty_bitmap)
1726 kvm_destroy_dirty_bitmap(old);
1727
1728 /*
1729 * The final quirk. Free the detached, old slot, but only its
1730 * memory, not any metadata. Metadata, including arch specific
1731 * data, may be reused by @new.
1732 */
1733 kfree(old);
1734 break;
1735 default:
1736 BUG();
1737 }
1738 }
1739
1740 /*
1741 * Activate @new, which must be installed in the inactive slots by the caller,
1742 * by swapping the active slots and then propagating @new to @old once @old is
1743 * unreachable and can be safely modified.
1744 *
1745 * With NULL @old this simply adds @new to @active (while swapping the sets).
1746 * With NULL @new this simply removes @old from @active and frees it
1747 * (while also swapping the sets).
1748 */
1749 static void kvm_activate_memslot(struct kvm *kvm,
1750 struct kvm_memory_slot *old,
1751 struct kvm_memory_slot *new)
1752 {
1753 int as_id = kvm_memslots_get_as_id(old, new);
1754
1755 kvm_swap_active_memslots(kvm, as_id);
1756
1757 /* Propagate the new memslot to the now inactive memslots. */
1758 kvm_replace_memslot(kvm, old, new);
1759 }
1760
1761 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1762 const struct kvm_memory_slot *src)
1763 {
1764 dest->base_gfn = src->base_gfn;
1765 dest->npages = src->npages;
1766 dest->dirty_bitmap = src->dirty_bitmap;
1767 dest->arch = src->arch;
1768 dest->userspace_addr = src->userspace_addr;
1769 dest->flags = src->flags;
1770 dest->id = src->id;
1771 dest->as_id = src->as_id;
1772 }
1773
1774 static void kvm_invalidate_memslot(struct kvm *kvm,
1775 struct kvm_memory_slot *old,
1776 struct kvm_memory_slot *invalid_slot)
1777 {
1778 /*
1779 * Mark the current slot INVALID. As with all memslot modifications,
1780 * this must be done on an unreachable slot to avoid modifying the
1781 * current slot in the active tree.
1782 */
1783 kvm_copy_memslot(invalid_slot, old);
1784 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1785 kvm_replace_memslot(kvm, old, invalid_slot);
1786
1787 /*
1788 * Activate the slot that is now marked INVALID, but don't propagate
1789 * the slot to the now inactive slots. The slot is either going to be
1790 * deleted or recreated as a new slot.
1791 */
1792 kvm_swap_active_memslots(kvm, old->as_id);
1793
1794 /*
1795 * From this point no new shadow pages pointing to a deleted, or moved,
1796 * memslot will be created. Validation of sp->gfn happens in:
1797 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1798 * - kvm_is_visible_gfn (mmu_check_root)
1799 */
1800 kvm_arch_flush_shadow_memslot(kvm, old);
1801 kvm_arch_guest_memory_reclaimed(kvm);
1802
1803 /* Was released by kvm_swap_active_memslots(), reacquire. */
1804 mutex_lock(&kvm->slots_arch_lock);
1805
1806 /*
1807 * Copy the arch-specific field of the newly-installed slot back to the
1808 * old slot as the arch data could have changed between releasing
1809 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1810 * above. Writers are required to retrieve memslots *after* acquiring
1811 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1812 */
1813 old->arch = invalid_slot->arch;
1814 }
1815
1816 static void kvm_create_memslot(struct kvm *kvm,
1817 struct kvm_memory_slot *new)
1818 {
1819 /* Add the new memslot to the inactive set and activate. */
1820 kvm_replace_memslot(kvm, NULL, new);
1821 kvm_activate_memslot(kvm, NULL, new);
1822 }
1823
1824 static void kvm_delete_memslot(struct kvm *kvm,
1825 struct kvm_memory_slot *old,
1826 struct kvm_memory_slot *invalid_slot)
1827 {
1828 /*
1829 * Remove the old memslot (in the inactive memslots) by passing NULL as
1830	 * the "new" slot, and do the same for the invalid version in the active slots.
1831 */
1832 kvm_replace_memslot(kvm, old, NULL);
1833 kvm_activate_memslot(kvm, invalid_slot, NULL);
1834 }
1835
1836 static void kvm_move_memslot(struct kvm *kvm,
1837 struct kvm_memory_slot *old,
1838 struct kvm_memory_slot *new,
1839 struct kvm_memory_slot *invalid_slot)
1840 {
1841 /*
1842 * Replace the old memslot in the inactive slots, and then swap slots
1843 * and replace the current INVALID with the new as well.
1844 */
1845 kvm_replace_memslot(kvm, old, new);
1846 kvm_activate_memslot(kvm, invalid_slot, new);
1847 }
1848
1849 static void kvm_update_flags_memslot(struct kvm *kvm,
1850 struct kvm_memory_slot *old,
1851 struct kvm_memory_slot *new)
1852 {
1853 /*
1854 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1855 * an intermediate step. Instead, the old memslot is simply replaced
1856 * with a new, updated copy in both memslot sets.
1857 */
1858 kvm_replace_memslot(kvm, old, new);
1859 kvm_activate_memslot(kvm, old, new);
1860 }
1861
1862 static int kvm_set_memslot(struct kvm *kvm,
1863 struct kvm_memory_slot *old,
1864 struct kvm_memory_slot *new,
1865 enum kvm_mr_change change)
1866 {
1867 struct kvm_memory_slot *invalid_slot;
1868 int r;
1869
1870 /*
1871 * Released in kvm_swap_active_memslots().
1872 *
1873 * Must be held from before the current memslots are copied until after
1874 * the new memslots are installed with rcu_assign_pointer, then
1875 * released before the synchronize srcu in kvm_swap_active_memslots().
1876 *
1877 * When modifying memslots outside of the slots_lock, must be held
1878 * before reading the pointer to the current memslots until after all
1879 * changes to those memslots are complete.
1880 *
1881 * These rules ensure that installing new memslots does not lose
1882 * changes made to the previous memslots.
1883 */
1884 mutex_lock(&kvm->slots_arch_lock);
1885
1886 /*
1887 * Invalidate the old slot if it's being deleted or moved. This is
1888 * done prior to actually deleting/moving the memslot to allow vCPUs to
1889 * continue running by ensuring there are no mappings or shadow pages
1890 * for the memslot when it is deleted/moved. Without pre-invalidation
1891 * (and without a lock), a window would exist between effecting the
1892 * delete/move and committing the changes in arch code where KVM or a
1893 * guest could access a non-existent memslot.
1894 *
1895 * Modifications are done on a temporary, unreachable slot. The old
1896 * slot needs to be preserved in case a later step fails and the
1897 * invalidation needs to be reverted.
1898 */
1899 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1900 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1901 if (!invalid_slot) {
1902 mutex_unlock(&kvm->slots_arch_lock);
1903 return -ENOMEM;
1904 }
1905 kvm_invalidate_memslot(kvm, old, invalid_slot);
1906 }
1907
1908 r = kvm_prepare_memory_region(kvm, old, new, change);
1909 if (r) {
1910 /*
1911 * For DELETE/MOVE, revert the above INVALID change. No
1912 * modifications required since the original slot was preserved
1913 * in the inactive slots. Changing the active memslots also
1914	 * releases slots_arch_lock.
1915 */
1916 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1917 kvm_activate_memslot(kvm, invalid_slot, old);
1918 kfree(invalid_slot);
1919 } else {
1920 mutex_unlock(&kvm->slots_arch_lock);
1921 }
1922 return r;
1923 }
1924
1925 /*
1926 * For DELETE and MOVE, the working slot is now active as the INVALID
1927 * version of the old slot. MOVE is particularly special as it reuses
1928	 * the old slot and returns a copy of the old slot (in the invalid slot).
1929 * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1930 * old slot is detached but otherwise preserved.
1931 */
1932 if (change == KVM_MR_CREATE)
1933 kvm_create_memslot(kvm, new);
1934 else if (change == KVM_MR_DELETE)
1935 kvm_delete_memslot(kvm, old, invalid_slot);
1936 else if (change == KVM_MR_MOVE)
1937 kvm_move_memslot(kvm, old, new, invalid_slot);
1938 else if (change == KVM_MR_FLAGS_ONLY)
1939 kvm_update_flags_memslot(kvm, old, new);
1940 else
1941 BUG();
1942
1943 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1944 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1945 kfree(invalid_slot);
1946
1947 /*
1948 * No need to refresh new->arch, changes after dropping slots_arch_lock
1949 * will directly hit the final, active memslot. Architectures are
1950 * responsible for knowing that new->arch may be stale.
1951 */
1952 kvm_commit_memory_region(kvm, old, new, change);
1953
1954 return 0;
1955 }
1956
1957 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1958 gfn_t start, gfn_t end)
1959 {
1960 struct kvm_memslot_iter iter;
1961
1962 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1963 if (iter.slot->id != id)
1964 return true;
1965 }
1966
1967 return false;
1968 }
1969
1970 /*
1971 * Allocate some memory and give it an address in the guest physical address
1972 * space.
1973 *
1974 * Discontiguous memory is allowed, mostly for framebuffers.
1975 *
1976 * Must be called holding kvm->slots_lock for write.
1977 */
1978 int __kvm_set_memory_region(struct kvm *kvm,
1979 const struct kvm_userspace_memory_region2 *mem)
1980 {
1981 struct kvm_memory_slot *old, *new;
1982 struct kvm_memslots *slots;
1983 enum kvm_mr_change change;
1984 unsigned long npages;
1985 gfn_t base_gfn;
1986 int as_id, id;
1987 int r;
1988
1989 r = check_memory_region_flags(kvm, mem);
1990 if (r)
1991 return r;
1992
1993 as_id = mem->slot >> 16;
1994 id = (u16)mem->slot;
1995
1996 /* General sanity checks */
1997 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1998 (mem->memory_size != (unsigned long)mem->memory_size))
1999 return -EINVAL;
2000 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
2001 return -EINVAL;
2002 /* We can read the guest memory with __xxx_user() later on. */
2003 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
2004 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
2005 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
2006 mem->memory_size))
2007 return -EINVAL;
2008 if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2009 (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2010 mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2011 return -EINVAL;
2012 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
2013 return -EINVAL;
2014 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
2015 return -EINVAL;
2016 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2017 return -EINVAL;
2018
2019 slots = __kvm_memslots(kvm, as_id);
2020
2021 /*
2022 * Note, the old memslot (and the pointer itself!) may be invalidated
2023 * and/or destroyed by kvm_set_memslot().
2024 */
2025 old = id_to_memslot(slots, id);
2026
2027 if (!mem->memory_size) {
2028 if (!old || !old->npages)
2029 return -EINVAL;
2030
2031 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
2032 return -EIO;
2033
2034 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2035 }
2036
2037 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2038 npages = (mem->memory_size >> PAGE_SHIFT);
2039
2040 if (!old || !old->npages) {
2041 change = KVM_MR_CREATE;
2042
2043 /*
2044 * To simplify KVM internals, the total number of pages across
2045 * all memslots must fit in an unsigned long.
2046 */
2047 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2048 return -EINVAL;
2049 } else { /* Modify an existing slot. */
2050 /* Private memslots are immutable, they can only be deleted. */
2051 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2052 return -EINVAL;
2053 if ((mem->userspace_addr != old->userspace_addr) ||
2054 (npages != old->npages) ||
2055 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2056 return -EINVAL;
2057
2058 if (base_gfn != old->base_gfn)
2059 change = KVM_MR_MOVE;
2060 else if (mem->flags != old->flags)
2061 change = KVM_MR_FLAGS_ONLY;
2062 else /* Nothing to change. */
2063 return 0;
2064 }
2065
2066 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2067 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2068 return -EEXIST;
2069
2070	/* Allocate a slot that will persist in the memslots. */
2071 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2072 if (!new)
2073 return -ENOMEM;
2074
2075 new->as_id = as_id;
2076 new->id = id;
2077 new->base_gfn = base_gfn;
2078 new->npages = npages;
2079 new->flags = mem->flags;
2080 new->userspace_addr = mem->userspace_addr;
2081 if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2082 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2083 if (r)
2084 goto out;
2085 }
2086
2087 r = kvm_set_memslot(kvm, old, new, change);
2088 if (r)
2089 goto out_unbind;
2090
2091 return 0;
2092
2093 out_unbind:
2094 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2095 kvm_gmem_unbind(new);
2096 out:
2097 kfree(new);
2098 return r;
2099 }
2100 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2101
2102 int kvm_set_memory_region(struct kvm *kvm,
2103 const struct kvm_userspace_memory_region2 *mem)
2104 {
2105 int r;
2106
2107 mutex_lock(&kvm->slots_lock);
2108 r = __kvm_set_memory_region(kvm, mem);
2109 mutex_unlock(&kvm->slots_lock);
2110 return r;
2111 }
2112 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2113
2114 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2115 struct kvm_userspace_memory_region2 *mem)
2116 {
2117 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2118 return -EINVAL;
2119
2120 return kvm_set_memory_region(kvm, mem);
2121 }
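/*
 * A hedged userspace sketch (compiled out, not part of this file) of how the
 * ioctl path above is typically reached: the caller fills a
 * struct kvm_userspace_memory_region2 and issues it on the VM fd. The
 * KVM_SET_USER_MEMORY_REGION2 ioctl name is assumed from the uapi headers;
 * the fd, slot number and addresses below are hypothetical.
 */
#if 0
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_map_guest_ram(int vm_fd, void *host_mem)
{
	struct kvm_userspace_memory_region2 region = {
		.slot = 0,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0x100000,		/* 1 MiB into guest PA space */
		.memory_size = 16 * 1024 * 1024,	/* must be page aligned */
		.userspace_addr = (unsigned long)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
#endif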
2122
2123 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2124 /**
2125 * kvm_get_dirty_log - get a snapshot of dirty pages
2126 * @kvm: pointer to kvm instance
2127 * @log: slot id and address to which we copy the log
2128 * @is_dirty: set to '1' if any dirty pages were found
2129 * @memslot: set to the associated memslot, always valid on success
2130 */
2131 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2132 int *is_dirty, struct kvm_memory_slot **memslot)
2133 {
2134 struct kvm_memslots *slots;
2135 int i, as_id, id;
2136 unsigned long n;
2137 unsigned long any = 0;
2138
2139 /* Dirty ring tracking may be exclusive to dirty log tracking */
2140 if (!kvm_use_dirty_bitmap(kvm))
2141 return -ENXIO;
2142
2143 *memslot = NULL;
2144 *is_dirty = 0;
2145
2146 as_id = log->slot >> 16;
2147 id = (u16)log->slot;
2148 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2149 return -EINVAL;
2150
2151 slots = __kvm_memslots(kvm, as_id);
2152 *memslot = id_to_memslot(slots, id);
2153 if (!(*memslot) || !(*memslot)->dirty_bitmap)
2154 return -ENOENT;
2155
2156 kvm_arch_sync_dirty_log(kvm, *memslot);
2157
2158 n = kvm_dirty_bitmap_bytes(*memslot);
2159
2160 for (i = 0; !any && i < n/sizeof(long); ++i)
2161 any = (*memslot)->dirty_bitmap[i];
2162
2163 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2164 return -EFAULT;
2165
2166 if (any)
2167 *is_dirty = 1;
2168 return 0;
2169 }
2170 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2171
2172 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2173 /**
2174 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2175 * and reenable dirty page tracking for the corresponding pages.
2176 * @kvm: pointer to kvm instance
2177 * @log: slot id and address to which we copy the log
2178 *
2179	 * We need to keep in mind that VCPU threads can write to the bitmap
2180 * concurrently. So, to avoid losing track of dirty pages we keep the
2181 * following order:
2182 *
2183 * 1. Take a snapshot of the bit and clear it if needed.
2184 * 2. Write protect the corresponding page.
2185 * 3. Copy the snapshot to the userspace.
2186 * 4. Upon return caller flushes TLB's if needed.
2187 *
2188 * Between 2 and 4, the guest may write to the page using the remaining TLB
2189 * entry. This is not a problem because the page is reported dirty using
2190 * the snapshot taken before and step 4 ensures that writes done after
2191 * exiting to userspace will be logged for the next call.
2192 *
2193 */
2194 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2195 {
2196 struct kvm_memslots *slots;
2197 struct kvm_memory_slot *memslot;
2198 int i, as_id, id;
2199 unsigned long n;
2200 unsigned long *dirty_bitmap;
2201 unsigned long *dirty_bitmap_buffer;
2202 bool flush;
2203
2204 /* Dirty ring tracking may be exclusive to dirty log tracking */
2205 if (!kvm_use_dirty_bitmap(kvm))
2206 return -ENXIO;
2207
2208 as_id = log->slot >> 16;
2209 id = (u16)log->slot;
2210 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2211 return -EINVAL;
2212
2213 slots = __kvm_memslots(kvm, as_id);
2214 memslot = id_to_memslot(slots, id);
2215 if (!memslot || !memslot->dirty_bitmap)
2216 return -ENOENT;
2217
2218 dirty_bitmap = memslot->dirty_bitmap;
2219
2220 kvm_arch_sync_dirty_log(kvm, memslot);
2221
2222 n = kvm_dirty_bitmap_bytes(memslot);
2223 flush = false;
2224 if (kvm->manual_dirty_log_protect) {
2225 /*
2226 * Unlike kvm_get_dirty_log, we always return false in *flush,
2227 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2228 * is some code duplication between this function and
2229		 * kvm_get_dirty_log, but hopefully once all architectures
2230		 * transition to kvm_get_dirty_log_protect, kvm_get_dirty_log
2231		 * can be eliminated.
2232 */
2233 dirty_bitmap_buffer = dirty_bitmap;
2234 } else {
2235 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2236 memset(dirty_bitmap_buffer, 0, n);
2237
2238 KVM_MMU_LOCK(kvm);
2239 for (i = 0; i < n / sizeof(long); i++) {
2240 unsigned long mask;
2241 gfn_t offset;
2242
2243 if (!dirty_bitmap[i])
2244 continue;
2245
2246 flush = true;
2247 mask = xchg(&dirty_bitmap[i], 0);
2248 dirty_bitmap_buffer[i] = mask;
2249
2250 offset = i * BITS_PER_LONG;
2251 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2252 offset, mask);
2253 }
2254 KVM_MMU_UNLOCK(kvm);
2255 }
2256
2257 if (flush)
2258 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2259
2260 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2261 return -EFAULT;
2262 return 0;
2263 }
2264
2265
2266 /**
2267 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2268 * @kvm: kvm instance
2269 * @log: slot id and address to which we copy the log
2270 *
2271 * Steps 1-4 below provide general overview of dirty page logging. See
2272 * kvm_get_dirty_log_protect() function description for additional details.
2273 *
2274 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
2275 * always flush the TLB (step 4) even if a previous step failed and the dirty
2276 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2277 * API does not preclude subsequent dirty log reads by user space. Flushing the
2278 * TLB ensures writes will be marked dirty for the next log read.
2279 *
2280 * 1. Take a snapshot of the bit and clear it if needed.
2281 * 2. Write protect the corresponding page.
2282 * 3. Copy the snapshot to the userspace.
2283 * 4. Flush TLB's if needed.
2284 */
2285 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2286 struct kvm_dirty_log *log)
2287 {
2288 int r;
2289
2290 mutex_lock(&kvm->slots_lock);
2291
2292 r = kvm_get_dirty_log_protect(kvm, log);
2293
2294 mutex_unlock(&kvm->slots_lock);
2295 return r;
2296 }
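/*
 * A hedged userspace sketch (compiled out) of consuming the path above: the
 * caller allocates a bitmap with one bit per page in the slot, rounded up to
 * a multiple of 64 bits to match kvm_dirty_bitmap_bytes(), and issues
 * KVM_GET_DIRTY_LOG. The fd and slot geometry are hypothetical.
 */
#if 0
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>

static int example_fetch_dirty_log(int vm_fd, unsigned int slot,
				   size_t slot_pages)
{
	size_t bytes = ((slot_pages + 63) / 64) * 8;	/* one bit per page */
	void *bitmap = calloc(1, bytes);
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};
	int ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);

	/* ... scan @bitmap for set bits, one per dirtied guest page ... */
	free(bitmap);
	return ret;
}
#endif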
2297
2298 /**
2299 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2300 * and reenable dirty page tracking for the corresponding pages.
2301 * @kvm: pointer to kvm instance
2302 * @log: slot id and address from which to fetch the bitmap of dirty pages
2303 */
2304 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2305 struct kvm_clear_dirty_log *log)
2306 {
2307 struct kvm_memslots *slots;
2308 struct kvm_memory_slot *memslot;
2309 int as_id, id;
2310 gfn_t offset;
2311 unsigned long i, n;
2312 unsigned long *dirty_bitmap;
2313 unsigned long *dirty_bitmap_buffer;
2314 bool flush;
2315
2316 /* Dirty ring tracking may be exclusive to dirty log tracking */
2317 if (!kvm_use_dirty_bitmap(kvm))
2318 return -ENXIO;
2319
2320 as_id = log->slot >> 16;
2321 id = (u16)log->slot;
2322 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2323 return -EINVAL;
2324
2325 if (log->first_page & 63)
2326 return -EINVAL;
2327
2328 slots = __kvm_memslots(kvm, as_id);
2329 memslot = id_to_memslot(slots, id);
2330 if (!memslot || !memslot->dirty_bitmap)
2331 return -ENOENT;
2332
2333 dirty_bitmap = memslot->dirty_bitmap;
2334
2335 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2336
2337 if (log->first_page > memslot->npages ||
2338 log->num_pages > memslot->npages - log->first_page ||
2339 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2340 return -EINVAL;
2341
2342 kvm_arch_sync_dirty_log(kvm, memslot);
2343
2344 flush = false;
2345 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2346 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2347 return -EFAULT;
2348
2349 KVM_MMU_LOCK(kvm);
2350 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2351 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2352 i++, offset += BITS_PER_LONG) {
2353 unsigned long mask = *dirty_bitmap_buffer++;
2354 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2355 if (!mask)
2356 continue;
2357
2358 mask &= atomic_long_fetch_andnot(mask, p);
2359
2360 /*
2361 * mask contains the bits that really have been cleared. This
2362 * never includes any bits beyond the length of the memslot (if
2363 * the length is not aligned to 64 pages), therefore it is not
2364 * a problem if userspace sets them in log->dirty_bitmap.
2365 */
2366 if (mask) {
2367 flush = true;
2368 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2369 offset, mask);
2370 }
2371 }
2372 KVM_MMU_UNLOCK(kvm);
2373
2374 if (flush)
2375 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2376
2377 return 0;
2378 }
2379
2380 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2381 struct kvm_clear_dirty_log *log)
2382 {
2383 int r;
2384
2385 mutex_lock(&kvm->slots_lock);
2386
2387 r = kvm_clear_dirty_log_protect(kvm, log);
2388
2389 mutex_unlock(&kvm->slots_lock);
2390 return r;
2391 }
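/*
 * A hedged userspace sketch (compiled out) of the manual-protect flow: after
 * harvesting a bitmap with KVM_GET_DIRTY_LOG, the caller hands the bits it
 * wants re-armed back via KVM_CLEAR_DIRTY_LOG. first_page must be a multiple
 * of 64 and num_pages a multiple of 64 unless it reaches the end of the slot,
 * matching the checks above; the fd and values are hypothetical.
 */
#if 0
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_clear_dirty(int vm_fd, unsigned int slot,
			       __u64 first_page, __u32 num_pages,
			       void *harvested_bitmap)
{
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = first_page,		/* multiple of 64 */
		.num_pages = num_pages,			/* multiple of 64, or up to slot end */
		.dirty_bitmap = harvested_bitmap,
	};

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}
#endif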
2392 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2393
2394 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2395 /*
2396 * Returns true if _all_ gfns in the range [@start, @end) have attributes
2397 * matching @attrs.
2398 */
2399 bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2400 unsigned long attrs)
2401 {
2402 XA_STATE(xas, &kvm->mem_attr_array, start);
2403 unsigned long index;
2404 bool has_attrs;
2405 void *entry;
2406
2407 rcu_read_lock();
2408
2409 if (!attrs) {
2410 has_attrs = !xas_find(&xas, end - 1);
2411 goto out;
2412 }
2413
2414 has_attrs = true;
2415 for (index = start; index < end; index++) {
2416 do {
2417 entry = xas_next(&xas);
2418 } while (xas_retry(&xas, entry));
2419
2420 if (xas.xa_index != index || xa_to_value(entry) != attrs) {
2421 has_attrs = false;
2422 break;
2423 }
2424 }
2425
2426 out:
2427 rcu_read_unlock();
2428 return has_attrs;
2429 }
2430
2431 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2432 {
2433 if (!kvm || kvm_arch_has_private_mem(kvm))
2434 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2435
2436 return 0;
2437 }
2438
2439 static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2440 struct kvm_mmu_notifier_range *range)
2441 {
2442 struct kvm_gfn_range gfn_range;
2443 struct kvm_memory_slot *slot;
2444 struct kvm_memslots *slots;
2445 struct kvm_memslot_iter iter;
2446 bool found_memslot = false;
2447 bool ret = false;
2448 int i;
2449
2450 gfn_range.arg = range->arg;
2451 gfn_range.may_block = range->may_block;
2452
2453 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
2454 slots = __kvm_memslots(kvm, i);
2455
2456 kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2457 slot = iter.slot;
2458 gfn_range.slot = slot;
2459
2460 gfn_range.start = max(range->start, slot->base_gfn);
2461 gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2462 if (gfn_range.start >= gfn_range.end)
2463 continue;
2464
2465 if (!found_memslot) {
2466 found_memslot = true;
2467 KVM_MMU_LOCK(kvm);
2468 if (!IS_KVM_NULL_FN(range->on_lock))
2469 range->on_lock(kvm);
2470 }
2471
2472 ret |= range->handler(kvm, &gfn_range);
2473 }
2474 }
2475
2476 if (range->flush_on_ret && ret)
2477 kvm_flush_remote_tlbs(kvm);
2478
2479 if (found_memslot)
2480 KVM_MMU_UNLOCK(kvm);
2481 }
2482
2483 static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2484 struct kvm_gfn_range *range)
2485 {
2486 /*
2487 * Unconditionally add the range to the invalidation set, regardless of
2488 * whether or not the arch callback actually needs to zap SPTEs. E.g.
2489 * if KVM supports RWX attributes in the future and the attributes are
2490 * going from R=>RW, zapping isn't strictly necessary. Unconditionally
2491 * adding the range allows KVM to require that MMU invalidations add at
2492 * least one range between begin() and end(), e.g. allows KVM to detect
2493 * bugs where the add() is missed. Relaxing the rule *might* be safe,
2494 * but it's not obvious that allowing new mappings while the attributes
2495 * are in flux is desirable or worth the complexity.
2496 */
2497 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2498
2499 return kvm_arch_pre_set_memory_attributes(kvm, range);
2500 }
2501
2502 /* Set @attributes for the gfn range [@start, @end). */
2503 static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2504 unsigned long attributes)
2505 {
2506 struct kvm_mmu_notifier_range pre_set_range = {
2507 .start = start,
2508 .end = end,
2509 .handler = kvm_pre_set_memory_attributes,
2510 .on_lock = kvm_mmu_invalidate_begin,
2511 .flush_on_ret = true,
2512 .may_block = true,
2513 };
2514 struct kvm_mmu_notifier_range post_set_range = {
2515 .start = start,
2516 .end = end,
2517 .arg.attributes = attributes,
2518 .handler = kvm_arch_post_set_memory_attributes,
2519 .on_lock = kvm_mmu_invalidate_end,
2520 .may_block = true,
2521 };
2522 unsigned long i;
2523 void *entry;
2524 int r = 0;
2525
2526 entry = attributes ? xa_mk_value(attributes) : NULL;
2527
2528 mutex_lock(&kvm->slots_lock);
2529
2530	/* Nothing to do if the entire range already has the desired attributes. */
2531 if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
2532 goto out_unlock;
2533
2534 /*
2535 * Reserve memory ahead of time to avoid having to deal with failures
2536 * partway through setting the new attributes.
2537 */
2538 for (i = start; i < end; i++) {
2539 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2540 if (r)
2541 goto out_unlock;
2542 }
2543
2544 kvm_handle_gfn_range(kvm, &pre_set_range);
2545
2546 for (i = start; i < end; i++) {
2547 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2548 GFP_KERNEL_ACCOUNT));
2549 KVM_BUG_ON(r, kvm);
2550 }
2551
2552 kvm_handle_gfn_range(kvm, &post_set_range);
2553
2554 out_unlock:
2555 mutex_unlock(&kvm->slots_lock);
2556
2557 return r;
2558 }
2559 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2560 struct kvm_memory_attributes *attrs)
2561 {
2562 gfn_t start, end;
2563
2564 /* flags is currently not used. */
2565 if (attrs->flags)
2566 return -EINVAL;
2567 if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2568 return -EINVAL;
2569 if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2570 return -EINVAL;
2571 if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2572 return -EINVAL;
2573
2574 start = attrs->address >> PAGE_SHIFT;
2575 end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2576
2577 /*
2578 * xarray tracks data using "unsigned long", and as a result so does
2579	 * KVM. For simplicity, generic attributes are supported only on 64-bit
2580	 * architectures.
2581 */
2582 BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2583
2584 return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2585 }
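/*
 * A hedged userspace sketch (compiled out) of flipping a range to private
 * memory, matching the checks above: address and size must be page aligned
 * and flags must be zero. The KVM_SET_MEMORY_ATTRIBUTES ioctl name is
 * assumed from the uapi headers; the fd and range are hypothetical.
 */
#if 0
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_set_private(int vm_fd, __u64 gpa, __u64 size)
{
	struct kvm_memory_attributes attrs = {
		.address = gpa,
		.size = size,
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
		.flags = 0,
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
}
#endif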
2586 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
2587
2588 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2589 {
2590 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2591 }
2592 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2593
2594 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2595 {
2596 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2597 u64 gen = slots->generation;
2598 struct kvm_memory_slot *slot;
2599
2600 /*
2601 * This also protects against using a memslot from a different address space,
2602 * since different address spaces have different generation numbers.
2603 */
2604 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2605 vcpu->last_used_slot = NULL;
2606 vcpu->last_used_slot_gen = gen;
2607 }
2608
2609 slot = try_get_memslot(vcpu->last_used_slot, gfn);
2610 if (slot)
2611 return slot;
2612
2613 /*
2614 * Fall back to searching all memslots. We purposely use
2615 * search_memslots() instead of __gfn_to_memslot() to avoid
2616 * thrashing the VM-wide last_used_slot in kvm_memslots.
2617 */
2618 slot = search_memslots(slots, gfn, false);
2619 if (slot) {
2620 vcpu->last_used_slot = slot;
2621 return slot;
2622 }
2623
2624 return NULL;
2625 }
2626
2627 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2628 {
2629 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2630
2631 return kvm_is_visible_memslot(memslot);
2632 }
2633 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2634
2635 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2636 {
2637 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2638
2639 return kvm_is_visible_memslot(memslot);
2640 }
2641 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2642
2643 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2644 {
2645 struct vm_area_struct *vma;
2646 unsigned long addr, size;
2647
2648 size = PAGE_SIZE;
2649
2650 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2651 if (kvm_is_error_hva(addr))
2652 return PAGE_SIZE;
2653
2654 mmap_read_lock(current->mm);
2655 vma = find_vma(current->mm, addr);
2656 if (!vma)
2657 goto out;
2658
2659 size = vma_kernel_pagesize(vma);
2660
2661 out:
2662 mmap_read_unlock(current->mm);
2663
2664 return size;
2665 }
2666
2667 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2668 {
2669 return slot->flags & KVM_MEM_READONLY;
2670 }
2671
2672 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2673 gfn_t *nr_pages, bool write)
2674 {
2675 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2676 return KVM_HVA_ERR_BAD;
2677
2678 if (memslot_is_readonly(slot) && write)
2679 return KVM_HVA_ERR_RO_BAD;
2680
2681 if (nr_pages)
2682 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2683
2684 return __gfn_to_hva_memslot(slot, gfn);
2685 }
2686
2687 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2688 gfn_t *nr_pages)
2689 {
2690 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2691 }
2692
2693 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2694 gfn_t gfn)
2695 {
2696 return gfn_to_hva_many(slot, gfn, NULL);
2697 }
2698 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2699
2700 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2701 {
2702 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2703 }
2704 EXPORT_SYMBOL_GPL(gfn_to_hva);
2705
2706 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2707 {
2708 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2709 }
2710 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2711
2712 /*
2713 * Return the hva of a @gfn and the R/W attribute if possible.
2714 *
2715 * @slot: the kvm_memory_slot which contains @gfn
2716 * @gfn: the gfn to be translated
2717 * @writable: used to return the read/write attribute of the @slot if the hva
2718 * is valid and @writable is not NULL
2719 */
2720 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2721 gfn_t gfn, bool *writable)
2722 {
2723 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2724
2725 if (!kvm_is_error_hva(hva) && writable)
2726 *writable = !memslot_is_readonly(slot);
2727
2728 return hva;
2729 }
2730
2731 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2732 {
2733 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2734
2735 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2736 }
2737
2738 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2739 {
2740 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2741
2742 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2743 }
2744
2745 static inline int check_user_page_hwpoison(unsigned long addr)
2746 {
2747 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2748
2749 rc = get_user_pages(addr, 1, flags, NULL);
2750 return rc == -EHWPOISON;
2751 }
2752
2753 /*
2754 * The fast path to get the writable pfn which will be stored in @pfn,
2755 * true indicates success, otherwise false is returned. It's also the
2756 * only path that can run in atomic context.
2757 */
2758 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2759 bool *writable, kvm_pfn_t *pfn)
2760 {
2761 struct page *page[1];
2762
2763 /*
2764 * Fast pin a writable pfn only if it is a write fault request
2765	 * or the caller allows mapping a writable pfn for a read fault
2766 * request.
2767 */
2768 if (!(write_fault || writable))
2769 return false;
2770
2771 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2772 *pfn = page_to_pfn(page[0]);
2773
2774 if (writable)
2775 *writable = true;
2776 return true;
2777 }
2778
2779 return false;
2780 }
2781
2782 /*
2783 * The slow path to get the pfn of the specified host virtual address;
2784 * 1 indicates success, -errno is returned if an error is detected.
2785 */
2786 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2787 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2788 {
2789 /*
2790 * When a VCPU accesses a page that is not mapped into the secondary
2791 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2792 * make progress. We always want to honor NUMA hinting faults in that
2793 * case, because GUP usage corresponds to memory accesses from the VCPU.
2794 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2795 * mapped into the secondary MMU and gets accessed by a VCPU.
2796 *
2797 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2798 * implicitly honor NUMA hinting faults and don't need this flag.
2799 */
2800 unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2801 struct page *page;
2802 int npages;
2803
2804 might_sleep();
2805
2806 if (writable)
2807 *writable = write_fault;
2808
2809 if (write_fault)
2810 flags |= FOLL_WRITE;
2811 if (async)
2812 flags |= FOLL_NOWAIT;
2813 if (interruptible)
2814 flags |= FOLL_INTERRUPTIBLE;
2815
2816 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2817 if (npages != 1)
2818 return npages;
2819
2820 /* map read fault as writable if possible */
2821 if (unlikely(!write_fault) && writable) {
2822 struct page *wpage;
2823
2824 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2825 *writable = true;
2826 put_page(page);
2827 page = wpage;
2828 }
2829 }
2830 *pfn = page_to_pfn(page);
2831 return npages;
2832 }
2833
2834 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2835 {
2836 if (unlikely(!(vma->vm_flags & VM_READ)))
2837 return false;
2838
2839 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2840 return false;
2841
2842 return true;
2843 }
2844
2845 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2846 {
2847 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2848
2849 if (!page)
2850 return 1;
2851
2852 return get_page_unless_zero(page);
2853 }
2854
2855 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2856 unsigned long addr, bool write_fault,
2857 bool *writable, kvm_pfn_t *p_pfn)
2858 {
2859 kvm_pfn_t pfn;
2860 pte_t *ptep;
2861 pte_t pte;
2862 spinlock_t *ptl;
2863 int r;
2864
2865 r = follow_pte(vma, addr, &ptep, &ptl);
2866 if (r) {
2867 /*
2868 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2869 * not call the fault handler, so do it here.
2870 */
2871 bool unlocked = false;
2872 r = fixup_user_fault(current->mm, addr,
2873 (write_fault ? FAULT_FLAG_WRITE : 0),
2874 &unlocked);
2875 if (unlocked)
2876 return -EAGAIN;
2877 if (r)
2878 return r;
2879
2880 r = follow_pte(vma, addr, &ptep, &ptl);
2881 if (r)
2882 return r;
2883 }
2884
2885 pte = ptep_get(ptep);
2886
2887 if (write_fault && !pte_write(pte)) {
2888 pfn = KVM_PFN_ERR_RO_FAULT;
2889 goto out;
2890 }
2891
2892 if (writable)
2893 *writable = pte_write(pte);
2894 pfn = pte_pfn(pte);
2895
2896 /*
2897 * Get a reference here because callers of *hva_to_pfn* and
2898 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2899 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
2900 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2901 * simply do nothing for reserved pfns.
2902 *
2903 * Whoever called remap_pfn_range is also going to call e.g.
2904 * unmap_mapping_range before the underlying pages are freed,
2905 * causing a call to our MMU notifier.
2906 *
2907 * Certain IO or PFNMAP mappings can be backed with valid
2908 * struct pages, but be allocated without refcounting e.g.,
2909 * tail pages of non-compound higher order allocations, which
2910 * would then underflow the refcount when the caller does the
2911 * required put_page. Don't allow those pages here.
2912 */
2913 if (!kvm_try_get_pfn(pfn))
2914 r = -EFAULT;
2915
2916 out:
2917 pte_unmap_unlock(ptep, ptl);
2918 *p_pfn = pfn;
2919
2920 return r;
2921 }
2922
2923 /*
2924 * Pin guest page in memory and return its pfn.
2925 * @addr: host virtual address which maps memory to the guest
2926 * @atomic: whether this function is forbidden from sleeping
2927 * @interruptible: whether the process can be interrupted by non-fatal signals
2928 * @async: whether this function needs to wait for IO to complete if the
2929 * host page is not in memory
2930 * @write_fault: whether we should get a writable host page
2931 * @writable: whether to allow mapping a writable host page for !@write_fault
2932 *
2933 * The function will map a writable host page for these two cases:
2934 * 1): @write_fault = true
2935 * 2): @write_fault = false && @writable, @writable will tell the caller
2936 * whether the mapping is writable.
2937 */
2938 kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2939 bool *async, bool write_fault, bool *writable)
2940 {
2941 struct vm_area_struct *vma;
2942 kvm_pfn_t pfn;
2943 int npages, r;
2944
2945 /* we can do it either atomically or asynchronously, not both */
2946 BUG_ON(atomic && async);
2947
2948 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2949 return pfn;
2950
2951 if (atomic)
2952 return KVM_PFN_ERR_FAULT;
2953
2954 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2955 writable, &pfn);
2956 if (npages == 1)
2957 return pfn;
2958 if (npages == -EINTR)
2959 return KVM_PFN_ERR_SIGPENDING;
2960
2961 mmap_read_lock(current->mm);
2962 if (npages == -EHWPOISON ||
2963 (!async && check_user_page_hwpoison(addr))) {
2964 pfn = KVM_PFN_ERR_HWPOISON;
2965 goto exit;
2966 }
2967
2968 retry:
2969 vma = vma_lookup(current->mm, addr);
2970
2971 if (vma == NULL)
2972 pfn = KVM_PFN_ERR_FAULT;
2973 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2974 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2975 if (r == -EAGAIN)
2976 goto retry;
2977 if (r < 0)
2978 pfn = KVM_PFN_ERR_FAULT;
2979 } else {
2980 if (async && vma_is_valid(vma, write_fault))
2981 *async = true;
2982 pfn = KVM_PFN_ERR_FAULT;
2983 }
2984 exit:
2985 mmap_read_unlock(current->mm);
2986 return pfn;
2987 }
2988
2989 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2990 bool atomic, bool interruptible, bool *async,
2991 bool write_fault, bool *writable, hva_t *hva)
2992 {
2993 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2994
2995 if (hva)
2996 *hva = addr;
2997
2998 if (kvm_is_error_hva(addr)) {
2999 if (writable)
3000 *writable = false;
3001
3002 return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
3003 KVM_PFN_NOSLOT;
3004 }
3005
3006 /* Do not map writable pfn in the readonly memslot. */
3007 if (writable && memslot_is_readonly(slot)) {
3008 *writable = false;
3009 writable = NULL;
3010 }
3011
3012 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
3013 writable);
3014 }
3015 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
3016
3017 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
3018 bool *writable)
3019 {
3020 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
3021 NULL, write_fault, writable, NULL);
3022 }
3023 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
3024
3025 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
3026 {
3027 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
3028 NULL, NULL);
3029 }
3030 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
3031
3032 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
3033 {
3034 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
3035 NULL, NULL);
3036 }
3037 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
3038
3039 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
3040 {
3041 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3042 }
3043 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
3044
3045 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
3046 {
3047 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
3048 }
3049 EXPORT_SYMBOL_GPL(gfn_to_pfn);
3050
3051 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
3052 {
3053 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3054 }
3055 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
3056
3057 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3058 struct page **pages, int nr_pages)
3059 {
3060 unsigned long addr;
3061 gfn_t entry = 0;
3062
3063 addr = gfn_to_hva_many(slot, gfn, &entry);
3064 if (kvm_is_error_hva(addr))
3065 return -1;
3066
3067 if (entry < nr_pages)
3068 return 0;
3069
3070 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
3071 }
3072 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
3073
3074 /*
3075 * Do not use this helper unless you are absolutely certain the gfn _must_ be
3076 * backed by 'struct page'. A valid example is if the backing memslot is
3077 * controlled by KVM. Note, if the returned page is valid, its refcount has
3078 * been elevated by gfn_to_pfn().
3079 */
3080 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
3081 {
3082 struct page *page;
3083 kvm_pfn_t pfn;
3084
3085 pfn = gfn_to_pfn(kvm, gfn);
3086
3087 if (is_error_noslot_pfn(pfn))
3088 return KVM_ERR_PTR_BAD_PAGE;
3089
3090 page = kvm_pfn_to_refcounted_page(pfn);
3091 if (!page)
3092 return KVM_ERR_PTR_BAD_PAGE;
3093
3094 return page;
3095 }
3096 EXPORT_SYMBOL_GPL(gfn_to_page);
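/*
 * A minimal in-kernel usage sketch (compiled out): pairing gfn_to_page()
 * with kvm_release_page_dirty() so the refcount taken via gfn_to_pfn() is
 * dropped. The gfn and the access through @va are hypothetical.
 */
#if 0
	struct page *page = gfn_to_page(kvm, gfn);

	if (!is_error_page(page)) {
		void *va = kmap_local_page(page);

		/* ... read or write guest data through @va ... */
		kunmap_local(va);
		kvm_release_page_dirty(page);	/* drops the elevated refcount */
	}
#endif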
3097
3098 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
3099 {
3100 if (dirty)
3101 kvm_release_pfn_dirty(pfn);
3102 else
3103 kvm_release_pfn_clean(pfn);
3104 }
3105
3106 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
3107 {
3108 kvm_pfn_t pfn;
3109 void *hva = NULL;
3110 struct page *page = KVM_UNMAPPED_PAGE;
3111
3112 if (!map)
3113 return -EINVAL;
3114
3115 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3116 if (is_error_noslot_pfn(pfn))
3117 return -EINVAL;
3118
3119 if (pfn_valid(pfn)) {
3120 page = pfn_to_page(pfn);
3121 hva = kmap(page);
3122 #ifdef CONFIG_HAS_IOMEM
3123 } else {
3124 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
3125 #endif
3126 }
3127
3128 if (!hva)
3129 return -EFAULT;
3130
3131 map->page = page;
3132 map->hva = hva;
3133 map->pfn = pfn;
3134 map->gfn = gfn;
3135
3136 return 0;
3137 }
3138 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
3139
3140 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
3141 {
3142 if (!map)
3143 return;
3144
3145 if (!map->hva)
3146 return;
3147
3148 if (map->page != KVM_UNMAPPED_PAGE)
3149 kunmap(map->page);
3150 #ifdef CONFIG_HAS_IOMEM
3151 else
3152 memunmap(map->hva);
3153 #endif
3154
3155 if (dirty)
3156 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
3157
3158 kvm_release_pfn(map->pfn, dirty);
3159
3160 map->hva = NULL;
3161 map->page = NULL;
3162 }
3163 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
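/*
 * A minimal in-kernel usage sketch (compiled out) of the map/unmap pair
 * above; it works for both page-backed pfns and, with CONFIG_HAS_IOMEM,
 * remapped pfns. The gfn, offset and payload are hypothetical.
 */
#if 0
	struct kvm_host_map map;

	if (!kvm_vcpu_map(vcpu, gfn, &map)) {
		memcpy(map.hva + offset, payload, len);	/* access the guest page */
		kvm_vcpu_unmap(vcpu, &map, true);	/* dirty == true marks the page */
	}
#endif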
3164
3165 static bool kvm_is_ad_tracked_page(struct page *page)
3166 {
3167 /*
3168 * Per page-flags.h, pages tagged PG_reserved "should in general not be
3169 * touched (e.g. set dirty) except by its owner".
3170 */
3171 return !PageReserved(page);
3172 }
3173
3174 static void kvm_set_page_dirty(struct page *page)
3175 {
3176 if (kvm_is_ad_tracked_page(page))
3177 SetPageDirty(page);
3178 }
3179
3180 static void kvm_set_page_accessed(struct page *page)
3181 {
3182 if (kvm_is_ad_tracked_page(page))
3183 mark_page_accessed(page);
3184 }
3185
3186 void kvm_release_page_clean(struct page *page)
3187 {
3188 WARN_ON(is_error_page(page));
3189
3190 kvm_set_page_accessed(page);
3191 put_page(page);
3192 }
3193 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
3194
3195 void kvm_release_pfn_clean(kvm_pfn_t pfn)
3196 {
3197 struct page *page;
3198
3199 if (is_error_noslot_pfn(pfn))
3200 return;
3201
3202 page = kvm_pfn_to_refcounted_page(pfn);
3203 if (!page)
3204 return;
3205
3206 kvm_release_page_clean(page);
3207 }
3208 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3209
3210 void kvm_release_page_dirty(struct page *page)
3211 {
3212 WARN_ON(is_error_page(page));
3213
3214 kvm_set_page_dirty(page);
3215 kvm_release_page_clean(page);
3216 }
3217 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3218
3219 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
3220 {
3221 struct page *page;
3222
3223 if (is_error_noslot_pfn(pfn))
3224 return;
3225
3226 page = kvm_pfn_to_refcounted_page(pfn);
3227 if (!page)
3228 return;
3229
3230 kvm_release_page_dirty(page);
3231 }
3232 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
3233
3234 /*
3235 * Note, checking for an error/noslot pfn is the caller's responsibility when
3236 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
3237 * "set" helpers are not to be used when the pfn might point at garbage.
3238 */
3239 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3240 {
3241 if (WARN_ON(is_error_noslot_pfn(pfn)))
3242 return;
3243
3244 if (pfn_valid(pfn))
3245 kvm_set_page_dirty(pfn_to_page(pfn));
3246 }
3247 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3248
3249 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3250 {
3251 if (WARN_ON(is_error_noslot_pfn(pfn)))
3252 return;
3253
3254 if (pfn_valid(pfn))
3255 kvm_set_page_accessed(pfn_to_page(pfn));
3256 }
3257 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3258
3259 static int next_segment(unsigned long len, int offset)
3260 {
3261 if (len > PAGE_SIZE - offset)
3262 return PAGE_SIZE - offset;
3263 else
3264 return len;
3265 }
3266
3267 /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
3268 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3269 void *data, int offset, int len)
3270 {
3271 int r;
3272 unsigned long addr;
3273
3274 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3275 if (kvm_is_error_hva(addr))
3276 return -EFAULT;
3277 r = __copy_from_user(data, (void __user *)addr + offset, len);
3278 if (r)
3279 return -EFAULT;
3280 return 0;
3281 }
3282
3283 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3284 int len)
3285 {
3286 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3287
3288 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3289 }
3290 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3291
3292 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3293 int offset, int len)
3294 {
3295 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3296
3297 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3298 }
3299 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3300
3301 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3302 {
3303 gfn_t gfn = gpa >> PAGE_SHIFT;
3304 int seg;
3305 int offset = offset_in_page(gpa);
3306 int ret;
3307
3308 while ((seg = next_segment(len, offset)) != 0) {
3309 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3310 if (ret < 0)
3311 return ret;
3312 offset = 0;
3313 len -= seg;
3314 data += seg;
3315 ++gfn;
3316 }
3317 return 0;
3318 }
3319 EXPORT_SYMBOL_GPL(kvm_read_guest);
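/*
 * A worked example (compiled out) of the page-splitting loop above, assuming
 * PAGE_SIZE == 4096: a 5000-byte read starting at page offset 3000 is issued
 * as two per-page copies. The gpa and buffer are hypothetical.
 */
#if 0
	char buf[5000];

	/*
	 * gpa 0x1bb8 has offset_in_page() == 3000, so next_segment() yields
	 * 1096 bytes from the first page, then 3904 from the second page.
	 */
	ret = kvm_read_guest(kvm, 0x1bb8, buf, sizeof(buf));
#endif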
3320
3321 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3322 {
3323 gfn_t gfn = gpa >> PAGE_SHIFT;
3324 int seg;
3325 int offset = offset_in_page(gpa);
3326 int ret;
3327
3328 while ((seg = next_segment(len, offset)) != 0) {
3329 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3330 if (ret < 0)
3331 return ret;
3332 offset = 0;
3333 len -= seg;
3334 data += seg;
3335 ++gfn;
3336 }
3337 return 0;
3338 }
3339 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3340
3341 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3342 void *data, int offset, unsigned long len)
3343 {
3344 int r;
3345 unsigned long addr;
3346
3347 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3348 if (kvm_is_error_hva(addr))
3349 return -EFAULT;
3350 pagefault_disable();
3351 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3352 pagefault_enable();
3353 if (r)
3354 return -EFAULT;
3355 return 0;
3356 }
3357
3358 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3359 void *data, unsigned long len)
3360 {
3361 gfn_t gfn = gpa >> PAGE_SHIFT;
3362 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3363 int offset = offset_in_page(gpa);
3364
3365 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3366 }
3367 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3368
3369 /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
3370 static int __kvm_write_guest_page(struct kvm *kvm,
3371 struct kvm_memory_slot *memslot, gfn_t gfn,
3372 const void *data, int offset, int len)
3373 {
3374 int r;
3375 unsigned long addr;
3376
3377 addr = gfn_to_hva_memslot(memslot, gfn);
3378 if (kvm_is_error_hva(addr))
3379 return -EFAULT;
3380 r = __copy_to_user((void __user *)addr + offset, data, len);
3381 if (r)
3382 return -EFAULT;
3383 mark_page_dirty_in_slot(kvm, memslot, gfn);
3384 return 0;
3385 }
3386
3387 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3388 const void *data, int offset, int len)
3389 {
3390 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3391
3392 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3393 }
3394 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3395
3396 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3397 const void *data, int offset, int len)
3398 {
3399 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3400
3401 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3402 }
3403 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3404
3405 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3406 unsigned long len)
3407 {
3408 gfn_t gfn = gpa >> PAGE_SHIFT;
3409 int seg;
3410 int offset = offset_in_page(gpa);
3411 int ret;
3412
3413 while ((seg = next_segment(len, offset)) != 0) {
3414 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3415 if (ret < 0)
3416 return ret;
3417 offset = 0;
3418 len -= seg;
3419 data += seg;
3420 ++gfn;
3421 }
3422 return 0;
3423 }
3424 EXPORT_SYMBOL_GPL(kvm_write_guest);
3425
3426 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3427 unsigned long len)
3428 {
3429 gfn_t gfn = gpa >> PAGE_SHIFT;
3430 int seg;
3431 int offset = offset_in_page(gpa);
3432 int ret;
3433
3434 while ((seg = next_segment(len, offset)) != 0) {
3435 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3436 if (ret < 0)
3437 return ret;
3438 offset = 0;
3439 len -= seg;
3440 data += seg;
3441 ++gfn;
3442 }
3443 return 0;
3444 }
3445 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3446
3447 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3448 struct gfn_to_hva_cache *ghc,
3449 gpa_t gpa, unsigned long len)
3450 {
3451 int offset = offset_in_page(gpa);
3452 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3453 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3454 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3455 gfn_t nr_pages_avail;
3456
3457 /* Update ghc->generation before performing any error checks. */
3458 ghc->generation = slots->generation;
3459
3460 if (start_gfn > end_gfn) {
3461 ghc->hva = KVM_HVA_ERR_BAD;
3462 return -EINVAL;
3463 }
3464
3465 /*
3466 * If the requested region crosses two memslots, we still
3467 * verify that the entire region is valid here.
3468 */
3469 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3470 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3471 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3472 &nr_pages_avail);
3473 if (kvm_is_error_hva(ghc->hva))
3474 return -EFAULT;
3475 }
3476
3477 /* Use the slow path for cross page reads and writes. */
3478 if (nr_pages_needed == 1)
3479 ghc->hva += offset;
3480 else
3481 ghc->memslot = NULL;
3482
3483 ghc->gpa = gpa;
3484 ghc->len = len;
3485 return 0;
3486 }
3487
3488 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3489 gpa_t gpa, unsigned long len)
3490 {
3491 struct kvm_memslots *slots = kvm_memslots(kvm);
3492 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3493 }
3494 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
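/*
 * Typical usage of the gfn_to_hva_cache helpers (sketch, not taken from a
 * specific caller): initialize the cache once for a fixed guest buffer, then
 * use the cached accessors on the hot path so the gfn->hva translation is
 * only redone when the memslot generation changes:
 *
 *	struct gfn_to_hva_cache ghc;
 *
 *	kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val));
 *	...
 *	kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */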
3495
3496 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3497 void *data, unsigned int offset,
3498 unsigned long len)
3499 {
3500 struct kvm_memslots *slots = kvm_memslots(kvm);
3501 int r;
3502 gpa_t gpa = ghc->gpa + offset;
3503
3504 if (WARN_ON_ONCE(len + offset > ghc->len))
3505 return -EINVAL;
3506
3507 if (slots->generation != ghc->generation) {
3508 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3509 return -EFAULT;
3510 }
3511
3512 if (kvm_is_error_hva(ghc->hva))
3513 return -EFAULT;
3514
3515 if (unlikely(!ghc->memslot))
3516 return kvm_write_guest(kvm, gpa, data, len);
3517
3518 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3519 if (r)
3520 return -EFAULT;
3521 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3522
3523 return 0;
3524 }
3525 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3526
3527 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3528 void *data, unsigned long len)
3529 {
3530 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3531 }
3532 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3533
3534 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3535 void *data, unsigned int offset,
3536 unsigned long len)
3537 {
3538 struct kvm_memslots *slots = kvm_memslots(kvm);
3539 int r;
3540 gpa_t gpa = ghc->gpa + offset;
3541
3542 if (WARN_ON_ONCE(len + offset > ghc->len))
3543 return -EINVAL;
3544
3545 if (slots->generation != ghc->generation) {
3546 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3547 return -EFAULT;
3548 }
3549
3550 if (kvm_is_error_hva(ghc->hva))
3551 return -EFAULT;
3552
3553 if (unlikely(!ghc->memslot))
3554 return kvm_read_guest(kvm, gpa, data, len);
3555
3556 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3557 if (r)
3558 return -EFAULT;
3559
3560 return 0;
3561 }
3562 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3563
3564 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3565 void *data, unsigned long len)
3566 {
3567 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3568 }
3569 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3570
3571 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3572 {
3573 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3574 gfn_t gfn = gpa >> PAGE_SHIFT;
3575 int seg;
3576 int offset = offset_in_page(gpa);
3577 int ret;
3578
3579 while ((seg = next_segment(len, offset)) != 0) {
3580 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3581 if (ret < 0)
3582 return ret;
3583 offset = 0;
3584 len -= seg;
3585 ++gfn;
3586 }
3587 return 0;
3588 }
3589 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3590
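/*
 * Dirty tracking fans out below: when the dirty ring is enabled and the write
 * happens in vCPU context, the (slot, rel_gfn) pair is pushed onto that
 * vCPU's dirty ring; otherwise the legacy per-memslot dirty bitmap is used,
 * provided the slot has dirty logging enabled at all.
 */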
3591 void mark_page_dirty_in_slot(struct kvm *kvm,
3592 const struct kvm_memory_slot *memslot,
3593 gfn_t gfn)
3594 {
3595 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3596
3597 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3598 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3599 return;
3600
3601 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3602 #endif
3603
3604 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3605 unsigned long rel_gfn = gfn - memslot->base_gfn;
3606 u32 slot = (memslot->as_id << 16) | memslot->id;
3607
3608 if (kvm->dirty_ring_size && vcpu)
3609 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3610 else if (memslot->dirty_bitmap)
3611 set_bit_le(rel_gfn, memslot->dirty_bitmap);
3612 }
3613 }
3614 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3615
3616 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3617 {
3618 struct kvm_memory_slot *memslot;
3619
3620 memslot = gfn_to_memslot(kvm, gfn);
3621 mark_page_dirty_in_slot(kvm, memslot, gfn);
3622 }
3623 EXPORT_SYMBOL_GPL(mark_page_dirty);
3624
3625 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3626 {
3627 struct kvm_memory_slot *memslot;
3628
3629 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3630 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3631 }
3632 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3633
3634 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3635 {
3636 if (!vcpu->sigset_active)
3637 return;
3638
3639 /*
3640 * This does a lockless modification of ->real_blocked, which is fine
3641 * because only current can change ->real_blocked and all readers of
3642 * ->real_blocked don't care as long as ->real_blocked is always a subset
3643 * of ->blocked.
3644 */
3645 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3646 }
3647
3648 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3649 {
3650 if (!vcpu->sigset_active)
3651 return;
3652
3653 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3654 sigemptyset(&current->real_blocked);
3655 }
3656
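/*
 * Worked example (hypothetical parameter values): with halt_poll_ns_grow == 2
 * and halt_poll_ns_grow_start == 10000, a vCPU's poll window grows
 * 0 -> 10000 -> 20000 -> 40000 ns across successive grows and is clamped to
 * the VM's maximum in kvm_vcpu_halt(). A shrink divides the window by
 * halt_poll_ns_shrink (or zeroes it if that parameter is 0, or if the result
 * drops below the grow start).
 */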
3657 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3658 {
3659 unsigned int old, val, grow, grow_start;
3660
3661 old = val = vcpu->halt_poll_ns;
3662 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3663 grow = READ_ONCE(halt_poll_ns_grow);
3664 if (!grow)
3665 goto out;
3666
3667 val *= grow;
3668 if (val < grow_start)
3669 val = grow_start;
3670
3671 vcpu->halt_poll_ns = val;
3672 out:
3673 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3674 }
3675
3676 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3677 {
3678 unsigned int old, val, shrink, grow_start;
3679
3680 old = val = vcpu->halt_poll_ns;
3681 shrink = READ_ONCE(halt_poll_ns_shrink);
3682 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3683 if (shrink == 0)
3684 val = 0;
3685 else
3686 val /= shrink;
3687
3688 if (val < grow_start)
3689 val = 0;
3690
3691 vcpu->halt_poll_ns = val;
3692 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3693 }
3694
3695 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3696 {
3697 int ret = -EINTR;
3698 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3699
3700 if (kvm_arch_vcpu_runnable(vcpu))
3701 goto out;
3702 if (kvm_cpu_has_pending_timer(vcpu))
3703 goto out;
3704 if (signal_pending(current))
3705 goto out;
3706 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3707 goto out;
3708
3709 ret = 0;
3710 out:
3711 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3712 return ret;
3713 }
3714
3715 /*
3716 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3717 * pending. This is mostly used when halting a vCPU, but may also be used
3718 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3719 */
3720 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3721 {
3722 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3723 bool waited = false;
3724
3725 vcpu->stat.generic.blocking = 1;
3726
3727 preempt_disable();
3728 kvm_arch_vcpu_blocking(vcpu);
3729 prepare_to_rcuwait(wait);
3730 preempt_enable();
3731
3732 for (;;) {
3733 set_current_state(TASK_INTERRUPTIBLE);
3734
3735 if (kvm_vcpu_check_block(vcpu) < 0)
3736 break;
3737
3738 waited = true;
3739 schedule();
3740 }
3741
3742 preempt_disable();
3743 finish_rcuwait(wait);
3744 kvm_arch_vcpu_unblocking(vcpu);
3745 preempt_enable();
3746
3747 vcpu->stat.generic.blocking = 0;
3748
3749 return waited;
3750 }
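/*
 * The loop above is the usual wait pattern: mark the task TASK_INTERRUPTIBLE,
 * re-check the wake conditions via kvm_vcpu_check_block(), and only then
 * schedule(). The return value tells the caller whether the vCPU actually
 * went to sleep, which kvm_vcpu_halt() uses to decide whether halt polling
 * "succeeded".
 */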
3751
3752 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3753 ktime_t end, bool success)
3754 {
3755 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3756 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3757
3758 ++vcpu->stat.generic.halt_attempted_poll;
3759
3760 if (success) {
3761 ++vcpu->stat.generic.halt_successful_poll;
3762
3763 if (!vcpu_valid_wakeup(vcpu))
3764 ++vcpu->stat.generic.halt_poll_invalid;
3765
3766 stats->halt_poll_success_ns += poll_ns;
3767 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3768 } else {
3769 stats->halt_poll_fail_ns += poll_ns;
3770 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3771 }
3772 }
3773
3774 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3775 {
3776 struct kvm *kvm = vcpu->kvm;
3777
3778 if (kvm->override_halt_poll_ns) {
3779 /*
3780 * Ensure kvm->max_halt_poll_ns is not read before
3781 * kvm->override_halt_poll_ns.
3782 *
3783 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3784 */
3785 smp_rmb();
3786 return READ_ONCE(kvm->max_halt_poll_ns);
3787 }
3788
3789 return READ_ONCE(halt_poll_ns);
3790 }
3791
3792 /*
3793 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3794 * polling is enabled, busy wait for a short time before blocking to avoid the
3795 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3796 * is halted.
3797 */
3798 void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3799 {
3800 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3801 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3802 ktime_t start, cur, poll_end;
3803 bool waited = false;
3804 bool do_halt_poll;
3805 u64 halt_ns;
3806
3807 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3808 vcpu->halt_poll_ns = max_halt_poll_ns;
3809
3810 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3811
3812 start = cur = poll_end = ktime_get();
3813 if (do_halt_poll) {
3814 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3815
3816 do {
3817 if (kvm_vcpu_check_block(vcpu) < 0)
3818 goto out;
3819 cpu_relax();
3820 poll_end = cur = ktime_get();
3821 } while (kvm_vcpu_can_poll(cur, stop));
3822 }
3823
3824 waited = kvm_vcpu_block(vcpu);
3825
3826 cur = ktime_get();
3827 if (waited) {
3828 vcpu->stat.generic.halt_wait_ns +=
3829 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3830 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3831 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3832 }
3833 out:
3834 /* The total time the vCPU was "halted", including polling time. */
3835 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3836
3837 /*
3838 * Note, halt-polling is considered successful so long as the vCPU was
3839 * never actually scheduled out, i.e. even if the wake event arrived
3840 * after the end of the halt-polling loop itself, but before the full wait.
3841 */
3842 if (do_halt_poll)
3843 update_halt_poll_stats(vcpu, start, poll_end, !waited);
3844
3845 if (halt_poll_allowed) {
3846 /* Recompute the max halt poll time in case it changed. */
3847 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3848
3849 if (!vcpu_valid_wakeup(vcpu)) {
3850 shrink_halt_poll_ns(vcpu);
3851 } else if (max_halt_poll_ns) {
3852 if (halt_ns <= vcpu->halt_poll_ns)
3853 ;
3854 /* we had a long block, shrink polling */
3855 else if (vcpu->halt_poll_ns &&
3856 halt_ns > max_halt_poll_ns)
3857 shrink_halt_poll_ns(vcpu);
3858 /* we had a short halt and our poll time is too small */
3859 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3860 halt_ns < max_halt_poll_ns)
3861 grow_halt_poll_ns(vcpu);
3862 } else {
3863 vcpu->halt_poll_ns = 0;
3864 }
3865 }
3866
3867 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3868 }
3869 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
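/*
 * Summary of the adaptive policy above: the poll window shrinks when the
 * wakeup was spurious or the vCPU blocked far longer than the allowed
 * maximum, grows when the halt completed quickly but the window was smaller
 * than the maximum, and is reset to zero when halt polling is disabled for
 * the VM (a maximum of 0).
 */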
3870
3871 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3872 {
3873 if (__kvm_vcpu_wake_up(vcpu)) {
3874 WRITE_ONCE(vcpu->ready, true);
3875 ++vcpu->stat.generic.halt_wakeup;
3876 return true;
3877 }
3878
3879 return false;
3880 }
3881 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3882
3883 #ifndef CONFIG_S390
3884 /*
3885 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3886 */
3887 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3888 {
3889 int me, cpu;
3890
3891 if (kvm_vcpu_wake_up(vcpu))
3892 return;
3893
3894 me = get_cpu();
3895 /*
3896 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3897 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3898 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3899 * within the vCPU thread itself.
3900 */
3901 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3902 if (vcpu->mode == IN_GUEST_MODE)
3903 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3904 goto out;
3905 }
3906
3907 /*
3908 * Note, the vCPU could get migrated to a different pCPU at any point
3909 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3910 * IPI to the previous pCPU. But, that's ok because the purpose of the
3911 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3912 * vCPU also requires it to leave IN_GUEST_MODE.
3913 */
3914 if (kvm_arch_vcpu_should_kick(vcpu)) {
3915 cpu = READ_ONCE(vcpu->cpu);
3916 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3917 smp_send_reschedule(cpu);
3918 }
3919 out:
3920 put_cpu();
3921 }
3922 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3923 #endif /* !CONFIG_S390 */
3924
3925 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3926 {
3927 struct pid *pid;
3928 struct task_struct *task = NULL;
3929 int ret = 0;
3930
3931 rcu_read_lock();
3932 pid = rcu_dereference(target->pid);
3933 if (pid)
3934 task = get_pid_task(pid, PIDTYPE_PID);
3935 rcu_read_unlock();
3936 if (!task)
3937 return ret;
3938 ret = yield_to(task, 1);
3939 put_task_struct(task);
3940
3941 return ret;
3942 }
3943 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3944
3945 /*
3946 * Helper that checks whether a VCPU is eligible for directed yield.
3947 * The most eligible candidate to yield to is decided by the following heuristics:
3948 *
3949 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3950 * (preempted lock holder), indicated by @in_spin_loop.
3951 * Set at the beginning and cleared at the end of interception/PLE handler.
3952 *
3953 * (b) VCPU which has done pl-exit/cpu relax intercepted but did not get a
3954 * chance last time (mostly it has become eligible now since we have probably
3955 * yielded to the lockholder in the last iteration. This is done by toggling
3956 * @dy_eligible each time a VCPU is checked for eligibility.)
3957 *
3958 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3959 * to a preempted lock-holder could result in wrong VCPU selection and CPU
3960 * burning. Giving priority to a potential lock-holder increases lock
3961 * progress.
3962 *
3963 * Since the algorithm is based on heuristics, accessing another VCPU's data
3964 * without locking does no harm. It may result in trying to yield to the same
3965 * VCPU, failing, and continuing with the next VCPU, and so on.
3966 */
3967 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3968 {
3969 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3970 bool eligible;
3971
3972 eligible = !vcpu->spin_loop.in_spin_loop ||
3973 vcpu->spin_loop.dy_eligible;
3974
3975 if (vcpu->spin_loop.in_spin_loop)
3976 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3977
3978 return eligible;
3979 #else
3980 return true;
3981 #endif
3982 }
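/*
 * Example of the toggle above: a spinning vCPU whose dy_eligible flag is
 * clear is skipped the first time it is considered as a yield target, but the
 * check flips the flag, so the same vCPU becomes eligible on the next
 * directed-yield attempt.
 */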
3983
3984 /*
3985 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3986 * a vcpu_load/vcpu_put pair. However, for most architectures
3987 * kvm_arch_vcpu_runnable does not require vcpu_load.
3988 */
3989 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3990 {
3991 return kvm_arch_vcpu_runnable(vcpu);
3992 }
3993
3994 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3995 {
3996 if (kvm_arch_dy_runnable(vcpu))
3997 return true;
3998
3999 #ifdef CONFIG_KVM_ASYNC_PF
4000 if (!list_empty_careful(&vcpu->async_pf.done))
4001 return true;
4002 #endif
4003
4004 return false;
4005 }
4006
4007 /*
4008 * By default, simply query the target vCPU's current mode when checking if a
4009 * vCPU was preempted in kernel mode. All architectures except x86 (or more
4010 * specifically, except VMX) allow querying whether or not a vCPU is in kernel
4011 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
4012 * directly for cross-vCPU checks is functionally correct and accurate.
4013 */
4014 bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
4015 {
4016 return kvm_arch_vcpu_in_kernel(vcpu);
4017 }
4018
4019 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
4020 {
4021 return false;
4022 }
4023
4024 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
4025 {
4026 struct kvm *kvm = me->kvm;
4027 struct kvm_vcpu *vcpu;
4028 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
4029 unsigned long i;
4030 int yielded = 0;
4031 int try = 3;
4032 int pass;
4033
4034 kvm_vcpu_set_in_spin_loop(me, true);
4035 /*
4036 * We boost the priority of a VCPU that is runnable but not
4037 * currently running, because it got preempted by something
4038 * else and called schedule in __vcpu_run. Hopefully that
4039 * VCPU is holding the lock that we need and will release it.
4040 * We approximate round-robin by starting at the last boosted VCPU.
4041 */
4042 for (pass = 0; pass < 2 && !yielded && try; pass++) {
4043 kvm_for_each_vcpu(i, vcpu, kvm) {
4044 if (!pass && i <= last_boosted_vcpu) {
4045 i = last_boosted_vcpu;
4046 continue;
4047 } else if (pass && i > last_boosted_vcpu)
4048 break;
4049 if (!READ_ONCE(vcpu->ready))
4050 continue;
4051 if (vcpu == me)
4052 continue;
4053 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
4054 continue;
4055
4056 /*
4057 * Treat the target vCPU as being in-kernel if it has a
4058 * pending interrupt, as the vCPU trying to yield may
4059 * be spinning waiting on IPI delivery, i.e. the target
4060 * vCPU is in-kernel for the purposes of directed yield.
4061 */
4062 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
4063 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
4064 !kvm_arch_vcpu_preempted_in_kernel(vcpu))
4065 continue;
4066 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4067 continue;
4068
4069 yielded = kvm_vcpu_yield_to(vcpu);
4070 if (yielded > 0) {
4071 kvm->last_boosted_vcpu = i;
4072 break;
4073 } else if (yielded < 0) {
4074 try--;
4075 if (!try)
4076 break;
4077 }
4078 }
4079 }
4080 kvm_vcpu_set_in_spin_loop(me, false);
4081
4082 /* Ensure vcpu is not eligible during next spinloop */
4083 kvm_vcpu_set_dy_eligible(me, false);
4084 }
4085 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
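/*
 * The two passes above approximate round robin: pass 0 scans only the vCPUs
 * after the previously boosted one, and pass 1 wraps around to cover the
 * remaining vCPUs up to and including it, so repeated PLE exits spread the
 * boost across the VM instead of always favouring low-numbered vCPUs.
 */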
4086
4087 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4088 {
4089 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
4090 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4091 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4092 kvm->dirty_ring_size / PAGE_SIZE);
4093 #else
4094 return false;
4095 #endif
4096 }
4097
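/*
 * Layout of the vCPU mmap area served by the fault handler below: page 0 is
 * the shared kvm_run structure, KVM_PIO_PAGE_OFFSET (x86) maps the PIO data
 * page, KVM_COALESCED_MMIO_PAGE_OFFSET maps the coalesced MMIO ring, and the
 * pages starting at KVM_DIRTY_LOG_PAGE_OFFSET back the per-vCPU dirty ring.
 * kvm_vcpu_mmap() additionally refuses executable or non-shared mappings of
 * the dirty ring pages.
 */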
4098 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
4099 {
4100 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
4101 struct page *page;
4102
4103 if (vmf->pgoff == 0)
4104 page = virt_to_page(vcpu->run);
4105 #ifdef CONFIG_X86
4106 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
4107 page = virt_to_page(vcpu->arch.pio_data);
4108 #endif
4109 #ifdef CONFIG_KVM_MMIO
4110 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4111 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
4112 #endif
4113 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4114 page = kvm_dirty_ring_get_page(
4115 &vcpu->dirty_ring,
4116 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
4117 else
4118 return kvm_arch_vcpu_fault(vcpu, vmf);
4119 get_page(page);
4120 vmf->page = page;
4121 return 0;
4122 }
4123
4124 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
4125 .fault = kvm_vcpu_fault,
4126 };
4127
4128 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4129 {
4130 struct kvm_vcpu *vcpu = file->private_data;
4131 unsigned long pages = vma_pages(vma);
4132
4133 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4134 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4135 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4136 return -EINVAL;
4137
4138 vma->vm_ops = &kvm_vcpu_vm_ops;
4139 return 0;
4140 }
4141
4142 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4143 {
4144 struct kvm_vcpu *vcpu = filp->private_data;
4145
4146 kvm_put_kvm(vcpu->kvm);
4147 return 0;
4148 }
4149
4150 static struct file_operations kvm_vcpu_fops = {
4151 .release = kvm_vcpu_release,
4152 .unlocked_ioctl = kvm_vcpu_ioctl,
4153 .mmap = kvm_vcpu_mmap,
4154 .llseek = noop_llseek,
4155 KVM_COMPAT(kvm_vcpu_compat_ioctl),
4156 };
4157
4158 /*
4159 * Allocates an inode for the vcpu.
4160 */
4161 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4162 {
4163 char name[8 + 1 + ITOA_MAX_LEN + 1];
4164
4165 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4166 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
4167 }
4168
4169 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4170 static int vcpu_get_pid(void *data, u64 *val)
4171 {
4172 struct kvm_vcpu *vcpu = data;
4173
4174 rcu_read_lock();
4175 *val = pid_nr(rcu_dereference(vcpu->pid));
4176 rcu_read_unlock();
4177 return 0;
4178 }
4179
4180 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4181
4182 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
4183 {
4184 struct dentry *debugfs_dentry;
4185 char dir_name[ITOA_MAX_LEN * 2];
4186
4187 if (!debugfs_initialized())
4188 return;
4189
4190 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
4191 debugfs_dentry = debugfs_create_dir(dir_name,
4192 vcpu->kvm->debugfs_dentry);
4193 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4194 &vcpu_get_pid_fops);
4195
4196 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
4197 }
4198 #endif
4199
4200 /*
4201 * Creates some virtual cpus. Good luck creating more than one.
4202 */
4203 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
4204 {
4205 int r;
4206 struct kvm_vcpu *vcpu;
4207 struct page *page;
4208
4209 if (id >= KVM_MAX_VCPU_IDS)
4210 return -EINVAL;
4211
4212 mutex_lock(&kvm->lock);
4213 if (kvm->created_vcpus >= kvm->max_vcpus) {
4214 mutex_unlock(&kvm->lock);
4215 return -EINVAL;
4216 }
4217
4218 r = kvm_arch_vcpu_precreate(kvm, id);
4219 if (r) {
4220 mutex_unlock(&kvm->lock);
4221 return r;
4222 }
4223
4224 kvm->created_vcpus++;
4225 mutex_unlock(&kvm->lock);
4226
4227 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
4228 if (!vcpu) {
4229 r = -ENOMEM;
4230 goto vcpu_decrement;
4231 }
4232
4233 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
4234 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
4235 if (!page) {
4236 r = -ENOMEM;
4237 goto vcpu_free;
4238 }
4239 vcpu->run = page_address(page);
4240
4241 kvm_vcpu_init(vcpu, kvm, id);
4242
4243 r = kvm_arch_vcpu_create(vcpu);
4244 if (r)
4245 goto vcpu_free_run_page;
4246
4247 if (kvm->dirty_ring_size) {
4248 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
4249 id, kvm->dirty_ring_size);
4250 if (r)
4251 goto arch_vcpu_destroy;
4252 }
4253
4254 mutex_lock(&kvm->lock);
4255
4256 #ifdef CONFIG_LOCKDEP
4257 /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
4258 mutex_lock(&vcpu->mutex);
4259 mutex_unlock(&vcpu->mutex);
4260 #endif
4261
4262 if (kvm_get_vcpu_by_id(kvm, id)) {
4263 r = -EEXIST;
4264 goto unlock_vcpu_destroy;
4265 }
4266
4267 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4268 r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4269 if (r)
4270 goto unlock_vcpu_destroy;
4271
4272 /* Now it's all set up, let userspace reach it */
4273 kvm_get_kvm(kvm);
4274 r = create_vcpu_fd(vcpu);
4275 if (r < 0)
4276 goto kvm_put_xa_release;
4277
4278 if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4279 r = -EINVAL;
4280 goto kvm_put_xa_release;
4281 }
4282
4283 /*
4284 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
4285 * pointer before kvm->online_vcpus' incremented value.
4286 */
4287 smp_wmb();
4288 atomic_inc(&kvm->online_vcpus);
4289
4290 mutex_unlock(&kvm->lock);
4291 kvm_arch_vcpu_postcreate(vcpu);
4292 kvm_create_vcpu_debugfs(vcpu);
4293 return r;
4294
4295 kvm_put_xa_release:
4296 kvm_put_kvm_no_destroy(kvm);
4297 xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4298 unlock_vcpu_destroy:
4299 mutex_unlock(&kvm->lock);
4300 kvm_dirty_ring_free(&vcpu->dirty_ring);
4301 arch_vcpu_destroy:
4302 kvm_arch_vcpu_destroy(vcpu);
4303 vcpu_free_run_page:
4304 free_page((unsigned long)vcpu->run);
4305 vcpu_free:
4306 kmem_cache_free(kvm_vcpu_cache, vcpu);
4307 vcpu_decrement:
4308 mutex_lock(&kvm->lock);
4309 kvm->created_vcpus--;
4310 mutex_unlock(&kvm->lock);
4311 return r;
4312 }
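/*
 * Userspace view (sketch, not part of this file): KVM_CREATE_VCPU returns a
 * vCPU fd whose shared state is then mapped and driven with KVM_RUN, e.g.
 *
 *	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, id);
 *	size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	run = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 */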
4313
4314 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4315 {
4316 if (sigset) {
4317 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4318 vcpu->sigset_active = 1;
4319 vcpu->sigset = *sigset;
4320 } else
4321 vcpu->sigset_active = 0;
4322 return 0;
4323 }
4324
4325 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4326 size_t size, loff_t *offset)
4327 {
4328 struct kvm_vcpu *vcpu = file->private_data;
4329
4330 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4331 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4332 sizeof(vcpu->stat), user_buffer, size, offset);
4333 }
4334
4335 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4336 {
4337 struct kvm_vcpu *vcpu = file->private_data;
4338
4339 kvm_put_kvm(vcpu->kvm);
4340 return 0;
4341 }
4342
4343 static const struct file_operations kvm_vcpu_stats_fops = {
4344 .owner = THIS_MODULE,
4345 .read = kvm_vcpu_stats_read,
4346 .release = kvm_vcpu_stats_release,
4347 .llseek = noop_llseek,
4348 };
4349
4350 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4351 {
4352 int fd;
4353 struct file *file;
4354 char name[15 + ITOA_MAX_LEN + 1];
4355
4356 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4357
4358 fd = get_unused_fd_flags(O_CLOEXEC);
4359 if (fd < 0)
4360 return fd;
4361
4362 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4363 if (IS_ERR(file)) {
4364 put_unused_fd(fd);
4365 return PTR_ERR(file);
4366 }
4367
4368 kvm_get_kvm(vcpu->kvm);
4369
4370 file->f_mode |= FMODE_PREAD;
4371 fd_install(fd, file);
4372
4373 return fd;
4374 }
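/*
 * The returned fd is read-only: userspace pulls the binary stats header,
 * descriptors and data through read()/pread() (FMODE_PREAD is set above so
 * positional reads work), all serviced by kvm_stats_read().
 */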
4375
4376 static long kvm_vcpu_ioctl(struct file *filp,
4377 unsigned int ioctl, unsigned long arg)
4378 {
4379 struct kvm_vcpu *vcpu = filp->private_data;
4380 void __user *argp = (void __user *)arg;
4381 int r;
4382 struct kvm_fpu *fpu = NULL;
4383 struct kvm_sregs *kvm_sregs = NULL;
4384
4385 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4386 return -EIO;
4387
4388 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4389 return -EINVAL;
4390
4391 /*
4392 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4393 * execution; mutex_lock() would break them.
4394 */
4395 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4396 if (r != -ENOIOCTLCMD)
4397 return r;
4398
4399 if (mutex_lock_killable(&vcpu->mutex))
4400 return -EINTR;
4401 switch (ioctl) {
4402 case KVM_RUN: {
4403 struct pid *oldpid;
4404 r = -EINVAL;
4405 if (arg)
4406 goto out;
4407 oldpid = rcu_access_pointer(vcpu->pid);
4408 if (unlikely(oldpid != task_pid(current))) {
4409 /* The thread running this VCPU changed. */
4410 struct pid *newpid;
4411
4412 r = kvm_arch_vcpu_run_pid_change(vcpu);
4413 if (r)
4414 break;
4415
4416 newpid = get_task_pid(current, PIDTYPE_PID);
4417 rcu_assign_pointer(vcpu->pid, newpid);
4418 if (oldpid)
4419 synchronize_rcu();
4420 put_pid(oldpid);
4421 }
4422 r = kvm_arch_vcpu_ioctl_run(vcpu);
4423 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4424 break;
4425 }
4426 case KVM_GET_REGS: {
4427 struct kvm_regs *kvm_regs;
4428
4429 r = -ENOMEM;
4430 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4431 if (!kvm_regs)
4432 goto out;
4433 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4434 if (r)
4435 goto out_free1;
4436 r = -EFAULT;
4437 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4438 goto out_free1;
4439 r = 0;
4440 out_free1:
4441 kfree(kvm_regs);
4442 break;
4443 }
4444 case KVM_SET_REGS: {
4445 struct kvm_regs *kvm_regs;
4446
4447 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4448 if (IS_ERR(kvm_regs)) {
4449 r = PTR_ERR(kvm_regs);
4450 goto out;
4451 }
4452 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4453 kfree(kvm_regs);
4454 break;
4455 }
4456 case KVM_GET_SREGS: {
4457 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4458 GFP_KERNEL_ACCOUNT);
4459 r = -ENOMEM;
4460 if (!kvm_sregs)
4461 goto out;
4462 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4463 if (r)
4464 goto out;
4465 r = -EFAULT;
4466 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4467 goto out;
4468 r = 0;
4469 break;
4470 }
4471 case KVM_SET_SREGS: {
4472 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4473 if (IS_ERR(kvm_sregs)) {
4474 r = PTR_ERR(kvm_sregs);
4475 kvm_sregs = NULL;
4476 goto out;
4477 }
4478 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4479 break;
4480 }
4481 case KVM_GET_MP_STATE: {
4482 struct kvm_mp_state mp_state;
4483
4484 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4485 if (r)
4486 goto out;
4487 r = -EFAULT;
4488 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4489 goto out;
4490 r = 0;
4491 break;
4492 }
4493 case KVM_SET_MP_STATE: {
4494 struct kvm_mp_state mp_state;
4495
4496 r = -EFAULT;
4497 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4498 goto out;
4499 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4500 break;
4501 }
4502 case KVM_TRANSLATE: {
4503 struct kvm_translation tr;
4504
4505 r = -EFAULT;
4506 if (copy_from_user(&tr, argp, sizeof(tr)))
4507 goto out;
4508 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4509 if (r)
4510 goto out;
4511 r = -EFAULT;
4512 if (copy_to_user(argp, &tr, sizeof(tr)))
4513 goto out;
4514 r = 0;
4515 break;
4516 }
4517 case KVM_SET_GUEST_DEBUG: {
4518 struct kvm_guest_debug dbg;
4519
4520 r = -EFAULT;
4521 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4522 goto out;
4523 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4524 break;
4525 }
4526 case KVM_SET_SIGNAL_MASK: {
4527 struct kvm_signal_mask __user *sigmask_arg = argp;
4528 struct kvm_signal_mask kvm_sigmask;
4529 sigset_t sigset, *p;
4530
4531 p = NULL;
4532 if (argp) {
4533 r = -EFAULT;
4534 if (copy_from_user(&kvm_sigmask, argp,
4535 sizeof(kvm_sigmask)))
4536 goto out;
4537 r = -EINVAL;
4538 if (kvm_sigmask.len != sizeof(sigset))
4539 goto out;
4540 r = -EFAULT;
4541 if (copy_from_user(&sigset, sigmask_arg->sigset,
4542 sizeof(sigset)))
4543 goto out;
4544 p = &sigset;
4545 }
4546 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4547 break;
4548 }
4549 case KVM_GET_FPU: {
4550 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4551 r = -ENOMEM;
4552 if (!fpu)
4553 goto out;
4554 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4555 if (r)
4556 goto out;
4557 r = -EFAULT;
4558 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4559 goto out;
4560 r = 0;
4561 break;
4562 }
4563 case KVM_SET_FPU: {
4564 fpu = memdup_user(argp, sizeof(*fpu));
4565 if (IS_ERR(fpu)) {
4566 r = PTR_ERR(fpu);
4567 fpu = NULL;
4568 goto out;
4569 }
4570 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4571 break;
4572 }
4573 case KVM_GET_STATS_FD: {
4574 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4575 break;
4576 }
4577 default:
4578 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4579 }
4580 out:
4581 mutex_unlock(&vcpu->mutex);
4582 kfree(fpu);
4583 kfree(kvm_sregs);
4584 return r;
4585 }
4586
4587 #ifdef CONFIG_KVM_COMPAT
4588 static long kvm_vcpu_compat_ioctl(struct file *filp,
4589 unsigned int ioctl, unsigned long arg)
4590 {
4591 struct kvm_vcpu *vcpu = filp->private_data;
4592 void __user *argp = compat_ptr(arg);
4593 int r;
4594
4595 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4596 return -EIO;
4597
4598 switch (ioctl) {
4599 case KVM_SET_SIGNAL_MASK: {
4600 struct kvm_signal_mask __user *sigmask_arg = argp;
4601 struct kvm_signal_mask kvm_sigmask;
4602 sigset_t sigset;
4603
4604 if (argp) {
4605 r = -EFAULT;
4606 if (copy_from_user(&kvm_sigmask, argp,
4607 sizeof(kvm_sigmask)))
4608 goto out;
4609 r = -EINVAL;
4610 if (kvm_sigmask.len != sizeof(compat_sigset_t))
4611 goto out;
4612 r = -EFAULT;
4613 if (get_compat_sigset(&sigset,
4614 (compat_sigset_t __user *)sigmask_arg->sigset))
4615 goto out;
4616 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4617 } else
4618 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4619 break;
4620 }
4621 default:
4622 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4623 }
4624
4625 out:
4626 return r;
4627 }
4628 #endif
4629
4630 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4631 {
4632 struct kvm_device *dev = filp->private_data;
4633
4634 if (dev->ops->mmap)
4635 return dev->ops->mmap(dev, vma);
4636
4637 return -ENODEV;
4638 }
4639
4640 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4641 int (*accessor)(struct kvm_device *dev,
4642 struct kvm_device_attr *attr),
4643 unsigned long arg)
4644 {
4645 struct kvm_device_attr attr;
4646
4647 if (!accessor)
4648 return -EPERM;
4649
4650 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4651 return -EFAULT;
4652
4653 return accessor(dev, &attr);
4654 }
4655
4656 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4657 unsigned long arg)
4658 {
4659 struct kvm_device *dev = filp->private_data;
4660
4661 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4662 return -EIO;
4663
4664 switch (ioctl) {
4665 case KVM_SET_DEVICE_ATTR:
4666 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4667 case KVM_GET_DEVICE_ATTR:
4668 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4669 case KVM_HAS_DEVICE_ATTR:
4670 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4671 default:
4672 if (dev->ops->ioctl)
4673 return dev->ops->ioctl(dev, ioctl, arg);
4674
4675 return -ENOTTY;
4676 }
4677 }
4678
4679 static int kvm_device_release(struct inode *inode, struct file *filp)
4680 {
4681 struct kvm_device *dev = filp->private_data;
4682 struct kvm *kvm = dev->kvm;
4683
4684 if (dev->ops->release) {
4685 mutex_lock(&kvm->lock);
4686 list_del_rcu(&dev->vm_node);
4687 synchronize_rcu();
4688 dev->ops->release(dev);
4689 mutex_unlock(&kvm->lock);
4690 }
4691
4692 kvm_put_kvm(kvm);
4693 return 0;
4694 }
4695
4696 static struct file_operations kvm_device_fops = {
4697 .unlocked_ioctl = kvm_device_ioctl,
4698 .release = kvm_device_release,
4699 KVM_COMPAT(kvm_device_ioctl),
4700 .mmap = kvm_device_mmap,
4701 };
4702
4703 struct kvm_device *kvm_device_from_filp(struct file *filp)
4704 {
4705 if (filp->f_op != &kvm_device_fops)
4706 return NULL;
4707
4708 return filp->private_data;
4709 }
4710
4711 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4712 #ifdef CONFIG_KVM_MPIC
4713 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4714 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4715 #endif
4716 };
4717
4718 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4719 {
4720 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4721 return -ENOSPC;
4722
4723 if (kvm_device_ops_table[type] != NULL)
4724 return -EEXIST;
4725
4726 kvm_device_ops_table[type] = ops;
4727 return 0;
4728 }
4729
4730 void kvm_unregister_device_ops(u32 type)
4731 {
4732 if (kvm_device_ops_table[type] != NULL)
4733 kvm_device_ops_table[type] = NULL;
4734 }
4735
4736 static int kvm_ioctl_create_device(struct kvm *kvm,
4737 struct kvm_create_device *cd)
4738 {
4739 const struct kvm_device_ops *ops;
4740 struct kvm_device *dev;
4741 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4742 int type;
4743 int ret;
4744
4745 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4746 return -ENODEV;
4747
4748 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4749 ops = kvm_device_ops_table[type];
4750 if (ops == NULL)
4751 return -ENODEV;
4752
4753 if (test)
4754 return 0;
4755
4756 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4757 if (!dev)
4758 return -ENOMEM;
4759
4760 dev->ops = ops;
4761 dev->kvm = kvm;
4762
4763 mutex_lock(&kvm->lock);
4764 ret = ops->create(dev, type);
4765 if (ret < 0) {
4766 mutex_unlock(&kvm->lock);
4767 kfree(dev);
4768 return ret;
4769 }
4770 list_add_rcu(&dev->vm_node, &kvm->devices);
4771 mutex_unlock(&kvm->lock);
4772
4773 if (ops->init)
4774 ops->init(dev);
4775
4776 kvm_get_kvm(kvm);
4777 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4778 if (ret < 0) {
4779 kvm_put_kvm_no_destroy(kvm);
4780 mutex_lock(&kvm->lock);
4781 list_del_rcu(&dev->vm_node);
4782 synchronize_rcu();
4783 if (ops->release)
4784 ops->release(dev);
4785 mutex_unlock(&kvm->lock);
4786 if (ops->destroy)
4787 ops->destroy(dev);
4788 return ret;
4789 }
4790
4791 cd->fd = ret;
4792 return 0;
4793 }
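/*
 * Userspace view (sketch): KVM_CREATE_DEVICE with KVM_CREATE_DEVICE_TEST set
 * in cd->flags only probes whether the device type is supported; without the
 * flag a device fd is created and returned in cd->fd, and the device is then
 * configured through KVM_SET/GET/HAS_DEVICE_ATTR on that fd (see
 * kvm_device_ioctl() above).
 */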
4794
4795 static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4796 {
4797 switch (arg) {
4798 case KVM_CAP_USER_MEMORY:
4799 case KVM_CAP_USER_MEMORY2:
4800 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4801 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4802 case KVM_CAP_INTERNAL_ERROR_DATA:
4803 #ifdef CONFIG_HAVE_KVM_MSI
4804 case KVM_CAP_SIGNAL_MSI:
4805 #endif
4806 #ifdef CONFIG_HAVE_KVM_IRQCHIP
4807 case KVM_CAP_IRQFD:
4808 #endif
4809 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4810 case KVM_CAP_CHECK_EXTENSION_VM:
4811 case KVM_CAP_ENABLE_CAP_VM:
4812 case KVM_CAP_HALT_POLL:
4813 return 1;
4814 #ifdef CONFIG_KVM_MMIO
4815 case KVM_CAP_COALESCED_MMIO:
4816 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4817 case KVM_CAP_COALESCED_PIO:
4818 return 1;
4819 #endif
4820 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4821 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4822 return KVM_DIRTY_LOG_MANUAL_CAPS;
4823 #endif
4824 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4825 case KVM_CAP_IRQ_ROUTING:
4826 return KVM_MAX_IRQ_ROUTES;
4827 #endif
4828 #if KVM_MAX_NR_ADDRESS_SPACES > 1
4829 case KVM_CAP_MULTI_ADDRESS_SPACE:
4830 if (kvm)
4831 return kvm_arch_nr_memslot_as_ids(kvm);
4832 return KVM_MAX_NR_ADDRESS_SPACES;
4833 #endif
4834 case KVM_CAP_NR_MEMSLOTS:
4835 return KVM_USER_MEM_SLOTS;
4836 case KVM_CAP_DIRTY_LOG_RING:
4837 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4838 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4839 #else
4840 return 0;
4841 #endif
4842 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4843 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4844 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4845 #else
4846 return 0;
4847 #endif
4848 #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4849 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4850 #endif
4851 case KVM_CAP_BINARY_STATS_FD:
4852 case KVM_CAP_SYSTEM_EVENT_DATA:
4853 case KVM_CAP_DEVICE_CTRL:
4854 return 1;
4855 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4856 case KVM_CAP_MEMORY_ATTRIBUTES:
4857 return kvm_supported_mem_attributes(kvm);
4858 #endif
4859 #ifdef CONFIG_KVM_PRIVATE_MEM
4860 case KVM_CAP_GUEST_MEMFD:
4861 return !kvm || kvm_arch_has_private_mem(kvm);
4862 #endif
4863 default:
4864 break;
4865 }
4866 return kvm_vm_ioctl_check_extension(kvm, arg);
4867 }
4868
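/*
 * A valid ring size is a power of two, at least PAGE_SIZE, large enough for
 * the reserved entries, and no larger than KVM_DIRTY_RING_MAX_ENTRIES entries
 * of struct kvm_dirty_gfn; it can be set only once per VM and only before any
 * vCPU has been created.
 */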
4869 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4870 {
4871 int r;
4872
4873 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4874 return -EINVAL;
4875
4876 /* The size should be a power of 2 */
4877 if (!size || (size & (size - 1)))
4878 return -EINVAL;
4879
4880 /* Must be at least a page and large enough to hold the reserved entries */
4881 if (size < kvm_dirty_ring_get_rsvd_entries() *
4882 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4883 return -EINVAL;
4884
4885 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4886 sizeof(struct kvm_dirty_gfn))
4887 return -E2BIG;
4888
4889 /* We only allow it to be set once */
4890 if (kvm->dirty_ring_size)
4891 return -EINVAL;
4892
4893 mutex_lock(&kvm->lock);
4894
4895 if (kvm->created_vcpus) {
4896 /* We don't allow changing this value after vCPUs have been created */
4897 r = -EINVAL;
4898 } else {
4899 kvm->dirty_ring_size = size;
4900 r = 0;
4901 }
4902
4903 mutex_unlock(&kvm->lock);
4904 return r;
4905 }
4906
4907 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4908 {
4909 unsigned long i;
4910 struct kvm_vcpu *vcpu;
4911 int cleared = 0;
4912
4913 if (!kvm->dirty_ring_size)
4914 return -EINVAL;
4915
4916 mutex_lock(&kvm->slots_lock);
4917
4918 kvm_for_each_vcpu(i, vcpu, kvm)
4919 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4920
4921 mutex_unlock(&kvm->slots_lock);
4922
4923 if (cleared)
4924 kvm_flush_remote_tlbs(kvm);
4925
4926 return cleared;
4927 }
4928
4929 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4930 struct kvm_enable_cap *cap)
4931 {
4932 return -EINVAL;
4933 }
4934
4935 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4936 {
4937 int i;
4938
4939 lockdep_assert_held(&kvm->slots_lock);
4940
4941 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
4942 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4943 return false;
4944 }
4945
4946 return true;
4947 }
4948 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4949
4950 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4951 struct kvm_enable_cap *cap)
4952 {
4953 switch (cap->cap) {
4954 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4955 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4956 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4957
4958 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4959 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4960
4961 if (cap->flags || (cap->args[0] & ~allowed_options))
4962 return -EINVAL;
4963 kvm->manual_dirty_log_protect = cap->args[0];
4964 return 0;
4965 }
4966 #endif
4967 case KVM_CAP_HALT_POLL: {
4968 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4969 return -EINVAL;
4970
4971 kvm->max_halt_poll_ns = cap->args[0];
4972
4973 /*
4974 * Ensure kvm->override_halt_poll_ns does not become visible
4975 * before kvm->max_halt_poll_ns.
4976 *
4977 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4978 */
4979 smp_wmb();
4980 kvm->override_halt_poll_ns = true;
4981
4982 return 0;
4983 }
4984 case KVM_CAP_DIRTY_LOG_RING:
4985 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4986 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
4987 return -EINVAL;
4988
4989 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4990 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4991 int r = -EINVAL;
4992
4993 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
4994 !kvm->dirty_ring_size || cap->flags)
4995 return r;
4996
4997 mutex_lock(&kvm->slots_lock);
4998
4999 /*
5000 * For simplicity, allow enabling ring+bitmap if and only if
5001 * there are no memslots, e.g. to ensure all memslots allocate
5002 * a bitmap after the capability is enabled.
5003 */
5004 if (kvm_are_all_memslots_empty(kvm)) {
5005 kvm->dirty_ring_with_bitmap = true;
5006 r = 0;
5007 }
5008
5009 mutex_unlock(&kvm->slots_lock);
5010
5011 return r;
5012 }
5013 default:
5014 return kvm_vm_ioctl_enable_cap(kvm, cap);
5015 }
5016 }
5017
5018 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5019 size_t size, loff_t *offset)
5020 {
5021 struct kvm *kvm = file->private_data;
5022
5023 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5024 &kvm_vm_stats_desc[0], &kvm->stat,
5025 sizeof(kvm->stat), user_buffer, size, offset);
5026 }
5027
5028 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5029 {
5030 struct kvm *kvm = file->private_data;
5031
5032 kvm_put_kvm(kvm);
5033 return 0;
5034 }
5035
5036 static const struct file_operations kvm_vm_stats_fops = {
5037 .owner = THIS_MODULE,
5038 .read = kvm_vm_stats_read,
5039 .release = kvm_vm_stats_release,
5040 .llseek = noop_llseek,
5041 };
5042
5043 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5044 {
5045 int fd;
5046 struct file *file;
5047
5048 fd = get_unused_fd_flags(O_CLOEXEC);
5049 if (fd < 0)
5050 return fd;
5051
5052 file = anon_inode_getfile("kvm-vm-stats",
5053 &kvm_vm_stats_fops, kvm, O_RDONLY);
5054 if (IS_ERR(file)) {
5055 put_unused_fd(fd);
5056 return PTR_ERR(file);
5057 }
5058
5059 kvm_get_kvm(kvm);
5060
5061 file->f_mode |= FMODE_PREAD;
5062 fd_install(fd, file);
5063
5064 return fd;
5065 }
5066
5067 #define SANITY_CHECK_MEM_REGION_FIELD(field) \
5068 do { \
5069 BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5070 offsetof(struct kvm_userspace_memory_region2, field)); \
5071 BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5072 sizeof_field(struct kvm_userspace_memory_region2, field)); \
5073 } while (0)
5074
5075 static long kvm_vm_ioctl(struct file *filp,
5076 unsigned int ioctl, unsigned long arg)
5077 {
5078 struct kvm *kvm = filp->private_data;
5079 void __user *argp = (void __user *)arg;
5080 int r;
5081
5082 if (kvm->mm != current->mm || kvm->vm_dead)
5083 return -EIO;
5084 switch (ioctl) {
5085 case KVM_CREATE_VCPU:
5086 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
5087 break;
5088 case KVM_ENABLE_CAP: {
5089 struct kvm_enable_cap cap;
5090
5091 r = -EFAULT;
5092 if (copy_from_user(&cap, argp, sizeof(cap)))
5093 goto out;
5094 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5095 break;
5096 }
5097 case KVM_SET_USER_MEMORY_REGION2:
5098 case KVM_SET_USER_MEMORY_REGION: {
5099 struct kvm_userspace_memory_region2 mem;
5100 unsigned long size;
5101
5102 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5103 /*
5104 * Fields beyond struct kvm_userspace_memory_region shouldn't be
5105 * accessed, but avoid leaking kernel memory in case of a bug.
5106 */
5107 memset(&mem, 0, sizeof(mem));
5108 size = sizeof(struct kvm_userspace_memory_region);
5109 } else {
5110 size = sizeof(struct kvm_userspace_memory_region2);
5111 }
5112
5113 /* Ensure the common parts of the two structs are identical. */
5114 SANITY_CHECK_MEM_REGION_FIELD(slot);
5115 SANITY_CHECK_MEM_REGION_FIELD(flags);
5116 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5117 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5118 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
5119
5120 r = -EFAULT;
5121 if (copy_from_user(&mem, argp, size))
5122 goto out;
5123
5124 r = -EINVAL;
5125 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5126 (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
5127 goto out;
5128
5129 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
5130 break;
5131 }
5132 case KVM_GET_DIRTY_LOG: {
5133 struct kvm_dirty_log log;
5134
5135 r = -EFAULT;
5136 if (copy_from_user(&log, argp, sizeof(log)))
5137 goto out;
5138 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5139 break;
5140 }
5141 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5142 case KVM_CLEAR_DIRTY_LOG: {
5143 struct kvm_clear_dirty_log log;
5144
5145 r = -EFAULT;
5146 if (copy_from_user(&log, argp, sizeof(log)))
5147 goto out;
5148 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5149 break;
5150 }
5151 #endif
5152 #ifdef CONFIG_KVM_MMIO
5153 case KVM_REGISTER_COALESCED_MMIO: {
5154 struct kvm_coalesced_mmio_zone zone;
5155
5156 r = -EFAULT;
5157 if (copy_from_user(&zone, argp, sizeof(zone)))
5158 goto out;
5159 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5160 break;
5161 }
5162 case KVM_UNREGISTER_COALESCED_MMIO: {
5163 struct kvm_coalesced_mmio_zone zone;
5164
5165 r = -EFAULT;
5166 if (copy_from_user(&zone, argp, sizeof(zone)))
5167 goto out;
5168 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5169 break;
5170 }
5171 #endif
5172 case KVM_IRQFD: {
5173 struct kvm_irqfd data;
5174
5175 r = -EFAULT;
5176 if (copy_from_user(&data, argp, sizeof(data)))
5177 goto out;
5178 r = kvm_irqfd(kvm, &data);
5179 break;
5180 }
5181 case KVM_IOEVENTFD: {
5182 struct kvm_ioeventfd data;
5183
5184 r = -EFAULT;
5185 if (copy_from_user(&data, argp, sizeof(data)))
5186 goto out;
5187 r = kvm_ioeventfd(kvm, &data);
5188 break;
5189 }
5190 #ifdef CONFIG_HAVE_KVM_MSI
5191 case KVM_SIGNAL_MSI: {
5192 struct kvm_msi msi;
5193
5194 r = -EFAULT;
5195 if (copy_from_user(&msi, argp, sizeof(msi)))
5196 goto out;
5197 r = kvm_send_userspace_msi(kvm, &msi);
5198 break;
5199 }
5200 #endif
5201 #ifdef __KVM_HAVE_IRQ_LINE
5202 case KVM_IRQ_LINE_STATUS:
5203 case KVM_IRQ_LINE: {
5204 struct kvm_irq_level irq_event;
5205
5206 r = -EFAULT;
5207 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
5208 goto out;
5209
5210 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5211 ioctl == KVM_IRQ_LINE_STATUS);
5212 if (r)
5213 goto out;
5214
5215 r = -EFAULT;
5216 if (ioctl == KVM_IRQ_LINE_STATUS) {
5217 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
5218 goto out;
5219 }
5220
5221 r = 0;
5222 break;
5223 }
5224 #endif
5225 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5226 case KVM_SET_GSI_ROUTING: {
5227 struct kvm_irq_routing routing;
5228 struct kvm_irq_routing __user *urouting;
5229 struct kvm_irq_routing_entry *entries = NULL;
5230
5231 r = -EFAULT;
5232 if (copy_from_user(&routing, argp, sizeof(routing)))
5233 goto out;
5234 r = -EINVAL;
5235 if (!kvm_arch_can_set_irq_routing(kvm))
5236 goto out;
5237 if (routing.nr > KVM_MAX_IRQ_ROUTES)
5238 goto out;
5239 if (routing.flags)
5240 goto out;
5241 if (routing.nr) {
5242 urouting = argp;
5243 entries = vmemdup_array_user(urouting->entries,
5244 routing.nr, sizeof(*entries));
5245 if (IS_ERR(entries)) {
5246 r = PTR_ERR(entries);
5247 goto out;
5248 }
5249 }
5250 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5251 routing.flags);
5252 kvfree(entries);
5253 break;
5254 }
5255 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5256 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5257 case KVM_SET_MEMORY_ATTRIBUTES: {
5258 struct kvm_memory_attributes attrs;
5259
5260 r = -EFAULT;
5261 if (copy_from_user(&attrs, argp, sizeof(attrs)))
5262 goto out;
5263
5264 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5265 break;
5266 }
5267 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
5268 case KVM_CREATE_DEVICE: {
5269 struct kvm_create_device cd;
5270
5271 r = -EFAULT;
5272 if (copy_from_user(&cd, argp, sizeof(cd)))
5273 goto out;
5274
5275 r = kvm_ioctl_create_device(kvm, &cd);
5276 if (r)
5277 goto out;
5278
5279 r = -EFAULT;
5280 if (copy_to_user(argp, &cd, sizeof(cd)))
5281 goto out;
5282
5283 r = 0;
5284 break;
5285 }
5286 case KVM_CHECK_EXTENSION:
5287 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5288 break;
5289 case KVM_RESET_DIRTY_RINGS:
5290 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5291 break;
5292 case KVM_GET_STATS_FD:
5293 r = kvm_vm_ioctl_get_stats_fd(kvm);
5294 break;
5295 #ifdef CONFIG_KVM_PRIVATE_MEM
5296 case KVM_CREATE_GUEST_MEMFD: {
5297 struct kvm_create_guest_memfd guest_memfd;
5298
5299 r = -EFAULT;
5300 if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5301 goto out;
5302
5303 r = kvm_gmem_create(kvm, &guest_memfd);
5304 break;
5305 }
5306 #endif
5307 default:
5308 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
5309 }
5310 out:
5311 return r;
5312 }
5313
5314 #ifdef CONFIG_KVM_COMPAT
5315 struct compat_kvm_dirty_log {
5316 __u32 slot;
5317 __u32 padding1;
5318 union {
5319 compat_uptr_t dirty_bitmap; /* one bit per page */
5320 __u64 padding2;
5321 };
5322 };
5323
5324 struct compat_kvm_clear_dirty_log {
5325 __u32 slot;
5326 __u32 num_pages;
5327 __u64 first_page;
5328 union {
5329 compat_uptr_t dirty_bitmap; /* one bit per page */
5330 __u64 padding2;
5331 };
5332 };
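/*
 * The two structs above mirror struct kvm_dirty_log and struct
 * kvm_clear_dirty_log as seen by 32-bit userspace, where the dirty_bitmap
 * pointer is a compat_uptr_t; kvm_vm_compat_ioctl() widens it with
 * compat_ptr() before reusing the native handlers.
 */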
5333
5334 long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5335 unsigned long arg)
5336 {
5337 return -ENOTTY;
5338 }
5339
5340 static long kvm_vm_compat_ioctl(struct file *filp,
5341 unsigned int ioctl, unsigned long arg)
5342 {
5343 struct kvm *kvm = filp->private_data;
5344 int r;
5345
5346 if (kvm->mm != current->mm || kvm->vm_dead)
5347 return -EIO;
5348
5349 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5350 if (r != -ENOTTY)
5351 return r;
5352
5353 switch (ioctl) {
5354 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5355 case KVM_CLEAR_DIRTY_LOG: {
5356 struct compat_kvm_clear_dirty_log compat_log;
5357 struct kvm_clear_dirty_log log;
5358
5359 if (copy_from_user(&compat_log, (void __user *)arg,
5360 sizeof(compat_log)))
5361 return -EFAULT;
5362 log.slot = compat_log.slot;
5363 log.num_pages = compat_log.num_pages;
5364 log.first_page = compat_log.first_page;
5365 log.padding2 = compat_log.padding2;
5366 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5367
5368 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5369 break;
5370 }
5371 #endif
5372 case KVM_GET_DIRTY_LOG: {
5373 struct compat_kvm_dirty_log compat_log;
5374 struct kvm_dirty_log log;
5375
5376 if (copy_from_user(&compat_log, (void __user *)arg,
5377 sizeof(compat_log)))
5378 return -EFAULT;
5379 log.slot = compat_log.slot;
5380 log.padding1 = compat_log.padding1;
5381 log.padding2 = compat_log.padding2;
5382 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5383
5384 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5385 break;
5386 }
5387 default:
5388 r = kvm_vm_ioctl(filp, ioctl, arg);
5389 }
5390 return r;
5391 }
5392 #endif
5393
5394 static struct file_operations kvm_vm_fops = {
5395 .release = kvm_vm_release,
5396 .unlocked_ioctl = kvm_vm_ioctl,
5397 .llseek = noop_llseek,
5398 KVM_COMPAT(kvm_vm_compat_ioctl),
5399 };
5400
5401 bool file_is_kvm(struct file *file)
5402 {
5403 return file && file->f_op == &kvm_vm_fops;
5404 }
5405 EXPORT_SYMBOL_GPL(file_is_kvm);
5406
5407 static int kvm_dev_ioctl_create_vm(unsigned long type)
5408 {
5409 char fdname[ITOA_MAX_LEN + 1];
5410 int r, fd;
5411 struct kvm *kvm;
5412 struct file *file;
5413
5414 fd = get_unused_fd_flags(O_CLOEXEC);
5415 if (fd < 0)
5416 return fd;
5417
5418 snprintf(fdname, sizeof(fdname), "%d", fd);
5419
5420 kvm = kvm_create_vm(type, fdname);
5421 if (IS_ERR(kvm)) {
5422 r = PTR_ERR(kvm);
5423 goto put_fd;
5424 }
5425
5426 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5427 if (IS_ERR(file)) {
5428 r = PTR_ERR(file);
5429 goto put_kvm;
5430 }
5431
5432 /*
5433 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5434 * already set, with ->release() being kvm_vm_release(). In error
5435 * cases it will be called by the final fput(file) and will take
5436 * care of doing kvm_put_kvm(kvm).
5437 */
5438 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5439
5440 fd_install(fd, file);
5441 return fd;
5442
5443 put_kvm:
5444 kvm_put_kvm(kvm);
5445 put_fd:
5446 put_unused_fd(fd);
5447 return r;
5448 }
5449
5450 static long kvm_dev_ioctl(struct file *filp,
5451 unsigned int ioctl, unsigned long arg)
5452 {
5453 int r = -EINVAL;
5454
5455 switch (ioctl) {
5456 case KVM_GET_API_VERSION:
5457 if (arg)
5458 goto out;
5459 r = KVM_API_VERSION;
5460 break;
5461 case KVM_CREATE_VM:
5462 r = kvm_dev_ioctl_create_vm(arg);
5463 break;
5464 case KVM_CHECK_EXTENSION:
5465 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5466 break;
5467 case KVM_GET_VCPU_MMAP_SIZE:
5468 if (arg)
5469 goto out;
5470 r = PAGE_SIZE; /* struct kvm_run */
5471 #ifdef CONFIG_X86
5472 r += PAGE_SIZE; /* pio data page */
5473 #endif
5474 #ifdef CONFIG_KVM_MMIO
5475 r += PAGE_SIZE; /* coalesced mmio ring page */
5476 #endif
5477 break;
5478 default:
5479 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5480 }
5481 out:
5482 return r;
5483 }
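
/*
 * Illustrative userspace-side view of the /dev/kvm ioctls handled above.
 * This is a minimal sketch, not part of KVM itself; error handling is
 * omitted and the variable names are arbitrary:
 *
 *	int sys_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int ver    = ioctl(sys_fd, KVM_GET_API_VERSION, 0);
 *	int vm_fd  = ioctl(sys_fd, KVM_CREATE_VM, 0);
 *	long sz    = ioctl(sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *
 * ver is expected to be KVM_API_VERSION, the KVM_CREATE_VM argument selects
 * the arch-specific machine type (typically 0 for the default), and sz is
 * the size of the per-vCPU kvm_run mapping. The returned vm_fd is the anon
 * inode installed by kvm_dev_ioctl_create_vm() and accepts the VM-scoped
 * ioctls dispatched by kvm_vm_ioctl().
 */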
5484
5485 static struct file_operations kvm_chardev_ops = {
5486 .unlocked_ioctl = kvm_dev_ioctl,
5487 .llseek = noop_llseek,
5488 KVM_COMPAT(kvm_dev_ioctl),
5489 };
5490
5491 static struct miscdevice kvm_dev = {
5492 KVM_MINOR,
5493 "kvm",
5494 &kvm_chardev_ops,
5495 };
5496
5497 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5498 __visible bool kvm_rebooting;
5499 EXPORT_SYMBOL_GPL(kvm_rebooting);
5500
5501 static DEFINE_PER_CPU(bool, hardware_enabled);
5502 static int kvm_usage_count;
5503
5504 static int __hardware_enable_nolock(void)
5505 {
5506 if (__this_cpu_read(hardware_enabled))
5507 return 0;
5508
5509 if (kvm_arch_hardware_enable()) {
5510 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5511 raw_smp_processor_id());
5512 return -EIO;
5513 }
5514
5515 __this_cpu_write(hardware_enabled, true);
5516 return 0;
5517 }
5518
5519 static void hardware_enable_nolock(void *failed)
5520 {
5521 if (__hardware_enable_nolock())
5522 atomic_inc(failed);
5523 }
5524
5525 static int kvm_online_cpu(unsigned int cpu)
5526 {
5527 int ret = 0;
5528
5529 /*
5530 * Abort the CPU online process if hardware virtualization cannot
5531 * be enabled. Otherwise running VMs would encounter unrecoverable
5532 * errors when scheduled to this CPU.
5533 */
5534 mutex_lock(&kvm_lock);
5535 if (kvm_usage_count)
5536 ret = __hardware_enable_nolock();
5537 mutex_unlock(&kvm_lock);
5538 return ret;
5539 }
5540
5541 static void hardware_disable_nolock(void *junk)
5542 {
5543 /*
5544 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5545 * hardware, not just CPUs that successfully enabled hardware!
5546 */
5547 if (!__this_cpu_read(hardware_enabled))
5548 return;
5549
5550 kvm_arch_hardware_disable();
5551
5552 __this_cpu_write(hardware_enabled, false);
5553 }
5554
5555 static int kvm_offline_cpu(unsigned int cpu)
5556 {
5557 mutex_lock(&kvm_lock);
5558 if (kvm_usage_count)
5559 hardware_disable_nolock(NULL);
5560 mutex_unlock(&kvm_lock);
5561 return 0;
5562 }
5563
5564 static void hardware_disable_all_nolock(void)
5565 {
5566 BUG_ON(!kvm_usage_count);
5567
5568 kvm_usage_count--;
5569 if (!kvm_usage_count)
5570 on_each_cpu(hardware_disable_nolock, NULL, 1);
5571 }
5572
5573 static void hardware_disable_all(void)
5574 {
5575 cpus_read_lock();
5576 mutex_lock(&kvm_lock);
5577 hardware_disable_all_nolock();
5578 mutex_unlock(&kvm_lock);
5579 cpus_read_unlock();
5580 }
5581
5582 static int hardware_enable_all(void)
5583 {
5584 atomic_t failed = ATOMIC_INIT(0);
5585 int r;
5586
5587 /*
5588 * Do not enable hardware virtualization if the system is going down.
5589 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5590 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5591 * after kvm_reboot() is called. Note, this relies on system_state
5592 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5593 * hook instead of registering a dedicated reboot notifier (the latter
5594 * runs before system_state is updated).
5595 */
5596 if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5597 system_state == SYSTEM_RESTART)
5598 return -EBUSY;
5599
5600 /*
5601 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5602 * is called, and so on_each_cpu() between them includes the CPU that
5603 * is being onlined. As a result, hardware_enable_nolock() may get
5604 * invoked before kvm_online_cpu(), which also enables hardware if the
5605 * usage count is non-zero. Disable CPU hotplug to avoid attempting to
5606 * enable hardware multiple times.
5607 */
5608 cpus_read_lock();
5609 mutex_lock(&kvm_lock);
5610
5611 r = 0;
5612
5613 kvm_usage_count++;
5614 if (kvm_usage_count == 1) {
5615 on_each_cpu(hardware_enable_nolock, &failed, 1);
5616
5617 if (atomic_read(&failed)) {
5618 hardware_disable_all_nolock();
5619 r = -EBUSY;
5620 }
5621 }
5622
5623 mutex_unlock(&kvm_lock);
5624 cpus_read_unlock();
5625
5626 return r;
5627 }
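
/*
 * hardware_enable_all() and hardware_disable_all() are paired per VM
 * (called from kvm_create_vm() and kvm_destroy_vm() respectively), so
 * virtualization stays enabled on all online CPUs only while at least one
 * VM exists.
 */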
5628
5629 static void kvm_shutdown(void)
5630 {
5631 /*
5632 * Disable hardware virtualization and set kvm_rebooting to indicate
5633 * that KVM has asynchronously disabled hardware virtualization, i.e.
5634 * that relevant errors and exceptions aren't entirely unexpected.
5635 * Some flavors of hardware virtualization need to be disabled before
5636 * transferring control to firmware (to perform shutdown/reboot), e.g.
5637 * on x86, virtualization can block INIT interrupts, which are used by
5638 * firmware to pull APs back under firmware control. Note, this path
5639 * is used for both shutdown and reboot scenarios, i.e. neither name is
5640 * 100% comprehensive.
5641 */
5642 pr_info("kvm: exiting hardware virtualization\n");
5643 kvm_rebooting = true;
5644 on_each_cpu(hardware_disable_nolock, NULL, 1);
5645 }
5646
5647 static int kvm_suspend(void)
5648 {
5649 /*
5650 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5651 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5652 * is stable. Assert that kvm_lock is not held to ensure the system
5653 * isn't suspended while KVM is enabling hardware. Hardware enabling
5654 * can be preempted, but the task cannot be frozen until it has dropped
5655 * all locks (userspace tasks are frozen via a fake signal).
5656 */
5657 lockdep_assert_not_held(&kvm_lock);
5658 lockdep_assert_irqs_disabled();
5659
5660 if (kvm_usage_count)
5661 hardware_disable_nolock(NULL);
5662 return 0;
5663 }
5664
5665 static void kvm_resume(void)
5666 {
5667 lockdep_assert_not_held(&kvm_lock);
5668 lockdep_assert_irqs_disabled();
5669
5670 if (kvm_usage_count)
5671 WARN_ON_ONCE(__hardware_enable_nolock());
5672 }
5673
5674 static struct syscore_ops kvm_syscore_ops = {
5675 .suspend = kvm_suspend,
5676 .resume = kvm_resume,
5677 .shutdown = kvm_shutdown,
5678 };
5679 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5680 static int hardware_enable_all(void)
5681 {
5682 return 0;
5683 }
5684
5685 static void hardware_disable_all(void)
5686 {
5687
5688 }
5689 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5690
5691 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5692 {
5693 if (dev->ops->destructor)
5694 dev->ops->destructor(dev);
5695 }
5696
5697 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5698 {
5699 int i;
5700
5701 for (i = 0; i < bus->dev_count; i++) {
5702 struct kvm_io_device *pos = bus->range[i].dev;
5703
5704 kvm_iodevice_destructor(pos);
5705 }
5706 kfree(bus);
5707 }
5708
5709 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5710 const struct kvm_io_range *r2)
5711 {
5712 gpa_t addr1 = r1->addr;
5713 gpa_t addr2 = r2->addr;
5714
5715 if (addr1 < addr2)
5716 return -1;
5717
5718 /* If r2->len == 0, match the exact address. If r2->len != 0,
5719 * accept any overlapping write. Any order is acceptable for
5720 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5721 * we process all of them.
5722 */
5723 if (r2->len) {
5724 addr1 += r1->len;
5725 addr2 += r2->len;
5726 }
5727
5728 if (addr1 > addr2)
5729 return 1;
5730
5731 return 0;
5732 }
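
/*
 * Worked example with illustrative numbers: comparing an access
 * {addr = 0x1000, len = 4} against a registered range
 * {addr = 0xff8, len = 0x10} skips the first check (0x1000 >= 0xff8),
 * then compares 0x1000 + 4 with 0xff8 + 0x10 = 0x1008 and returns 0,
 * i.e. the overlapping access is treated as a match.
 */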
5733
5734 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5735 {
5736 return kvm_io_bus_cmp(p1, p2);
5737 }
5738
5739 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5740 gpa_t addr, int len)
5741 {
5742 struct kvm_io_range *range, key;
5743 int off;
5744
5745 key = (struct kvm_io_range) {
5746 .addr = addr,
5747 .len = len,
5748 };
5749
5750 range = bsearch(&key, bus->range, bus->dev_count,
5751 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5752 if (range == NULL)
5753 return -ENOENT;
5754
5755 off = range - bus->range;
5756
5757 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5758 off--;
5759
5760 return off;
5761 }
5762
5763 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5764 struct kvm_io_range *range, const void *val)
5765 {
5766 int idx;
5767
5768 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5769 if (idx < 0)
5770 return -EOPNOTSUPP;
5771
5772 while (idx < bus->dev_count &&
5773 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5774 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5775 range->len, val))
5776 return idx;
5777 idx++;
5778 }
5779
5780 return -EOPNOTSUPP;
5781 }
5782
5783 /* kvm_io_bus_write - called under kvm->slots_lock */
5784 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5785 int len, const void *val)
5786 {
5787 struct kvm_io_bus *bus;
5788 struct kvm_io_range range;
5789 int r;
5790
5791 range = (struct kvm_io_range) {
5792 .addr = addr,
5793 .len = len,
5794 };
5795
5796 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5797 if (!bus)
5798 return -ENOMEM;
5799 r = __kvm_io_bus_write(vcpu, bus, &range, val);
5800 return r < 0 ? r : 0;
5801 }
5802 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5803
5804 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5805 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5806 gpa_t addr, int len, const void *val, long cookie)
5807 {
5808 struct kvm_io_bus *bus;
5809 struct kvm_io_range range;
5810
5811 range = (struct kvm_io_range) {
5812 .addr = addr,
5813 .len = len,
5814 };
5815
5816 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5817 if (!bus)
5818 return -ENOMEM;
5819
5820 /* First try the device referenced by cookie. */
5821 if ((cookie >= 0) && (cookie < bus->dev_count) &&
5822 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5823 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5824 val))
5825 return cookie;
5826
5827 /*
5828 * cookie contained garbage; fall back to search and return the
5829 * correct cookie value.
5830 */
5831 return __kvm_io_bus_write(vcpu, bus, &range, val);
5832 }
5833
5834 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5835 struct kvm_io_range *range, void *val)
5836 {
5837 int idx;
5838
5839 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5840 if (idx < 0)
5841 return -EOPNOTSUPP;
5842
5843 while (idx < bus->dev_count &&
5844 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5845 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5846 range->len, val))
5847 return idx;
5848 idx++;
5849 }
5850
5851 return -EOPNOTSUPP;
5852 }
5853
5854 /* kvm_io_bus_read - called under kvm->slots_lock */
5855 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5856 int len, void *val)
5857 {
5858 struct kvm_io_bus *bus;
5859 struct kvm_io_range range;
5860 int r;
5861
5862 range = (struct kvm_io_range) {
5863 .addr = addr,
5864 .len = len,
5865 };
5866
5867 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5868 if (!bus)
5869 return -ENOMEM;
5870 r = __kvm_io_bus_read(vcpu, bus, &range, val);
5871 return r < 0 ? r : 0;
5872 }
5873
5874 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5875 int len, struct kvm_io_device *dev)
5876 {
5877 int i;
5878 struct kvm_io_bus *new_bus, *bus;
5879 struct kvm_io_range range;
5880
5881 lockdep_assert_held(&kvm->slots_lock);
5882
5883 bus = kvm_get_bus(kvm, bus_idx);
5884 if (!bus)
5885 return -ENOMEM;
5886
5887 /* exclude ioeventfd which is limited by maximum fd */
5888 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5889 return -ENOSPC;
5890
5891 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5892 GFP_KERNEL_ACCOUNT);
5893 if (!new_bus)
5894 return -ENOMEM;
5895
5896 range = (struct kvm_io_range) {
5897 .addr = addr,
5898 .len = len,
5899 .dev = dev,
5900 };
5901
5902 for (i = 0; i < bus->dev_count; i++)
5903 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5904 break;
5905
5906 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5907 new_bus->dev_count++;
5908 new_bus->range[i] = range;
5909 memcpy(new_bus->range + i + 1, bus->range + i,
5910 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5911 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5912 synchronize_srcu_expedited(&kvm->srcu);
5913 kfree(bus);
5914
5915 return 0;
5916 }
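
/*
 * The update above is copy-and-publish: a larger bus array is allocated,
 * the new device is spliced in at its sorted position, the new array is
 * published with rcu_assign_pointer(), and the old one is freed only after
 * synchronize_srcu_expedited() guarantees no reader still holds it under
 * kvm->srcu.
 */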
5917
5918 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5919 struct kvm_io_device *dev)
5920 {
5921 int i;
5922 struct kvm_io_bus *new_bus, *bus;
5923
5924 lockdep_assert_held(&kvm->slots_lock);
5925
5926 bus = kvm_get_bus(kvm, bus_idx);
5927 if (!bus)
5928 return 0;
5929
5930 for (i = 0; i < bus->dev_count; i++) {
5931 if (bus->range[i].dev == dev) {
5932 break;
5933 }
5934 }
5935
5936 if (i == bus->dev_count)
5937 return 0;
5938
5939 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5940 GFP_KERNEL_ACCOUNT);
5941 if (new_bus) {
5942 memcpy(new_bus, bus, struct_size(bus, range, i));
5943 new_bus->dev_count--;
5944 memcpy(new_bus->range + i, bus->range + i + 1,
5945 flex_array_size(new_bus, range, new_bus->dev_count - i));
5946 }
5947
5948 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5949 synchronize_srcu_expedited(&kvm->srcu);
5950
5951 /*
5952 * If NULL bus is installed, destroy the old bus, including all the
5953 * attached devices. Otherwise, destroy the caller's device only.
5954 */
5955 if (!new_bus) {
5956 pr_err("kvm: failed to shrink bus, removing it completely\n");
5957 kvm_io_bus_destroy(bus);
5958 return -ENOMEM;
5959 }
5960
5961 kvm_iodevice_destructor(dev);
5962 kfree(bus);
5963 return 0;
5964 }
5965
5966 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5967 gpa_t addr)
5968 {
5969 struct kvm_io_bus *bus;
5970 int dev_idx, srcu_idx;
5971 struct kvm_io_device *iodev = NULL;
5972
5973 srcu_idx = srcu_read_lock(&kvm->srcu);
5974
5975 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5976 if (!bus)
5977 goto out_unlock;
5978
5979 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5980 if (dev_idx < 0)
5981 goto out_unlock;
5982
5983 iodev = bus->range[dev_idx].dev;
5984
5985 out_unlock:
5986 srcu_read_unlock(&kvm->srcu, srcu_idx);
5987
5988 return iodev;
5989 }
5990 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5991
5992 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5993 int (*get)(void *, u64 *), int (*set)(void *, u64),
5994 const char *fmt)
5995 {
5996 int ret;
5997 struct kvm_stat_data *stat_data = inode->i_private;
5998
5999 /*
6000 * The debugfs files are a reference to the kvm struct which
6001 * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
6002 * avoids the race between open and the removal of the debugfs directory.
6003 */
6004 if (!kvm_get_kvm_safe(stat_data->kvm))
6005 return -ENOENT;
6006
6007 ret = simple_attr_open(inode, file, get,
6008 kvm_stats_debugfs_mode(stat_data->desc) & 0222
6009 ? set : NULL, fmt);
6010 if (ret)
6011 kvm_put_kvm(stat_data->kvm);
6012
6013 return ret;
6014 }
6015
6016 static int kvm_debugfs_release(struct inode *inode, struct file *file)
6017 {
6018 struct kvm_stat_data *stat_data = inode->i_private;
6019
6020 simple_attr_release(inode, file);
6021 kvm_put_kvm(stat_data->kvm);
6022
6023 return 0;
6024 }
6025
6026 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
6027 {
6028 *val = *(u64 *)((void *)(&kvm->stat) + offset);
6029
6030 return 0;
6031 }
6032
6033 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
6034 {
6035 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
6036
6037 return 0;
6038 }
6039
6040 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
6041 {
6042 unsigned long i;
6043 struct kvm_vcpu *vcpu;
6044
6045 *val = 0;
6046
6047 kvm_for_each_vcpu(i, vcpu, kvm)
6048 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
6049
6050 return 0;
6051 }
6052
6053 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
6054 {
6055 unsigned long i;
6056 struct kvm_vcpu *vcpu;
6057
6058 kvm_for_each_vcpu(i, vcpu, kvm)
6059 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
6060
6061 return 0;
6062 }
6063
6064 static int kvm_stat_data_get(void *data, u64 *val)
6065 {
6066 int r = -EFAULT;
6067 struct kvm_stat_data *stat_data = data;
6068
6069 switch (stat_data->kind) {
6070 case KVM_STAT_VM:
6071 r = kvm_get_stat_per_vm(stat_data->kvm,
6072 stat_data->desc->desc.offset, val);
6073 break;
6074 case KVM_STAT_VCPU:
6075 r = kvm_get_stat_per_vcpu(stat_data->kvm,
6076 stat_data->desc->desc.offset, val);
6077 break;
6078 }
6079
6080 return r;
6081 }
6082
6083 static int kvm_stat_data_clear(void *data, u64 val)
6084 {
6085 int r = -EFAULT;
6086 struct kvm_stat_data *stat_data = data;
6087
6088 if (val)
6089 return -EINVAL;
6090
6091 switch (stat_data->kind) {
6092 case KVM_STAT_VM:
6093 r = kvm_clear_stat_per_vm(stat_data->kvm,
6094 stat_data->desc->desc.offset);
6095 break;
6096 case KVM_STAT_VCPU:
6097 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
6098 stat_data->desc->desc.offset);
6099 break;
6100 }
6101
6102 return r;
6103 }
6104
6105 static int kvm_stat_data_open(struct inode *inode, struct file *file)
6106 {
6107 __simple_attr_check_format("%llu\n", 0ull);
6108 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6109 kvm_stat_data_clear, "%llu\n");
6110 }
6111
6112 static const struct file_operations stat_fops_per_vm = {
6113 .owner = THIS_MODULE,
6114 .open = kvm_stat_data_open,
6115 .release = kvm_debugfs_release,
6116 .read = simple_attr_read,
6117 .write = simple_attr_write,
6118 .llseek = no_llseek,
6119 };
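
/*
 * Usage note: reading one of these per-VM debugfs stat files yields the
 * current counter as "%llu\n"; writing "0" clears it, any other value is
 * rejected with -EINVAL (see kvm_stat_data_clear() above), and files whose
 * descriptor lacks write permission are opened without a set handler, i.e.
 * read-only.
 */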
6120
6121 static int vm_stat_get(void *_offset, u64 *val)
6122 {
6123 unsigned offset = (long)_offset;
6124 struct kvm *kvm;
6125 u64 tmp_val;
6126
6127 *val = 0;
6128 mutex_lock(&kvm_lock);
6129 list_for_each_entry(kvm, &vm_list, vm_list) {
6130 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
6131 *val += tmp_val;
6132 }
6133 mutex_unlock(&kvm_lock);
6134 return 0;
6135 }
6136
6137 static int vm_stat_clear(void *_offset, u64 val)
6138 {
6139 unsigned offset = (long)_offset;
6140 struct kvm *kvm;
6141
6142 if (val)
6143 return -EINVAL;
6144
6145 mutex_lock(&kvm_lock);
6146 list_for_each_entry(kvm, &vm_list, vm_list) {
6147 kvm_clear_stat_per_vm(kvm, offset);
6148 }
6149 mutex_unlock(&kvm_lock);
6150
6151 return 0;
6152 }
6153
6154 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
6155 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
6156
6157 static int vcpu_stat_get(void *_offset, u64 *val)
6158 {
6159 unsigned offset = (long)_offset;
6160 struct kvm *kvm;
6161 u64 tmp_val;
6162
6163 *val = 0;
6164 mutex_lock(&kvm_lock);
6165 list_for_each_entry(kvm, &vm_list, vm_list) {
6166 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
6167 *val += tmp_val;
6168 }
6169 mutex_unlock(&kvm_lock);
6170 return 0;
6171 }
6172
6173 static int vcpu_stat_clear(void *_offset, u64 val)
6174 {
6175 unsigned offset = (long)_offset;
6176 struct kvm *kvm;
6177
6178 if (val)
6179 return -EINVAL;
6180
6181 mutex_lock(&kvm_lock);
6182 list_for_each_entry(kvm, &vm_list, vm_list) {
6183 kvm_clear_stat_per_vcpu(kvm, offset);
6184 }
6185 mutex_unlock(&kvm_lock);
6186
6187 return 0;
6188 }
6189
6190 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6191 "%llu\n");
6192 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
6193
6194 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6195 {
6196 struct kobj_uevent_env *env;
6197 unsigned long long created, active;
6198
6199 if (!kvm_dev.this_device || !kvm)
6200 return;
6201
6202 mutex_lock(&kvm_lock);
6203 if (type == KVM_EVENT_CREATE_VM) {
6204 kvm_createvm_count++;
6205 kvm_active_vms++;
6206 } else if (type == KVM_EVENT_DESTROY_VM) {
6207 kvm_active_vms--;
6208 }
6209 created = kvm_createvm_count;
6210 active = kvm_active_vms;
6211 mutex_unlock(&kvm_lock);
6212
6213 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
6214 if (!env)
6215 return;
6216
6217 add_uevent_var(env, "CREATED=%llu", created);
6218 add_uevent_var(env, "COUNT=%llu", active);
6219
6220 if (type == KVM_EVENT_CREATE_VM) {
6221 add_uevent_var(env, "EVENT=create");
6222 kvm->userspace_pid = task_pid_nr(current);
6223 } else if (type == KVM_EVENT_DESTROY_VM) {
6224 add_uevent_var(env, "EVENT=destroy");
6225 }
6226 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
6227
6228 if (!IS_ERR(kvm->debugfs_dentry)) {
6229 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
6230
6231 if (p) {
6232 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6233 if (!IS_ERR(tmp))
6234 add_uevent_var(env, "STATS_PATH=%s", tmp);
6235 kfree(p);
6236 }
6237 }
6238 /* no need for checks, since we are adding at most only 5 keys */
6239 env->envp[env->envp_idx++] = NULL;
6240 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6241 kfree(env);
6242 }
6243
6244 static void kvm_init_debug(void)
6245 {
6246 const struct file_operations *fops;
6247 const struct _kvm_stats_desc *pdesc;
6248 int i;
6249
6250 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6251
6252 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6253 pdesc = &kvm_vm_stats_desc[i];
6254 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6255 fops = &vm_stat_fops;
6256 else
6257 fops = &vm_stat_readonly_fops;
6258 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6259 kvm_debugfs_dir,
6260 (void *)(long)pdesc->desc.offset, fops);
6261 }
6262
6263 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6264 pdesc = &kvm_vcpu_stats_desc[i];
6265 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6266 fops = &vcpu_stat_fops;
6267 else
6268 fops = &vcpu_stat_readonly_fops;
6269 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6270 kvm_debugfs_dir,
6271 (void *)(long)pdesc->desc.offset, fops);
6272 }
6273 }
6274
6275 static inline
6276 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6277 {
6278 return container_of(pn, struct kvm_vcpu, preempt_notifier);
6279 }
6280
6281 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6282 {
6283 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6284
6285 WRITE_ONCE(vcpu->preempted, false);
6286 WRITE_ONCE(vcpu->ready, false);
6287
6288 __this_cpu_write(kvm_running_vcpu, vcpu);
6289 kvm_arch_sched_in(vcpu, cpu);
6290 kvm_arch_vcpu_load(vcpu, cpu);
6291 }
6292
6293 static void kvm_sched_out(struct preempt_notifier *pn,
6294 struct task_struct *next)
6295 {
6296 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6297
6298 if (current->on_rq) {
6299 WRITE_ONCE(vcpu->preempted, true);
6300 WRITE_ONCE(vcpu->ready, true);
6301 }
6302 kvm_arch_vcpu_put(vcpu);
6303 __this_cpu_write(kvm_running_vcpu, NULL);
6304 }
6305
6306 /**
6307 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
6308 *
6309 * We can disable preemption locally around accessing the per-CPU variable,
6310 * and use the resolved vcpu pointer after enabling preemption again,
6311 * because even if the current thread is migrated to another CPU, reading
6312 * the per-CPU value later will give us the same value as we update the
6313 * per-CPU variable in the preempt notifier handlers.
6314 */
6315 struct kvm_vcpu *kvm_get_running_vcpu(void)
6316 {
6317 struct kvm_vcpu *vcpu;
6318
6319 preempt_disable();
6320 vcpu = __this_cpu_read(kvm_running_vcpu);
6321 preempt_enable();
6322
6323 return vcpu;
6324 }
6325 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
6326
6327 /**
6328 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6329 */
6330 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6331 {
6332 return &kvm_running_vcpu;
6333 }
6334
6335 #ifdef CONFIG_GUEST_PERF_EVENTS
6336 static unsigned int kvm_guest_state(void)
6337 {
6338 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6339 unsigned int state;
6340
6341 if (!kvm_arch_pmi_in_guest(vcpu))
6342 return 0;
6343
6344 state = PERF_GUEST_ACTIVE;
6345 if (!kvm_arch_vcpu_in_kernel(vcpu))
6346 state |= PERF_GUEST_USER;
6347
6348 return state;
6349 }
6350
6351 static unsigned long kvm_guest_get_ip(void)
6352 {
6353 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6354
6355 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6356 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6357 return 0;
6358
6359 return kvm_arch_vcpu_get_ip(vcpu);
6360 }
6361
6362 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6363 .state = kvm_guest_state,
6364 .get_ip = kvm_guest_get_ip,
6365 .handle_intel_pt_intr = NULL,
6366 };
6367
6368 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6369 {
6370 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6371 perf_register_guest_info_callbacks(&kvm_guest_cbs);
6372 }
6373 void kvm_unregister_perf_callbacks(void)
6374 {
6375 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6376 }
6377 #endif
6378
6379 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6380 {
6381 int r;
6382 int cpu;
6383
6384 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6385 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6386 kvm_online_cpu, kvm_offline_cpu);
6387 if (r)
6388 return r;
6389
6390 register_syscore_ops(&kvm_syscore_ops);
6391 #endif
6392
6393 /* A kmem cache lets us meet the alignment requirements of fx_save. */
6394 if (!vcpu_align)
6395 vcpu_align = __alignof__(struct kvm_vcpu);
6396 kvm_vcpu_cache =
6397 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6398 SLAB_ACCOUNT,
6399 offsetof(struct kvm_vcpu, arch),
6400 offsetofend(struct kvm_vcpu, stats_id)
6401 - offsetof(struct kvm_vcpu, arch),
6402 NULL);
6403 if (!kvm_vcpu_cache) {
6404 r = -ENOMEM;
6405 goto err_vcpu_cache;
6406 }
6407
6408 for_each_possible_cpu(cpu) {
6409 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6410 GFP_KERNEL, cpu_to_node(cpu))) {
6411 r = -ENOMEM;
6412 goto err_cpu_kick_mask;
6413 }
6414 }
6415
6416 r = kvm_irqfd_init();
6417 if (r)
6418 goto err_irqfd;
6419
6420 r = kvm_async_pf_init();
6421 if (r)
6422 goto err_async_pf;
6423
6424 kvm_chardev_ops.owner = module;
6425 kvm_vm_fops.owner = module;
6426 kvm_vcpu_fops.owner = module;
6427 kvm_device_fops.owner = module;
6428
6429 kvm_preempt_ops.sched_in = kvm_sched_in;
6430 kvm_preempt_ops.sched_out = kvm_sched_out;
6431
6432 kvm_init_debug();
6433
6434 r = kvm_vfio_ops_init();
6435 if (WARN_ON_ONCE(r))
6436 goto err_vfio;
6437
6438 kvm_gmem_init(module);
6439
6440 /*
6441 * Registration _must_ be the very last thing done, as this exposes
6442 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6443 */
6444 r = misc_register(&kvm_dev);
6445 if (r) {
6446 pr_err("kvm: misc device register failed\n");
6447 goto err_register;
6448 }
6449
6450 return 0;
6451
6452 err_register:
6453 kvm_vfio_ops_exit();
6454 err_vfio:
6455 kvm_async_pf_deinit();
6456 err_async_pf:
6457 kvm_irqfd_exit();
6458 err_irqfd:
6459 err_cpu_kick_mask:
6460 for_each_possible_cpu(cpu)
6461 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6462 kmem_cache_destroy(kvm_vcpu_cache);
6463 err_vcpu_cache:
6464 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6465 unregister_syscore_ops(&kvm_syscore_ops);
6466 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6467 #endif
6468 return r;
6469 }
6470 EXPORT_SYMBOL_GPL(kvm_init);
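
/*
 * A sketch of the expected caller side (illustrative; loosely modeled on the
 * x86/VMX module init path, and not something this file enforces): an
 * architecture module passes the size and alignment of its vCPU container
 * plus its own module pointer once all arch setup is done, e.g.
 *
 *	r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
 *		     THIS_MODULE);
 *
 * kvm_init() must come last because misc_register() above makes /dev/kvm
 * visible to userspace immediately.
 */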
6471
6472 void kvm_exit(void)
6473 {
6474 int cpu;
6475
6476 /*
6477 * Note, unregistering /dev/kvm doesn't strictly need to come first,
6478 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6479 * to KVM while the module is being stopped.
6480 */
6481 misc_deregister(&kvm_dev);
6482
6483 debugfs_remove_recursive(kvm_debugfs_dir);
6484 for_each_possible_cpu(cpu)
6485 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6486 kmem_cache_destroy(kvm_vcpu_cache);
6487 kvm_vfio_ops_exit();
6488 kvm_async_pf_deinit();
6489 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6490 unregister_syscore_ops(&kvm_syscore_ops);
6491 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6492 #endif
6493 kvm_irqfd_exit();
6494 }
6495 EXPORT_SYMBOL_GPL(kvm_exit);
6496
6497 struct kvm_vm_worker_thread_context {
6498 struct kvm *kvm;
6499 struct task_struct *parent;
6500 struct completion init_done;
6501 kvm_vm_thread_fn_t thread_fn;
6502 uintptr_t data;
6503 int err;
6504 };
6505
6506 static int kvm_vm_worker_thread(void *context)
6507 {
6508 /*
6509 * The init_context is allocated on the stack of the parent thread, so
6510 * we have to locally copy anything that is needed beyond initialization
6511 */
6512 struct kvm_vm_worker_thread_context *init_context = context;
6513 struct task_struct *parent;
6514 struct kvm *kvm = init_context->kvm;
6515 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6516 uintptr_t data = init_context->data;
6517 int err;
6518
6519 err = kthread_park(current);
6520 /* kthread_park(current) is never supposed to return an error */
6521 WARN_ON(err != 0);
6522 if (err)
6523 goto init_complete;
6524
6525 err = cgroup_attach_task_all(init_context->parent, current);
6526 if (err) {
6527 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6528 __func__, err);
6529 goto init_complete;
6530 }
6531
6532 set_user_nice(current, task_nice(init_context->parent));
6533
6534 init_complete:
6535 init_context->err = err;
6536 complete(&init_context->init_done);
6537 init_context = NULL;
6538
6539 if (err)
6540 goto out;
6541
6542 /* Wait to be woken up by the spawner before proceeding. */
6543 kthread_parkme();
6544
6545 if (!kthread_should_stop())
6546 err = thread_fn(kvm, data);
6547
6548 out:
6549 /*
6550 * Move kthread back to its original cgroup to prevent it lingering in
6551 * the cgroup of the VM process, after the latter finishes its
6552 * execution.
6553 *
6554 * kthread_stop() waits on the 'exited' completion condition which is
6555 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6556 * kthread is removed from the cgroup in the cgroup_exit() which is
6557 * called after the exit_mm(). This causes the kthread_stop() to return
6558 * before the kthread actually quits the cgroup.
6559 */
6560 rcu_read_lock();
6561 parent = rcu_dereference(current->real_parent);
6562 get_task_struct(parent);
6563 rcu_read_unlock();
6564 cgroup_attach_task_all(parent, current);
6565 put_task_struct(parent);
6566
6567 return err;
6568 }
6569
6570 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6571 uintptr_t data, const char *name,
6572 struct task_struct **thread_ptr)
6573 {
6574 struct kvm_vm_worker_thread_context init_context = {};
6575 struct task_struct *thread;
6576
6577 *thread_ptr = NULL;
6578 init_context.kvm = kvm;
6579 init_context.parent = current;
6580 init_context.thread_fn = thread_fn;
6581 init_context.data = data;
6582 init_completion(&init_context.init_done);
6583
6584 thread = kthread_run(kvm_vm_worker_thread, &init_context,
6585 "%s-%d", name, task_pid_nr(current));
6586 if (IS_ERR(thread))
6587 return PTR_ERR(thread);
6588
6589 /* kthread_run is never supposed to return NULL */
6590 WARN_ON(thread == NULL);
6591
6592 wait_for_completion(&init_context.init_done);
6593
6594 if (!init_context.err)
6595 *thread_ptr = thread;
6596
6597 return init_context.err;
6598 }
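
/*
 * Usage note: on success the new kthread is left parked; the caller is
 * expected to kthread_unpark() it when work should start and to tear it
 * down with kthread_stop(). (For orientation: x86 creates its NX huge-page
 * recovery worker through this helper, though nothing in this file depends
 * on that.)
 */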
6599