1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Device driver to expose SGX enclave memory to KVM guests.
4 *
5 * Copyright(c) 2021 Intel Corporation.
6 */
7
8 #include <linux/miscdevice.h>
9 #include <linux/mm.h>
10 #include <linux/mman.h>
11 #include <linux/sched/mm.h>
12 #include <linux/sched/signal.h>
13 #include <linux/slab.h>
14 #include <linux/xarray.h>
15 #include <asm/sgx.h>
16 #include <uapi/asm/sgx.h>
17
18 #include "encls.h"
19 #include "sgx.h"
20
21 struct sgx_vepc {
22 struct xarray page_array;
23 struct mutex lock;
24 };
25
26 /*
27 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
28 * virtual EPC instances, and the lock to protect it.
29 */
30 static struct mutex zombie_secs_pages_lock;
31 static struct list_head zombie_secs_pages;
32
__sgx_vepc_fault(struct sgx_vepc * vepc,struct vm_area_struct * vma,unsigned long addr)33 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
34 struct vm_area_struct *vma, unsigned long addr)
35 {
36 struct sgx_epc_page *epc_page;
37 unsigned long index, pfn;
38 int ret;
39
40 WARN_ON(!mutex_is_locked(&vepc->lock));
41
42 /* Calculate index of EPC page in virtual EPC's page_array */
43 index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
44
45 epc_page = xa_load(&vepc->page_array, index);
46 if (epc_page)
47 return 0;
48
49 epc_page = sgx_alloc_epc_page(vepc, false);
50 if (IS_ERR(epc_page))
51 return PTR_ERR(epc_page);
52
53 ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
54 if (ret)
55 goto err_free;
56
57 pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
58
59 ret = vmf_insert_pfn(vma, addr, pfn);
60 if (ret != VM_FAULT_NOPAGE) {
61 ret = -EFAULT;
62 goto err_delete;
63 }
64
65 return 0;
66
67 err_delete:
68 xa_erase(&vepc->page_array, index);
69 err_free:
70 sgx_free_epc_page(epc_page);
71 return ret;
72 }
73
sgx_vepc_fault(struct vm_fault * vmf)74 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
75 {
76 struct vm_area_struct *vma = vmf->vma;
77 struct sgx_vepc *vepc = vma->vm_private_data;
78 int ret;
79
80 mutex_lock(&vepc->lock);
81 ret = __sgx_vepc_fault(vepc, vma, vmf->address);
82 mutex_unlock(&vepc->lock);
83
84 if (!ret)
85 return VM_FAULT_NOPAGE;
86
87 if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
88 mmap_read_unlock(vma->vm_mm);
89 return VM_FAULT_RETRY;
90 }
91
92 return VM_FAULT_SIGBUS;
93 }
94
95 static const struct vm_operations_struct sgx_vepc_vm_ops = {
96 .fault = sgx_vepc_fault,
97 };
98
sgx_vepc_mmap(struct file * file,struct vm_area_struct * vma)99 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
100 {
101 struct sgx_vepc *vepc = file->private_data;
102
103 if (!(vma->vm_flags & VM_SHARED))
104 return -EINVAL;
105
106 vma->vm_ops = &sgx_vepc_vm_ops;
107 /* Don't copy VMA in fork() */
108 vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
109 vma->vm_private_data = vepc;
110
111 return 0;
112 }
113
sgx_vepc_free_page(struct sgx_epc_page * epc_page)114 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
115 {
116 int ret;
117
118 /*
119 * Take a previously guest-owned EPC page and return it to the
120 * general EPC page pool.
121 *
122 * Guests can not be trusted to have left this page in a good
123 * state, so run EREMOVE on the page unconditionally. In the
124 * case that a guest properly EREMOVE'd this page, a superfluous
125 * EREMOVE is harmless.
126 */
127 ret = __eremove(sgx_get_epc_virt_addr(epc_page));
128 if (ret) {
129 /*
130 * Only SGX_CHILD_PRESENT is expected, which is because of
131 * EREMOVE'ing an SECS still with child, in which case it can
132 * be handled by EREMOVE'ing the SECS again after all pages in
133 * virtual EPC have been EREMOVE'd. See comments in below in
134 * sgx_vepc_release().
135 *
136 * The user of virtual EPC (KVM) needs to guarantee there's no
137 * logical processor is still running in the enclave in guest,
138 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
139 * handled here.
140 */
141 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
142 ret, ret);
143 return ret;
144 }
145
146 sgx_free_epc_page(epc_page);
147
148 return 0;
149 }
150
sgx_vepc_release(struct inode * inode,struct file * file)151 static int sgx_vepc_release(struct inode *inode, struct file *file)
152 {
153 struct sgx_vepc *vepc = file->private_data;
154 struct sgx_epc_page *epc_page, *tmp, *entry;
155 unsigned long index;
156
157 LIST_HEAD(secs_pages);
158
159 xa_for_each(&vepc->page_array, index, entry) {
160 /*
161 * Remove all normal, child pages. sgx_vepc_free_page()
162 * will fail if EREMOVE fails, but this is OK and expected on
163 * SECS pages. Those can only be EREMOVE'd *after* all their
164 * child pages. Retries below will clean them up.
165 */
166 if (sgx_vepc_free_page(entry))
167 continue;
168
169 xa_erase(&vepc->page_array, index);
170 }
171
172 /*
173 * Retry EREMOVE'ing pages. This will clean up any SECS pages that
174 * only had children in this 'epc' area.
175 */
176 xa_for_each(&vepc->page_array, index, entry) {
177 epc_page = entry;
178 /*
179 * An EREMOVE failure here means that the SECS page still
180 * has children. But, since all children in this 'sgx_vepc'
181 * have been removed, the SECS page must have a child on
182 * another instance.
183 */
184 if (sgx_vepc_free_page(epc_page))
185 list_add_tail(&epc_page->list, &secs_pages);
186
187 xa_erase(&vepc->page_array, index);
188 }
189
190 /*
191 * SECS pages are "pinned" by child pages, and "unpinned" once all
192 * children have been EREMOVE'd. A child page in this instance
193 * may have pinned an SECS page encountered in an earlier release(),
194 * creating a zombie. Since some children were EREMOVE'd above,
195 * try to EREMOVE all zombies in the hopes that one was unpinned.
196 */
197 mutex_lock(&zombie_secs_pages_lock);
198 list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
199 /*
200 * Speculatively remove the page from the list of zombies,
201 * if the page is successfully EREMOVE'd it will be added to
202 * the list of free pages. If EREMOVE fails, throw the page
203 * on the local list, which will be spliced on at the end.
204 */
205 list_del(&epc_page->list);
206
207 if (sgx_vepc_free_page(epc_page))
208 list_add_tail(&epc_page->list, &secs_pages);
209 }
210
211 if (!list_empty(&secs_pages))
212 list_splice_tail(&secs_pages, &zombie_secs_pages);
213 mutex_unlock(&zombie_secs_pages_lock);
214
215 kfree(vepc);
216
217 return 0;
218 }
219
sgx_vepc_open(struct inode * inode,struct file * file)220 static int sgx_vepc_open(struct inode *inode, struct file *file)
221 {
222 struct sgx_vepc *vepc;
223
224 vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
225 if (!vepc)
226 return -ENOMEM;
227 mutex_init(&vepc->lock);
228 xa_init(&vepc->page_array);
229
230 file->private_data = vepc;
231
232 return 0;
233 }
234
235 static const struct file_operations sgx_vepc_fops = {
236 .owner = THIS_MODULE,
237 .open = sgx_vepc_open,
238 .release = sgx_vepc_release,
239 .mmap = sgx_vepc_mmap,
240 };
241
242 static struct miscdevice sgx_vepc_dev = {
243 .minor = MISC_DYNAMIC_MINOR,
244 .name = "sgx_vepc",
245 .nodename = "sgx_vepc",
246 .fops = &sgx_vepc_fops,
247 };
248
sgx_vepc_init(void)249 int __init sgx_vepc_init(void)
250 {
251 /* SGX virtualization requires KVM to work */
252 if (!cpu_feature_enabled(X86_FEATURE_VMX))
253 return -ENODEV;
254
255 INIT_LIST_HEAD(&zombie_secs_pages);
256 mutex_init(&zombie_secs_pages_lock);
257
258 return misc_register(&sgx_vepc_dev);
259 }
260
261 /**
262 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
263 * @pageinfo: Pointer to PAGEINFO structure
264 * @secs: Userspace pointer to SECS page
265 * @trapnr: trap number injected to guest in case of ECREATE error
266 *
267 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
268 * of enforcing policies of guest's enclaves, and return the trap number
269 * which should be injected to guest in case of any ECREATE error.
270 *
271 * Return:
272 * - 0: ECREATE was successful.
273 * - <0: on error.
274 */
sgx_virt_ecreate(struct sgx_pageinfo * pageinfo,void __user * secs,int * trapnr)275 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
276 int *trapnr)
277 {
278 int ret;
279
280 /*
281 * @secs is an untrusted, userspace-provided address. It comes from
282 * KVM and is assumed to be a valid pointer which points somewhere in
283 * userspace. This can fault and call SGX or other fault handlers when
284 * userspace mapping @secs doesn't exist.
285 *
286 * Add a WARN() to make sure @secs is already valid userspace pointer
287 * from caller (KVM), who should already have handled invalid pointer
288 * case (for instance, made by malicious guest). All other checks,
289 * such as alignment of @secs, are deferred to ENCLS itself.
290 */
291 if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
292 return -EINVAL;
293
294 __uaccess_begin();
295 ret = __ecreate(pageinfo, (void *)secs);
296 __uaccess_end();
297
298 if (encls_faulted(ret)) {
299 *trapnr = ENCLS_TRAPNR(ret);
300 return -EFAULT;
301 }
302
303 /* ECREATE doesn't return an error code, it faults or succeeds. */
304 WARN_ON_ONCE(ret);
305 return 0;
306 }
307 EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
308
/* Number of bytes of the EINITTOKEN that are range-checked before EINIT. */
#define SGX_EINITTOKEN_SIZE	304

/*
 * Range-check the three userspace pointers and run EINIT on them.
 *
 * Returns -EINVAL (after a one-time WARN) when any pointer fails
 * access_ok(); otherwise returns whatever __einit() returned.
 */
static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
			    void __user *secs)
{
	int ret;

	/*
	 * Make sure all userspace pointers from the caller (KVM) are valid.
	 * All other checks are deferred to ENCLS itself.  Also see the
	 * comment for @secs in sgx_virt_ecreate().
	 */
	if (WARN_ON_ONCE(!(access_ok(sigstruct, sizeof(struct sgx_sigstruct)) &&
			   access_ok(token, SGX_EINITTOKEN_SIZE) &&
			   access_ok(secs, PAGE_SIZE))))
		return -EINVAL;

	__uaccess_begin();
	ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
	__uaccess_end();

	return ret;
}
331
332 /**
333 * sgx_virt_einit() - Run EINIT on behalf of guest
334 * @sigstruct: Userspace pointer to SIGSTRUCT structure
335 * @token: Userspace pointer to EINITTOKEN structure
336 * @secs: Userspace pointer to SECS page
337 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
338 * @trapnr: trap number injected to guest in case of EINIT error
339 *
340 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
341 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
342 * needs to update hardware values to guest's virtual MSR values in order to
343 * ensure EINIT is executed with expected hardware values.
344 *
345 * Return:
346 * - 0: EINIT was successful.
347 * - <0: on error.
348 */
sgx_virt_einit(void __user * sigstruct,void __user * token,void __user * secs,u64 * lepubkeyhash,int * trapnr)349 int sgx_virt_einit(void __user *sigstruct, void __user *token,
350 void __user *secs, u64 *lepubkeyhash, int *trapnr)
351 {
352 int ret;
353
354 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
355 ret = __sgx_virt_einit(sigstruct, token, secs);
356 } else {
357 preempt_disable();
358
359 sgx_update_lepubkeyhash(lepubkeyhash);
360
361 ret = __sgx_virt_einit(sigstruct, token, secs);
362 preempt_enable();
363 }
364
365 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
366 if (ret == -EINVAL)
367 return ret;
368
369 if (encls_faulted(ret)) {
370 *trapnr = ENCLS_TRAPNR(ret);
371 return -EFAULT;
372 }
373
374 return ret;
375 }
376 EXPORT_SYMBOL_GPL(sgx_virt_einit);
377