xref: /openbsd/sys/arch/amd64/amd64/locore.S (revision f4d0a262)
1/*	$OpenBSD: locore.S,v 1.148 2024/08/02 22:24:51 guenther Exp $	*/
2/*	$NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $	*/
3
4/*
5 * Copyright-o-rama!
6 */
7
8/*
9 * Copyright (c) 2001 Wasabi Systems, Inc.
10 * All rights reserved.
11 *
12 * Written by Frank van der Linden for Wasabi Systems, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 *    must display the following acknowledgement:
24 *      This product includes software developed for the NetBSD Project by
25 *      Wasabi Systems, Inc.
26 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
27 *    or promote products derived from this software without specific prior
28 *    written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
32 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
33 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
34 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
35 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
36 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
37 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
38 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
39 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
40 * POSSIBILITY OF SUCH DAMAGE.
41 */
42
43
44/*-
45 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
46 * All rights reserved.
47 *
48 * This code is derived from software contributed to The NetBSD Foundation
49 * by Charles M. Hannum.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions
53 * are met:
54 * 1. Redistributions of source code must retain the above copyright
55 *    notice, this list of conditions and the following disclaimer.
56 * 2. Redistributions in binary form must reproduce the above copyright
57 *    notice, this list of conditions and the following disclaimer in the
58 *    documentation and/or other materials provided with the distribution.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
61 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
62 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
63 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
64 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
65 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
66 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
67 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
68 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
70 * POSSIBILITY OF SUCH DAMAGE.
71 */
72
73/*-
74 * Copyright (c) 1990 The Regents of the University of California.
75 * All rights reserved.
76 *
77 * This code is derived from software contributed to Berkeley by
78 * William Jolitz.
79 *
80 * Redistribution and use in source and binary forms, with or without
81 * modification, are permitted provided that the following conditions
82 * are met:
83 * 1. Redistributions of source code must retain the above copyright
84 *    notice, this list of conditions and the following disclaimer.
85 * 2. Redistributions in binary form must reproduce the above copyright
86 *    notice, this list of conditions and the following disclaimer in the
87 *    documentation and/or other materials provided with the distribution.
88 * 3. Neither the name of the University nor the names of its contributors
89 *    may be used to endorse or promote products derived from this software
90 *    without specific prior written permission.
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
93 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
95 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
96 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
97 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
98 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
99 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
100 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
101 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
102 * SUCH DAMAGE.
103 *
104 *	@(#)locore.s	7.3 (Berkeley) 5/13/91
105 */
106
107#include "assym.h"
108#include "efi.h"
109#include "lapic.h"
110#include "ksyms.h"
111#include "xen.h"
112#include "hyperv.h"
113
114#include <sys/syscall.h>
115
116#include <machine/param.h>
117#include <machine/codepatch.h>
118#include <machine/psl.h>
119#include <machine/segments.h>
120#include <machine/specialreg.h>
121#include <machine/trap.h>			/* T_PROTFLT */
122#include <machine/frameasm.h>
123
124#if NLAPIC > 0
125#include <machine/i82489reg.h>
126#endif
127
128/*
129 * override user-land alignment before including asm.h
130 */
131#define	ALIGN_DATA	.align	8,0xcc
132
133#include <machine/asm.h>
134
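/*
 * SET_CURPROC(proc,cpu) marks "proc" as running on this CPU: it loads
 * the cpu_info self-pointer into "cpu", stores "proc" in ci_curproc,
 * and points p_cpu back at the CPU.  GET/SET_CURPCB move ci_curpcb in
 * and out of a register the same way.
 */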
135#define SET_CURPROC(proc,cpu)			\
136	movq	CPUVAR(SELF),cpu	;	\
137	movq	proc,CPUVAR(CURPROC)      ;	\
138	movq	cpu,P_CPU(proc)
139
140#define GET_CURPCB(reg)			movq	CPUVAR(CURPCB),reg
141#define SET_CURPCB(reg)			movq	reg,CPUVAR(CURPCB)
142
143
144/*
145 * Initialization
146 */
147	.data
148
149#if NLAPIC > 0
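/*
 * Reserve a page whose layout mirrors the local APIC register block:
 * the labels below land at the LAPIC_ID/LAPIC_TPRI/LAPIC_PPRI/LAPIC_ISR
 * offsets, so the hardware register page can be mapped over this space
 * and the registers accessed by name.
 */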
150	.align	NBPG, 0xcc
151	.globl	local_apic, lapic_id, lapic_tpr
152local_apic:
153	.space	LAPIC_ID
154lapic_id:
155	.long	0x00000000
156	.space	LAPIC_TPRI-(LAPIC_ID+4)
157lapic_tpr:
158	.space	LAPIC_PPRI-LAPIC_TPRI
159lapic_ppr:
160	.space	LAPIC_ISR-LAPIC_PPRI
161lapic_isr:
162	.space	NBPG-LAPIC_ISR
163#endif
164
165/*****************************************************************************/
166
167/*
168 * Signal trampoline; copied to a page mapped into userspace.
169 * gdb's backtrace logic matches against the instructions in this.
170 */
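/*
 * On success sigreturn(2) does not return here; the int3 following the
 * syscall traps if execution ever falls through.
 */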
171	.section .rodata
172	.globl	sigcode
173sigcode:
174	endbr64
175	call	1f
176	movq	%rsp,%rdi
177	pushq	%rdi			/* fake return address */
178	movq	$SYS_sigreturn,%rax
179	.globl sigcodecall
180sigcodecall:
181	syscall
182	.globl	sigcoderet
183sigcoderet:
184	int3
1851:	CODEPATCH_START
186	JMP_RETPOLINE(rax)
187	CODEPATCH_END(CPTAG_RETPOLINE_RAX)
188	.globl	esigcode
189esigcode:
190	.globl	sigfill
191sigfill:
192	int3
193esigfill:
194	.globl	sigfillsiz
195sigfillsiz:
196	.long	esigfill - sigfill
197
198	.text
199/*
200 * void lgdt(struct region_descriptor *rdp);
201 * Change the global descriptor table.
202 */
203NENTRY(lgdt)
204	RETGUARD_SETUP(lgdt, r11)
205	/* Reload the descriptor table. */
206	movq	%rdi,%rax
207	lgdt	(%rax)
208	/* Flush the prefetch q. */
209	jmp	1f
210	nop
2111:	/* Reload "stale" selectors. */
212	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
213	movl	%eax,%ds
214	movl	%eax,%es
215	movl	%eax,%ss
216	/* Reload code selector by doing intersegment return. */
217	popq	%rax
218	pushq	$GSEL(GCODE_SEL, SEL_KPL)
219	pushq	%rax
220	RETGUARD_CHECK(lgdt, r11)
221	lretq
222END(lgdt)
223
224#if defined(DDB) || NEFI > 0
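/*
 * Kernel setjmp/longjmp.  The jmp_buf is eight quadwords, written in
 * the order %rbx, %rsp, %rbp, %r12, %r13, %r14, %r15, saved return
 * %rip.  setjmp() returns 0; longjmp() re-enters the saved context
 * with a return value of 1.
 */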
225ENTRY(setjmp)
226	RETGUARD_SETUP(setjmp, r11)
227	/*
228	 * Only save registers that must be preserved across function
229	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
230	 * and %rip.
231	 */
232	movq	%rdi,%rax
233	movq	%rbx,(%rax)
234	movq	%rsp,8(%rax)
235	movq	%rbp,16(%rax)
236	movq	%r12,24(%rax)
237	movq	%r13,32(%rax)
238	movq	%r14,40(%rax)
239	movq	%r15,48(%rax)
240	movq	(%rsp),%rdx
241	movq	%rdx,56(%rax)
242	xorl	%eax,%eax
243	RETGUARD_CHECK(setjmp, r11)
244	ret
245	lfence
246END(setjmp)
247
248ENTRY(longjmp)
249	movq	%rdi,%rax
250	movq	8(%rax),%rsp
251	movq	56(%rax),%rdx
252	movq	%rdx,(%rsp)
253	RETGUARD_SETUP(longjmp, r11)
254	movq	(%rax),%rbx
255	movq	16(%rax),%rbp
256	movq	24(%rax),%r12
257	movq	32(%rax),%r13
258	movq	40(%rax),%r14
259	movq	48(%rax),%r15
260	xorl	%eax,%eax
261	incl	%eax
262	RETGUARD_CHECK(longjmp, r11)
263	ret
264	lfence
265END(longjmp)
266#endif /* DDB || NEFI > 0 */
267
268/*****************************************************************************/
269
270/*
271 * int cpu_switchto(struct proc *old, struct proc *new)
272 * Switch from "old" proc to "new".
273 */
274ENTRY(cpu_switchto)
275	pushq	%rbx
276	pushq	%rbp
277	pushq	%r12
278	pushq	%r13
279	pushq	%r14
280	pushq	%r15
281
282	movq	%rdi, %r13
283	movq	%rsi, %r12
284
285	/* Record new proc. */
286	movb	$SONPROC,P_STAT(%r12)	# p->p_stat = SONPROC
287	SET_CURPROC(%r12,%rcx)
288
289	movl	CPUVAR(CPUID),%r9d
290
291	/* for the FPU/"extended CPU state" handling below */
292	movq	xsave_mask(%rip),%rdx
293	movl	%edx,%eax
294	shrq	$32,%rdx
295
296	/* If old proc exited, don't bother. */
297	xorl	%ecx,%ecx
298	testq	%r13,%r13
299	jz	switch_exited
300
301	/*
302	 * Save old context.
303	 *
304	 * Registers:
305	 *   %rax - scratch
306	 *   %r13 - old proc, then old pcb
307	 *   %rcx - old pmap if not P_SYSTEM
308	 *   %r12 - new proc
309	 *   %r9d - cpuid
310	 */
311
312	/* remember the pmap if not P_SYSTEM */
313	testl	$P_SYSTEM,P_FLAG(%r13)
314	movq	P_ADDR(%r13),%r13
315	jnz	0f
316	movq	PCB_PMAP(%r13),%rcx
3170:
318
319	/* Save stack pointers. */
320	movq	%rsp,PCB_RSP(%r13)
321	movq	%rbp,PCB_RBP(%r13)
322
323	/*
324	 * If the old proc ran in userspace then save the
325	 * floating-point/"extended state" registers
326	 */
327	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
328	jz	.Lxstate_reset
329
330	movq	%r13, %rdi
331#if PCB_SAVEFPU != 0
332	addq	$PCB_SAVEFPU,%rdi
333#endif
334	CODEPATCH_START
335	fxsave64	(%rdi)
336	CODEPATCH_END(CPTAG_XSAVE)
337
338switch_exited:
339	/* now clear the xstate */
340	movq	proc0paddr(%rip),%rdi
341#if PCB_SAVEFPU != 0
342	addq	$PCB_SAVEFPU,%rdi
343#endif
344	CODEPATCH_START
345	fxrstor64	(%rdi)
346	CODEPATCH_END(CPTAG_XRSTORS)
347	andl	$~CPUPF_USERXSTATE,CPUVAR(PFLAGS)
348
349.Lxstate_reset:
350	/*
351	 * If the segment registers haven't been reset since the old proc
352	 * ran in userspace then reset them now
353	 */
354	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
355	jz	restore_saved
356	andl	$~CPUPF_USERSEGS,CPUVAR(PFLAGS)
357
358	/* set %ds, %es, %fs, and %gs to expected value to prevent info leak */
359	movw	$(GSEL(GUDATA_SEL, SEL_UPL)),%ax
360	movw	%ax,%ds
361	movw	%ax,%es
362	movw	%ax,%fs
363	cli			/* block interrupts when on user GS.base */
364	swapgs			/* switch from kernel to user GS.base */
365	movw	%ax,%gs		/* set %gs to UDATA and GS.base to 0 */
366	swapgs			/* back to kernel GS.base */
367
368restore_saved:
369	/*
370	 * Restore saved context.
371	 *
372	 * Registers:
373	 *   %rax, %rdx - scratch
374	 *   %rcx - old pmap if not P_SYSTEM
375	 *   %r12 - new process
376	 *   %r13 - new pcb
377	 *   %rbx - new pmap if not P_SYSTEM
378	 */
379
380	movq	P_ADDR(%r12),%r13
381
382	/* remember the pmap if not P_SYSTEM */
383	xorl	%ebx,%ebx
384	testl	$P_SYSTEM,P_FLAG(%r12)
385	jnz	1f
386	movq	PCB_PMAP(%r13),%rbx
3871:
388
389	/* No interrupts while loading new state. */
390	cli
391
392	/* Restore stack pointers. */
393	movq	PCB_RSP(%r13),%rsp
394	movq	PCB_RBP(%r13),%rbp
395
396	/* Stack pivot done, setup RETGUARD */
397	RETGUARD_SETUP_OFF(cpu_switchto, r11, 6*8)
398
399	/* don't switch cr3 to the same thing it already was */
400	movq	PCB_CR3(%r13),%rax
401	movq	%cr3,%rdi
402	xorq	%rax,%rdi
403	btrq	$63,%rdi	/* ignore CR3_REUSE_PCID */
404	testq	%rdi,%rdi
405	jz	.Lsame_cr3
406
407#ifdef DIAGNOSTIC
408	/* verify ci_proc_pmap had been updated properly */
409	cmpq	%rcx,CPUVAR(PROC_PMAP)
410	jnz	.Lbogus_proc_pmap
411#endif
412	/* record which pmap this CPU should get IPIs for */
413	movq	%rbx,CPUVAR(PROC_PMAP)
414
415.Lset_cr3:
416	movq	%rax,%cr3			/* %rax used below too */
417
418.Lsame_cr3:
419	/*
420	 * If we switched from a userland thread with a shallow call stack
421	 * (e.g. interrupt->ast->mi_ast->preempt->mi_switch->cpu_switchto)
422	 * then the RSB may have attacker controlled entries when we switch
423	 * to a deeper call stack in the new thread.  Refill the RSB with
424	 * entries safe to speculate into/through.
425	 */
426	RET_STACK_REFILL_WITH_RCX
427
428	/* Don't bother with the rest if switching to a system process. */
429	testq	%rbx,%rbx
430	jz	switch_restored
431
432	/* record the bits needed for future U-->K transition */
433	movq	PCB_KSTACK(%r13),%rdx
434	subq	$FRAMESIZE,%rdx
435	movq	%rdx,CPUVAR(KERN_RSP)
436
437	CODEPATCH_START
438	/*
439	 * Meltdown: iff we're doing separate U+K and U-K page tables,
440	 * then record them in cpu_info for easy access in syscall and
441	 * interrupt trampolines.
442	 */
443	movq	PM_PDIRPA_INTEL(%rbx),%rdx
444	orq	cr3_reuse_pcid,%rax
445	orq	cr3_pcid_proc_intel,%rdx
446	movq	%rax,CPUVAR(KERN_CR3)
447	movq	%rdx,CPUVAR(USER_CR3)
448	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
449
450switch_restored:
451	SET_CURPCB(%r13)
452
453	/* Interrupts are okay again. */
454	sti
455	popq	%r15
456	popq	%r14
457	popq	%r13
458	popq	%r12
459	popq	%rbp
460	popq	%rbx
461	RETGUARD_CHECK(cpu_switchto, r11)
462	ret
463	lfence
464
465#ifdef DIAGNOSTIC
466.Lbogus_proc_pmap:
467	leaq	bogus_proc_pmap,%rdi
468	call	panic
469	int3	/* NOTREACHED */
470	.pushsection .rodata
471bogus_proc_pmap:
472	.asciz	"curcpu->ci_proc_pmap didn't point to previous pmap"
473	.popsection
474#endif /* DIAGNOSTIC */
475END(cpu_switchto)
476
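/*
 * Indirect-branch thunks.  These hold a retpoline sequence for the
 * named register; on CPUs where retpolines are not needed the codepatch
 * machinery can replace the body with a direct "jmp *%rax"/"jmp *%r11"
 * (see the CODEPATCH_CODE_LEN(_jmprax/_jmpr11) stubs near the end of
 * this file).
 */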
477NENTRY(retpoline_rax)
478	CODEPATCH_START
479	JMP_RETPOLINE(rax)
480	CODEPATCH_END(CPTAG_RETPOLINE_RAX)
481END(retpoline_rax)
482
483NENTRY(__x86_indirect_thunk_r11)
484	CODEPATCH_START
485	JMP_RETPOLINE(r11)
486	CODEPATCH_END(CPTAG_RETPOLINE_R11)
487END(__x86_indirect_thunk_r11)
488
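/*
 * Idle with hlt.  The sti takes effect only after the following
 * instruction, so an interrupt cannot be accepted between enabling
 * interrupts and executing the hlt; a pending wakeup instead breaks
 * us out of the hlt itself.
 */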
489ENTRY(cpu_idle_cycle_hlt)
490	RETGUARD_SETUP(cpu_idle_cycle_hlt, r11)
491	sti
492	hlt
493	RETGUARD_CHECK(cpu_idle_cycle_hlt, r11)
494	ret
495	lfence
496END(cpu_idle_cycle_hlt)
497
498/*
499 * savectx(struct pcb *pcb);
500 * Update pcb, saving current processor state.
501 */
502ENTRY(savectx)
503	RETGUARD_SETUP(savectx, r11)
504	/* Save stack pointers. */
505	movq	%rsp,PCB_RSP(%rdi)
506	movq	%rbp,PCB_RBP(%rdi)
507	RETGUARD_CHECK(savectx, r11)
508	ret
509	lfence
510END(savectx)
511
512/*
513 * syscall insn entry.
514 * Enter here with interrupts blocked; %rcx contains the caller's
515 * %rip and the original rflags has been copied to %r11.  %cs and
516 * %ss have been updated to the kernel segments, but %rsp is still
517 * the user-space value.
518 * First order of business is to swap to the kernel GS.base so that
519 * we can access our struct cpu_info.  After possibly mucking with
520 * pagetables, we switch to our kernel stack.  Once that's in place
521 * we can save the rest of the syscall frame and unblock interrupts.
522 */
523KUTEXT_PAGE_START
524 	.align	NBPG, 0xcc
525XUsyscall_meltdown:
526	/*
527	 * This is the real Xsyscall_meltdown page, which is mapped into
528	 * the U-K page tables at the same location as Xsyscall_meltdown
529	 * below.  For this, the Meltdown case, we use the scratch space
530	 * in cpu_info so we can switch to the kernel page tables
531	 * (thank you, Intel), at which point we'll continue at the
532	 * "SYSCALL_ENTRY" after Xsyscall below.
533	 * In case the CPU speculates past the mov to cr3, we put a
534	 * retpoline-style pause-lfence-jmp-to-pause loop.
535	 */
536	endbr64
537	swapgs
538	movq	%rax,CPUVAR(SCRATCH)
539	movq	CPUVAR(KERN_CR3),%rax
540	movq	%rax,%cr3
5410:	pause
542	lfence
543	jmp	0b
544KUTEXT_PAGE_END
545
546KTEXT_PAGE_START
547	.align	NBPG, 0xcc
548GENTRY(Xsyscall_meltdown)
549	/* pad to match real Xsyscall_meltdown positioning above */
550	movq	CPUVAR(KERN_CR3),%rax
551	movq	%rax,%cr3
552GENTRY(Xsyscall)
553	endbr64
554	swapgs
555	movq	%rax,CPUVAR(SCRATCH)
556	SYSCALL_ENTRY			/* create trapframe */
557	sti
558
559	movq	CPUVAR(CURPROC),%r14
560	movq	%rsp,P_MD_REGS(%r14)	# save pointer to frame
561	andl	$~MDP_IRET,P_MD_FLAGS(%r14)
562	movq	%rsp,%rdi
563	call	syscall
564
565.Lsyscall_check_asts:
566	/* Check for ASTs on exit to user mode. */
567	cli
568	CHECK_ASTPENDING(%r11)
569	je	2f
570	CLEAR_ASTPENDING(%r11)
571	sti
572	movq	%rsp,%rdi
573	call	ast
574	jmp	.Lsyscall_check_asts
575
5762:
577#ifdef DIAGNOSTIC
578	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
579	jne	.Lsyscall_spl_not_lowered
580#endif /* DIAGNOSTIC */
581
582	/* Have registers been changed in a way that requires an iretq? */
583	testl	$MDP_IRET, P_MD_FLAGS(%r14)
584	jne	intr_user_exit_post_ast
585
586	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
587	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
588	jz	.Lsyscall_restore_xstate
589
590	/* Restore FS.base if it's not already in the CPU */
591	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
592	jz	.Lsyscall_restore_fsbase
593
594.Lsyscall_restore_registers:
595	/*
596	 * If the pmap we're now on isn't the same as the one we
597	 * were on last time we were in userspace, then use IBPB
598	 * to prevent cross-process branch-target injection.
599	 */
600	CODEPATCH_START
601	movq	CPUVAR(PROC_PMAP),%rbx
602	cmpq	CPUVAR(USER_PMAP),%rbx
603	je	1f
604	xorl	%edx,%edx
605	movl	$PRED_CMD_IBPB,%eax
606	movl	$MSR_PRED_CMD,%ecx
607	wrmsr
608	movq	%rbx,CPUVAR(USER_PMAP)
6091:
610	CODEPATCH_END(CPTAG_IBPB_NOP)
611	call	pku_xonly
612	RET_STACK_REFILL_WITH_RCX
613
614	movq	TF_R8(%rsp),%r8
615	movq	TF_R9(%rsp),%r9
616	movq	TF_R10(%rsp),%r10
617	movq	TF_R12(%rsp),%r12
618	movq	TF_R13(%rsp),%r13
619	movq	TF_R14(%rsp),%r14
620	movq	TF_R15(%rsp),%r15
621	movq	TF_RBX(%rsp),%rbx
622	movq	TF_RDX(%rsp),%rdx
623
624	CODEPATCH_START
625	xorl	%edi,%edi
626	xorl	%esi,%esi
627	xorl	%r11d,%r11d
628	xorl	%eax,%eax
629	xorl	%ecx,%ecx
630	movw	%ds,TF_R8(%rsp)
631	verw	TF_R8(%rsp)
632	CODEPATCH_END(CPTAG_MDS)
633
634	movq	TF_RDI(%rsp),%rdi
635	movq	TF_RSI(%rsp),%rsi
636	movq	TF_RBP(%rsp),%rbp
637
638	/*
639	 * We need to finish reading from the trapframe, then switch
640	 * to the user page tables, swapgs, and return.  We need
641	 * to get the final value for the register that was used
642	 * for the mov to %cr3 from somewhere accessible on the
643	 * user page tables, so save it in CPUVAR(SCRATCH) across
644	 * the switch.
645	 */
646	movq	TF_RAX(%rsp),%rax
647	movq	TF_RIP(%rsp),%rcx
648	movq	TF_RFLAGS(%rsp),%r11
649	movq	TF_RSP(%rsp),%rsp
650	CODEPATCH_START
651	movq	%rax,CPUVAR(SCRATCH)
652	movq	CPUVAR(USER_CR3),%rax
653	PCID_SET_REUSE_NOP
654	movq	%rax,%cr3
655Xsyscall_trampback:
6560:	pause
657	lfence
658	jmp	0b
659	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
660	swapgs
661	sysretq
662END(Xsyscall)
663END(Xsyscall_meltdown)
664KTEXT_PAGE_END
665
666KUTEXT_PAGE_START
667	.space	(Xsyscall_trampback - Xsyscall_meltdown) - \
668		(. - XUsyscall_meltdown), 0xcc
670	movq	CPUVAR(SCRATCH),%rax
671	swapgs
672	sysretq
673KUTEXT_PAGE_END
674
675	.text
676	_ALIGN_TRAPS
677	/* in this case, need FS.base but not xstate, rarely happens */
678.Lsyscall_restore_fsbase:	/* CPU doesn't have curproc's FS.base */
679	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
680	movq	CPUVAR(CURPCB),%rdi
681	jmp	.Lsyscall_restore_fsbase_real
682
683	_ALIGN_TRAPS
684.Lsyscall_restore_xstate:	/* CPU doesn't have curproc's xstate */
685	orl	$(CPUPF_USERXSTATE|CPUPF_USERSEGS),CPUVAR(PFLAGS)
686	movq	CPUVAR(CURPCB),%rdi
687	movq	xsave_mask(%rip),%rdx
688	movl	%edx,%eax
689	shrq	$32,%rdx
690#if PCB_SAVEFPU != 0
691	addq	$PCB_SAVEFPU,%rdi
692#endif
693	/* untouched state so can't fault */
694	CODEPATCH_START
695	fxrstor64	(%rdi)
696	CODEPATCH_END(CPTAG_XRSTORS)
697#if PCB_SAVEFPU != 0
698	subq	$PCB_SAVEFPU,%rdi
699#endif
700.Lsyscall_restore_fsbase_real:
701	movq	PCB_FSBASE(%rdi),%rdx
702	movl	%edx,%eax
703	shrq	$32,%rdx
704	movl	$MSR_FSBASE,%ecx
705	wrmsr
706	jmp	.Lsyscall_restore_registers
707
708#ifdef DIAGNOSTIC
709.Lsyscall_spl_not_lowered:
710	leaq	spl_lowered(%rip), %rdi
711	movl	TF_ERR(%rsp),%esi	/* syscall # stashed above */
712	movl	TF_RDI(%rsp),%edx
713	movl	%ebx,%ecx
714	movl	CPUVAR(ILEVEL),%r8d
715	xorq	%rax,%rax
716	call	printf
717#ifdef DDB
718	int	$3
719#endif /* DDB */
720	movl	$IPL_NONE,CPUVAR(ILEVEL)
721	jmp	.Lsyscall_check_asts
722
723	.section .rodata
724spl_lowered:
725	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
726	.text
727#endif
728
729NENTRY(proc_trampoline)
730	call	proc_trampoline_mi
731	movq	%r13,%rdi
732	movq	%r12,%rax
733	call	retpoline_rax
734	movq	CPUVAR(CURPROC),%r14
735	jmp	.Lsyscall_check_asts
736END(proc_trampoline)
737
738
739/*
740 * Returning to userspace via iretq.  We do things in this order:
741 *  - check for ASTs
742 *  - restore FPU/"extended CPU state" if it's not already in the CPU
743 *  - DIAGNOSTIC: no more C calls after this, so check the SPL
744 *  - restore FS.base if it's not already in the CPU
745 *  - restore most registers
746 *  - update the iret frame from the trapframe
747 *  - finish reading from the trapframe
748 *  - switch to the trampoline stack	\
749 *  - jump to the .kutext segment	|-- Meltdown workaround
750 *  - switch to the user page tables	/
751 *  - swapgs
752 *  - iretq
753 */
754KTEXT_PAGE_START
755	_ALIGN_TRAPS
756GENTRY(intr_user_exit)
757#ifdef DIAGNOSTIC
758	pushfq
759	popq	%rdx
760	testq	$PSL_I,%rdx
761	jnz	.Lintr_user_exit_not_blocked
762#endif /* DIAGNOSTIC */
763
764	/* Check for ASTs */
765	CHECK_ASTPENDING(%r11)
766	je	intr_user_exit_post_ast
767	CLEAR_ASTPENDING(%r11)
768	sti
769	movq	%rsp,%rdi
770	call	ast
771	cli
772	jmp	intr_user_exit
773
774intr_user_exit_post_ast:
775	/* Restore FPU/"extended CPU state" if it's not already in the CPU */
776	testl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
777	jz	.Lintr_restore_xstate
778
779	/* Restore FS.base if it's not already in the CPU */
780	testl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
781	jz	.Lintr_restore_fsbase
782
783.Lintr_restore_registers:
784#ifdef DIAGNOSTIC
785	/* no more C calls after this, so check the SPL */
786	cmpl	$0,CPUVAR(ILEVEL)
787	jne	.Luser_spl_not_lowered
788#endif /* DIAGNOSTIC */
789
790	/*
791	 * If the pmap we're now on isn't the same as the one we
792	 * were on last time we were in userspace, then use IBPB
793	 * to prevent cross-process branch-target injection.
794	 */
795	CODEPATCH_START
796	movq	CPUVAR(PROC_PMAP),%rbx
797	cmpq	CPUVAR(USER_PMAP),%rbx
798	je	1f
799	xorl	%edx,%edx
800	movl	$PRED_CMD_IBPB,%eax
801	movl	$MSR_PRED_CMD,%ecx
802	wrmsr
803	movq	%rbx,CPUVAR(USER_PMAP)
8041:
805	CODEPATCH_END(CPTAG_IBPB_NOP)
806	call	pku_xonly
807	RET_STACK_REFILL_WITH_RCX
808
809	movq	TF_R8(%rsp),%r8
810	movq	TF_R9(%rsp),%r9
811	movq	TF_R10(%rsp),%r10
812	movq	TF_R12(%rsp),%r12
813	movq	TF_R13(%rsp),%r13
814	movq	TF_R14(%rsp),%r14
815	movq	TF_R15(%rsp),%r15
816	movq	TF_RBX(%rsp),%rbx
817
818	CODEPATCH_START
819	xorl	%edi,%edi
820	xorl	%esi,%esi
821	xorl	%r11d,%r11d
822	xorl	%eax,%eax
823	xorl	%edx,%edx
824	xorl	%ecx,%ecx
825	movw	%ds,TF_R8(%rsp)
826	verw	TF_R8(%rsp)
827	CODEPATCH_END(CPTAG_MDS)
828
829	movq	TF_RDI(%rsp),%rdi
830	movq	TF_RSI(%rsp),%rsi
831	movq	TF_RBP(%rsp),%rbp
832
833	/*
834	 * The final value for the register that was used for the mov
835	 * to %cr3 has to come from somewhere accessible via the user
836	 * page tables, so we save it in CPUVAR(SCRATCH) across the
837	 * switch.
838	 */
839	/* update iret frame */
840	movq	CPUVAR(INTR_RSP),%rdx
841	movq	$(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx)
842	movq	TF_RIP(%rsp),%rax
843	movq	%rax,IRETQ_RIP(%rdx)
844	movq	TF_RFLAGS(%rsp),%rax
845	movq	%rax,IRETQ_RFLAGS(%rdx)
846	movq	TF_RSP(%rsp),%rax
847	movq	%rax,IRETQ_RSP(%rdx)
848	movq	$(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx)
849	/* finish with the trap frame */
850	movq	TF_RAX(%rsp),%rax
851	movq	TF_RCX(%rsp),%rcx
852	movq	TF_R11(%rsp),%r11
853	/* switch to the trampoline stack */
854	xchgq	%rdx,%rsp
855	movq	TF_RDX(%rdx),%rdx
856	CODEPATCH_START
857	movq	%rax,CPUVAR(SCRATCH)
858	movq	CPUVAR(USER_CR3),%rax
859	PCID_SET_REUSE_NOP
860	movq	%rax,%cr3
861Xiretq_trampback:
862KTEXT_PAGE_END
863/* the movq to %cr3 switches to this "KUTEXT" page */
864KUTEXT_PAGE_START
865	.space	(Xiretq_trampback - Xsyscall_meltdown) - \
866		(. - XUsyscall_meltdown), 0xcc
867	movq	CPUVAR(SCRATCH),%rax
868.Liretq_swapgs:
869	swapgs
870doreti_iret_meltdown:
871	iretq
872KUTEXT_PAGE_END
873/*
874 * Back to the "KTEXT" page to fill in the speculation trap and the
875 * swapgs+iretq used for non-Meltdown kernels.  This switching back
876 * and forth between segments is so that we can do the .space
877 * calculation below to guarantee the iretq's above and below line
878 * up, so the 'doreti_iret' label lines up with the iretq whether
879 * the CPU is affected by Meltdown or not.
880 */
881KTEXT_PAGE_START
8820:	pause
883	lfence
884	jmp	0b
885	.space	(.Liretq_swapgs - XUsyscall_meltdown) - \
886		(. - Xsyscall_meltdown), 0xcc
887	CODEPATCH_END(CPTAG_MELTDOWN_NOP)
888	swapgs
889
890	.globl	doreti_iret
891doreti_iret:
892	iretq
893KTEXT_PAGE_END
894
895	.text
896	_ALIGN_TRAPS
897.Lintr_restore_xstate:		/* CPU doesn't have curproc's xstate */
898	orl	$CPUPF_USERXSTATE,CPUVAR(PFLAGS)
899	movq	CPUVAR(CURPCB),%rdi
900#if PCB_SAVEFPU != 0
901	addq	$PCB_SAVEFPU,%rdi
902#endif
903	movq	xsave_mask(%rip),%rdx
904	movl	%edx,%eax
905	shrq	$32, %rdx
906	CODEPATCH_START
907	fxrstor64	(%rdi)
908	CODEPATCH_END(CPTAG_XRSTORS)
909	//testl	%eax,%eax
910	//jnz	.Lintr_xrstor_faulted
911.Lintr_restore_fsbase:		/* CPU doesn't have curproc's FS.base */
912	orl	$CPUPF_USERSEGS,CPUVAR(PFLAGS)
913	movq	CPUVAR(CURPCB),%rdx
914	movq	PCB_FSBASE(%rdx),%rdx
915	movl	%edx,%eax
916	shrq	$32,%rdx
917	movl	$MSR_FSBASE,%ecx
918	wrmsr
919	jmp	.Lintr_restore_registers
920
921.Lintr_xrstor_faulted:
922	/*
923	 * xrstor faulted; we need to reset the FPU state and call trap()
924	 * to post a signal, which requires interrupts be enabled.
925	 */
926	sti
927	movq	proc0paddr(%rip),%rdi
928#if PCB_SAVEFPU != 0
929	addq	$PCB_SAVEFPU,%rdi
930#endif
931	CODEPATCH_START
932	fxrstor64	(%rdi)
933	CODEPATCH_END(CPTAG_XRSTORS)
934	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
935	jmp	recall_trap
936
937#ifdef DIAGNOSTIC
938.Lintr_user_exit_not_blocked:
939	movl	warn_once(%rip),%edi
940	testl	%edi,%edi
941	jnz	1f
942	incl	%edi
943	movl	%edi,warn_once(%rip)
944	leaq	.Lnot_blocked(%rip),%rdi
945	call	printf
946#ifdef DDB
947	int	$3
948#endif /* DDB */
9491:	cli
950	jmp	intr_user_exit
951
952.Luser_spl_not_lowered:
953	sti
954	leaq	intr_spl_lowered(%rip),%rdi
955	movl	CPUVAR(ILEVEL),%esi
956	xorl	%edx,%edx		/* always SPL zero for userspace */
957	xorl	%eax,%eax
958	call	printf
959#ifdef DDB
960	int	$3
961#endif /* DDB */
962	movl	$0,CPUVAR(ILEVEL)
963	cli
964	jmp	intr_user_exit
965
966	.section .rodata
967intr_spl_lowered:
968	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n"
969	.text
970#endif /* DIAGNOSTIC */
971END(intr_user_exit)
972
973
974/*
975 * Return to supervisor mode from trap or interrupt
976 */
977NENTRY(intr_fast_exit)
978#ifdef DIAGNOSTIC
979	pushfq
980	popq	%rdx
981	testq	$PSL_I,%rdx
982	jnz	.Lintr_exit_not_blocked
983#endif /* DIAGNOSTIC */
984	movq	TF_RDI(%rsp),%rdi
985	movq	TF_RSI(%rsp),%rsi
986	movq	TF_R8(%rsp),%r8
987	movq	TF_R9(%rsp),%r9
988	movq	TF_R10(%rsp),%r10
989	movq	TF_R12(%rsp),%r12
990	movq	TF_R13(%rsp),%r13
991	movq	TF_R14(%rsp),%r14
992	movq	TF_R15(%rsp),%r15
993	movq	TF_RBP(%rsp),%rbp
994	movq	TF_RBX(%rsp),%rbx
995	movq	TF_RDX(%rsp),%rdx
996	movq	TF_RCX(%rsp),%rcx
997	movq	TF_R11(%rsp),%r11
998	movq	TF_RAX(%rsp),%rax
999	addq	$TF_RIP,%rsp
1000	iretq
1001
1002#ifdef DIAGNOSTIC
1003.Lintr_exit_not_blocked:
1004	movl	warn_once(%rip),%edi
1005	testl	%edi,%edi
1006	jnz	1f
1007	incl	%edi
1008	movl	%edi,warn_once(%rip)
1009	leaq	.Lnot_blocked(%rip),%rdi
1010	call	printf
1011#ifdef DDB
1012	int	$3
1013#endif /* DDB */
10141:	cli
1015	jmp	intr_fast_exit
1016
1017	.data
1018.global warn_once
1019warn_once:
1020	.long	0
1021	.section .rodata
1022.Lnot_blocked:
1023	.asciz	"WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n"
1024	.text
1025#endif
1026END(intr_fast_exit)
1027
1028/*
1029 * FPU/"extended CPU state" handling
1030 *	void xrstor_kern(sfp, mask)
1031 *		using the first of xrstors/xrstor/fxrstor, load the given
1032 *		state, which is assumed to be trusted: i.e., unaltered since
1033 *		the kernel saved it with xsaves/xsaveopt/xsave/fxsave
1034 * 	int xrstor_user(sfp, mask)
1035 *		using the first of xrstor/fxrstor, load the given state, which
1036 *		might not be trustworthy: #GP faults will be caught; returns
1037 *		0/1 if okay/it trapped.
1038 *	void fpusave(sfp)
1039 *		save current state, but retain it in the FPU
1040 *	void fpusavereset(sfp)
1041 *		save current state and reset FPU to initial/kernel state
1042 *	int xsetbv_user(reg, mask)
1043 *		load specified %xcr# register, returns 0/1 if okay/it trapped
1044 */
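/*
 * All of these are assembled with the fxsave64/fxrstor64 form wrapped
 * in CODEPATCH markers; at boot the codepatch machinery may rewrite the
 * instruction to the xsave/xsaveopt/xsaves or xrstor/xrstors variant
 * (see the CODEPATCH_CODE(_xrstor/_xsaves/...) stubs below), matching
 * what the CPU supports.
 */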
1045
1046ENTRY(xrstor_kern)
1047	RETGUARD_SETUP(xrstor_kern, r11)
1048	movq	%rsi, %rdx
1049	movl	%esi, %eax
1050	shrq	$32, %rdx
1051	CODEPATCH_START
1052	fxrstor64	(%rdi)
1053	CODEPATCH_END(CPTAG_XRSTORS)
1054	RETGUARD_CHECK(xrstor_kern, r11)
1055	ret
1056	lfence
1057END(xrstor_kern)
1058
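/*
 * The *_fault/*_resume label pairs below mark instructions that may
 * fault on untrusted input: if the instruction at the *_fault label
 * traps, the fault handler resumes execution at the matching *_resume
 * label, which returns 1 instead of the 0 returned on success.
 */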
1059ENTRY(xrstor_user)
1060	RETGUARD_SETUP(xrstor_user, r11)
1061	movq	%rsi, %rdx
1062	movl	%esi, %eax
1063	shrq	$32, %rdx
1064	.globl	xrstor_fault
1065xrstor_fault:
1066	CODEPATCH_START
1067	fxrstor64	(%rdi)
1068	CODEPATCH_END(CPTAG_XRSTOR)
1069	xorl	%eax, %eax
1070	RETGUARD_CHECK(xrstor_user, r11)
1071	ret
1072	lfence
1073NENTRY(xrstor_resume)
1074	movl	$1, %eax
1075	RETGUARD_CHECK(xrstor_user, r11)
1076	ret
1077	lfence
1078END(xrstor_user)
1079
1080ENTRY(fpusave)
1081	RETGUARD_SETUP(fpusave, r11)
1082	movq	xsave_mask(%rip),%rdx
1083	movl	%edx,%eax
1084	shrq	$32,%rdx
1085	CODEPATCH_START
1086	fxsave64	(%rdi)
1087	CODEPATCH_END(CPTAG_XSAVE)
1088	RETGUARD_CHECK(fpusave, r11)
1089	ret
1090	lfence
1091END(fpusave)
1092
1093ENTRY(fpusavereset)
1094	RETGUARD_SETUP(fpusavereset, r11)
1095	movq	xsave_mask(%rip),%rdx
1096	movl	%edx,%eax
1097	shrq	$32,%rdx
1098	CODEPATCH_START
1099	fxsave64	(%rdi)
1100	CODEPATCH_END(CPTAG_XSAVE)
1101	movq	proc0paddr(%rip),%rdi
1102#if PCB_SAVEFPU != 0
1103	addq	$PCB_SAVEFPU,%rdi
1104#endif
1105	CODEPATCH_START
1106	fxrstor64	(%rdi)
1107	CODEPATCH_END(CPTAG_XRSTORS)
1108	RETGUARD_CHECK(fpusavereset, r11)
1109	ret
1110	lfence
1111END(fpusavereset)
1112
1113ENTRY(xsetbv_user)
1114	RETGUARD_SETUP(xsetbv_user, r11)
1115	movl	%edi, %ecx
1116	movq	%rsi, %rdx
1117	movl	%esi, %eax
1118	shrq	$32, %rdx
1119	.globl	xsetbv_fault
1120xsetbv_fault:
1121	xsetbv
1122	xorl	%eax, %eax
1123	RETGUARD_CHECK(xsetbv_user, r11)
1124	ret
1125	lfence
1126NENTRY(xsetbv_resume)
1127	movl	$1, %eax
1128	RETGUARD_CHECK(xsetbv_user, r11)
1129	ret
1130	lfence
1131END(xsetbv_user)
1132
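/*
 * Instruction sequences used as codepatch replacements: the FPU
 * save/restore variants for the CODEPATCH tags above, the PCID-reuse
 * patch for USER_CR3, and the direct jumps that can replace the
 * retpoline thunks.
 */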
1133CODEPATCH_CODE(_xrstor,		xrstor64 (%rdi))
1134CODEPATCH_CODE(_xrstors,	xrstors64 (%rdi))
1135CODEPATCH_CODE(_xsave,		xsave64 (%rdi))
1136CODEPATCH_CODE(_xsaves,		xsaves64 (%rdi))
1137CODEPATCH_CODE(_xsaveopt,	xsaveopt64 (%rdi))
1138CODEPATCH_CODE(_pcid_set_reuse,
1139		orl	$(CR3_REUSE_PCID >> 32),CPUVAR(USER_CR3 + 4))
1140CODEPATCH_CODE_LEN(_jmprax,	jmp *%rax; int3)
1141CODEPATCH_CODE_LEN(_jmpr11,	jmp *%r11; int3)
1142CODEPATCH_CODE_LEN(_jmpr13,	jmp *%r13; int3)
1143
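/*
 * pagezero(va): zero one page with non-temporal stores, 32 bytes per
 * loop iteration, finishing with an sfence to order the movnti stores.
 */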
1144ENTRY(pagezero)
1145	RETGUARD_SETUP(pagezero, r11)
1146	movq    $-PAGE_SIZE,%rdx
1147	subq    %rdx,%rdi
1148	xorq    %rax,%rax
11491:
1150	movnti  %rax,(%rdi,%rdx)
1151	movnti  %rax,8(%rdi,%rdx)
1152	movnti  %rax,16(%rdi,%rdx)
1153	movnti  %rax,24(%rdi,%rdx)
1154	addq    $32,%rdx
1155	jne     1b
1156	sfence
1157	RETGUARD_CHECK(pagezero, r11)
1158	ret
1159	lfence
1160END(pagezero)
1161
1162/* void pku_xonly(void) */
1163ENTRY(pku_xonly)
1164	movq	pg_xo,%rax	/* have PKU support? */
1165	cmpq	$0,%rax
1166	je	1f
1167	movl	$0,%ecx		/* force PKRU for xonly restriction */
1168	movl	$0,%edx
1169	movl	$PGK_VALUE,%eax	/* key0 normal, key1 is exec without read */
1170	wrpkru
11711:	ret
1172	lfence
1173END(pku_xonly)
1174
1175/* int rdmsr_safe(u_int msr, uint64_t *data) */
1176ENTRY(rdmsr_safe)
1177	RETGUARD_SETUP(rdmsr_safe, r10)
1178
1179	movl	%edi,	%ecx	/* u_int msr */
1180	.globl	rdmsr_safe_fault
1181rdmsr_safe_fault:
1182	rdmsr
1183	salq	$32, %rdx
1184	movl	%eax, %eax
1185	orq	%rdx, %rax
1186	movq	%rax, (%rsi)	/* *data */
1187	xorq	%rax, %rax
1188
1189	RETGUARD_CHECK(rdmsr_safe, r10)
1190	ret
1191	lfence
1192
1193NENTRY(rdmsr_resume)
1194	movl	$0x1, %eax
1195	RETGUARD_CHECK(rdmsr_safe, r10)
1196	ret
1197	lfence
1198END(rdmsr_safe)
1199
1200#if NHYPERV > 0
1201/* uint64_t hv_hypercall_trampoline(uint64_t control, paddr_t input, paddr_t output) */
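/*
 * Shuffle the SysV arguments (%rdi, %rsi, %rdx) into the registers the
 * hypercall page expects (%rcx = control, %rdx = input PA, %r8 =
 * output PA) and tail-call into it.
 */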
1202NENTRY(hv_hypercall_trampoline)
1203	endbr64
1204	mov	%rdx, %r8
1205	mov	%rsi, %rdx
1206	mov	%rdi, %rcx
1207	jmp	hv_hypercall_page
1208END(hv_hypercall_trampoline)
1209	/* Hypercall page needs to be page aligned */
1210	.text
1211	.align	NBPG, 0xcc
1212	.globl	hv_hypercall_page
1213hv_hypercall_page:
1214	.skip	0x1000, 0xcc
1215#endif /* NHYPERV > 0 */
1216
1217#if NXEN > 0
1218	/* Hypercall page needs to be page aligned */
1219	.text
1220	.align	NBPG, 0xcc
1221	.globl	xen_hypercall_page
1222xen_hypercall_page:
1223	.skip	0x1000, 0xcc
1224#endif /* NXEN > 0 */
1225