/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007-2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_atpic.h"
#include "opt_hwpmc_hooks.h"

#include "assym.inc"

#include <machine/psl.h>
#include <machine/asmacros.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <machine/pmap.h>

#ifdef KDTRACE_HOOKS
	.bss
	.globl	dtrace_invop_jump_addr
	.align	8
	.type	dtrace_invop_jump_addr,@object
	.size	dtrace_invop_jump_addr,8
dtrace_invop_jump_addr:
	.zero	8
	.globl	dtrace_invop_calltrap_addr
	.align	8
	.type	dtrace_invop_calltrap_addr,@object
	.size	dtrace_invop_calltrap_addr,8
dtrace_invop_calltrap_addr:
	.zero	8
#endif
	.text
#ifdef HWPMC_HOOKS
	ENTRY(start_exceptions)
#endif

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * our use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we automatically
 * then enable interrupts if the interrupted context had them enabled.
 * This is equivalent to the i386 port's use of SDT_SYS386TGT.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See amd64/include/frame.h.
 * The current RFLAGS (the status register, which includes the interrupt
 * disable state prior to the trap), the code segment register, and the
 * return instruction pointer are pushed by the cpu.  The cpu will also
 * push an 'error' code for certain traps.  We push a dummy error code
 * for those traps where the cpu doesn't, in order to maintain a
 * consistent frame.  We also push a contrived 'trap number'.
 *
 * The CPU does not push the general registers, so we must do that, and we
 * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
 * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base for
 * kernel mode operation shortly, without changing the selector loaded.
 * Since kernel-mode long mode operation works with any selectors loaded
 * into segment registers other than %cs, which makes them mostly unused
 * in long mode, and the kernel does not reference %fs, leave them alone.
 * The segment registers are reloaded on return to usermode.
 */
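
/*
 * For reference, the frame pushed by the hardware for an interrupt
 * gate, from higher to lower addresses: %ss, %rsp, %rflags, %cs,
 * %rip, and (for some vectors only) an error code.  The macros
 * below extend this to a full struct trapframe.
 */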

/* Traps that we leave interrupts disabled for. */
	.macro	TRAP_NOEN	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_noen_k
	swapgs
	lfence
	jmp	alltraps_noen_u
	.endm

	TRAP_NOEN	bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
#endif

/* Regular traps; the cpu does not supply tf_err for these. */
	.macro	TRAP	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm

	TRAP	div, T_DIVIDE
	TRAP	ofl, T_OFLOW
	TRAP	bnd, T_BOUND
	TRAP	ill, T_PRIVINFLT
	TRAP	dna, T_DNA
	TRAP	fpusegm, T_FPOPFLT
	TRAP	rsvd, T_RESERVED
	TRAP	fpu, T_ARITHTRAP
	TRAP	xmm, T_XMMFLT

/* This group of traps has tf_err already pushed by the cpu. */
	.macro	TRAP_ERR	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u,has_err=1
\l\()_pti_k:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_u
	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm

	TRAP_ERR	tss, T_TSSFLT
	TRAP_ERR	align, T_ALIGNFLT

	/*
	 * alltraps_u/k entry points.
	 * SWAPGS must already have been performed by the prologue
	 * if this is the first entry into the kernel from userland.
	 * Reenable interrupts if they were enabled before the trap.
	 * This approximates SDT_SYS386TGT on the i386 port.
	 */
	SUPERALIGN_TEXT
	.globl	alltraps_u
	.type	alltraps_u,@function
alltraps_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	call	handle_ibrs_entry
	jmp	alltraps_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_k
	.type	alltraps_k,@function
alltraps_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
alltraps_save_segs:
	SAVE_SEGS
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
alltraps_pushregs_no_rax:
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
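	/* Clear DF and AC in %rflags; the kernel assumes both are off. */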
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
#ifdef KDTRACE_HOOKS
	/*
	 * DTrace Function Boundary Trace (fbt) probes are triggered
	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
	 * interrupt. For all other trap types, just handle them in
	 * the usual way.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jnz	calltrap		/* ignore userland traps */
	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
	jne	calltrap

	/* Check if there is no DTrace hook registered. */
	cmpq	$0,dtrace_invop_jump_addr
	je	calltrap

	/*
	 * Set our jump address for the jump back in the event that
	 * the breakpoint wasn't caused by DTrace at all.
	 */
	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)

	/* Jump to the code hooked in by DTrace. */
	jmpq	*dtrace_invop_jump_addr
#endif
	.globl	calltrap
	.type	calltrap,@function
calltrap:
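	/* KMSAN_ENTER/KMSAN_LEAVE bracket the call into C for the KMSAN runtime; they expand to nothing on kernels built without KMSAN. */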
	KMSAN_ENTER
	movq	%rsp, %rdi
	call	trap_check
	KMSAN_LEAVE
	jmp	doreti			/* Handle any pending ASTs */

	/*
	 * alltraps_noen_u/k entry points.
	 * Again, SWAPGS must already have been performed by the prologue,
	 * if needed.  Unlike alltraps above, we want to leave interrupts
	 * disabled.  This corresponds to SDT_SYS386IGT on the i386 port.
	 */
	SUPERALIGN_TEXT
	.globl	alltraps_noen_u
	.type	alltraps_noen_u,@function
alltraps_noen_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	jmp	alltraps_noen_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_noen_k
	.type	alltraps_noen_k,@function
alltraps_noen_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
alltraps_noen_save_segs:
	SAVE_SEGS
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_pushregs_no_rax
	call	handle_ibrs_entry
	jmp	alltraps_pushregs_no_rax

IDTVEC(dblfault)
	subq	$TF_ERR,%rsp
	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
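	/* Retrieve and load the canonical GS.base, stored just above this IST stack's trapframe, as in the NMI and MC# handlers. */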
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_GSBASE,%ecx
	wrmsr
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	2f
	movq	%rax,%cr3
2:	KMSAN_ENTER
	movq	%rsp,%rdi
	call	dblfault_handler
	KMSAN_LEAVE
3:	hlt
	jmp	3b

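/*
 * Page fault entry.  Unlike the other vectors, this one must read
 * %cr2 before interrupts may be re-enabled, and under PTI it saves
 * the faulting %cr3 into PCPU(SAVED_UCR3) (later copied into the
 * PCB) before switching to the kernel page table.
 */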
	ALIGN_TEXT
IDTVEC(page_pti)
	testb	$SEL_RPL_MASK,PTI_CS-PTI_ERR(%rsp)
	jz	page_k
	swapgs
	lfence
	pushq	%rax
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	cmpq	$~0,PCPU(UCR3)
	jne	1f
	popq	%rax
	jmp	page_u
1:	pushq	%rdx
	PTI_UUENTRY has_err=1
	jmp	page_u
	ALIGN_TEXT
IDTVEC(page)
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) /* Did we come from kernel? */
	jnz	page_u_swapgs		/* already running with kernel GS.base */
page_k:
	lfence
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	jmp	page_cr2
	ALIGN_TEXT
page_u_swapgs:
	swapgs
	lfence
page_u:
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	movq	PCPU(SAVED_UCR3),%rax
	movq	%rax,PCB_SAVED_UCR3(%rdi)
	call	handle_ibrs_entry
page_cr2:
	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
	SAVE_SEGS
	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
	 * to do a special swapgs in this case even when coming from the
	 * kernel.  XXX linux has a trap handler for their equivalent of
	 * load_gs().
	 *
	 * On the stack, we have the hardware interrupt frame for the
	 * (faulted) return to usermode, and another frame with an error
	 * code, for the fault itself.  For PTI, copy both frames to the
	 * main thread stack.  Handle the potential 16-byte alignment
	 * adjustment incurred during the second fault by copying both
	 * frames independently while unwinding the stack in between.
	 */
	.macro PROTF_ENTRY name,trapno
\name\()_pti_doreti:
	swapgs
	lfence
	cmpq	$~0,PCPU(UCR3)
	je	1f
	pushq	%rax
	pushq	%rdx
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(RSP0),%rax
	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
	MOVE_STACKS	(PTI_SIZE / 8)
	addq	$PTI_SIZE,%rax
	movq	PTI_RSP(%rsp),%rsp
	MOVE_STACKS	(PTI_SIZE / 8 - 3)
	subq	$PTI_SIZE,%rax
	movq	%rax,%rsp
	popq	%rdx
	popq	%rax
1:	swapgs
	jmp	X\name
IDTVEC(\name\()_pti)
	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
	je	\name\()_pti_doreti
	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
	jz	X\name		/* lfence is not needed until %gs: use */
	PTI_UENTRY has_err=1
	swapgs	/* fence provided by PTI_UENTRY */
IDTVEC(\name)
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	jmp	prot_addrf
	.endm

	PROTF_ENTRY	missing, T_SEGNPFLT
	PROTF_ENTRY	stk, T_STKFLT
	PROTF_ENTRY	prot, T_PROTFLT

prot_addrf:
	movq	$0,TF_ADDR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	leaq	doreti_iret(%rip),%rdi
	cmpq	%rdi,TF_RIP(%rsp)
	je	5f			/* kernel but with user gsbase!! */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	6f			/* already running with kernel GS.base */
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	2f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	1f
	rdfsbase %rax
1:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	2f
	rdgsbase %rdx
2:	swapgs
	lfence
	movq	PCPU(CURPCB),%rdi
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	4f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	3f
	movq	%rax,PCB_FSBASE(%rdi)
3:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	4f
	movq	%rdx,PCB_GSBASE(%rdi)
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* full iret from user #gp */
4:	call	handle_ibrs_entry
	movw	%es,TF_ES(%rsp)
	movw	%ds,TF_DS(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

5:	swapgs
6:	lfence
	movq	PCPU(CURPCB),%rdi
	jmp	4b

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We do not support invoking this from custom segment registers,
 * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
 */
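
/*
 * Note: machine-dependent startup code is expected to install this
 * entry (or the PTI variant) into MSR_LSTAR, so that the SYSCALL
 * instruction lands here.
 */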
	SUPERALIGN_TEXT
IDTVEC(fast_syscall_pti)
	swapgs
	cmpq	$~0,PCPU(UCR3)
	je	fast_syscall_common
	movq	%rax,PCPU(SCRATCH_RAX)
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(SCRATCH_RAX),%rax
	jmp	fast_syscall_common
	SUPERALIGN_TEXT
IDTVEC(fast_syscall)
	swapgs
fast_syscall_common:
	movq	%rsp,PCPU(SCRATCH_RSP)
	movq	PCPU(RSP0),%rsp
	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	/*
	 * Save a few arg registers early to free them for use in
	 * handle_ibrs_entry().  %r10 is especially tricky.  It is not an
	 * arg register, but it holds the arg register %rcx.  Profiling
	 * preserves %rcx, but may clobber %r10.  Profiling may also
	 * clobber %r11, but %r11 (original %rflags) has been saved.
	 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	SAVE_SEGS
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%r11
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
	sti
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	movq	PCPU(CURTHREAD),%rdi
	movq	%rsp,TD_FRAME(%rdi)
	movl	TF_RFLAGS(%rsp),%esi
	andl	$PSL_T,%esi
	call	amd64_syscall
1:	movq	PCPU(CURPCB),%rax
	/* Disable interrupts before testing PCB_FULL_IRET. */
	cli
	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
	jnz	4f
	/* Check for and handle ASTs on return to userland. */
	movq	PCPU(CURTHREAD),%rax
	cmpl	$0,TD_AST(%rax)
	jne	3f
	call	handle_ibrs_exit
	callq	*mds_handler
	/* Restore preserved registers. */
	movq	TF_RDI(%rsp),%rdi	/* bonus: preserve arg 1 */
	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
	movq	TF_RAX(%rsp),%rax	/* return value 1 */
	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
	movq	TF_RIP(%rsp),%rcx	/* original %rip */
	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
	xorl	%r8d,%r8d		/* zero the rest of GPRs */
	xorl	%r10d,%r10d
	cmpq	$~0,PCPU(UCR3)
	je	2f
	movq	PCPU(UCR3),%r9
	andq	PCPU(UCR3_LOAD_MASK),%r9
	movq	%r9,%cr3
2:	xorl	%r9d,%r9d
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
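	/* sysretq reloads %rip from %rcx and %rflags from %r11, both restored above. */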
	sysretq

3:	/* AST scheduled. */
	sti
	movq	%rsp,%rdi
	call	ast
	jmp	1b

4:	/* Requested full context restore, use doreti for that. */
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32-bit compatibility mode.  MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * The DB# handler is very similar to the NMI handler, because
 * 'mov/pop %ss' delays generation of the exception until the next
 * instruction is executed, which might be a kernel entry.  So we
 * must execute the handler on the IST stack and be ready for a
 * non-kernel GSBASE.
 */
IDTVEC(dbg)
	subq	$TF_RIP,%rsp
	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	dbg_fromuserspace
	lfence
	/*
	 * We've interrupted the kernel.  See the comment in the NMI
	 * handler about register use.
	 */
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	2f
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
2:	movq	%rsp,%rdi
	call	trap
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	3f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
3:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret
dbg_fromuserspace:
	/*
	 * Switch to kernel GSBASE and kernel page table, and copy frame
	 * from the IST stack to the normal kernel stack, since trap()
	 * re-enables interrupts, and since we might trap on DB# while
	 * in trap().
	 */
	swapgs
	lfence
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	movq	PCPU(RSP0),%rax
	movl	$TF_SIZE,%ecx
	subq	%rcx,%rax
	movq	%rax,%rdi
	movq	%rsp,%rsi
	rep;movsb
	movq	%rax,%rsp
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	2f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
2:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	3f
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:	jmp	calltrap

/*
 * NMI handling is special.
 *
 * First, NMIs do not respect the state of the processor's RFLAGS.IF
 * bit.  The NMI handler may be entered at any time, including when
 * the processor is in a critical section with RFLAGS.IF == 0.
 * The processor's GS.base value could be invalid on entry to the
 * handler.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We thus need to execute
 * the NMI handler with interrupts disabled, to prevent a nested interrupt
 * from executing an 'iretq' instruction and inadvertently taking the
 * processor out of NMI mode.
 *
 * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
 * GS.base value for the processor is stored just above the bottom of its
 * NMI stack.  For NMIs taken from kernel mode, the current value in
 * the processor's GS.base is saved at entry to C-preserved register %r12,
 * the canonical value for GS.base is then loaded into the processor, and
 * the saved value is restored at exit time.  For NMIs taken from user mode,
 * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
 */

IDTVEC(nmi)
	subq	$TF_RIP,%rsp
	movl	$(T_NMI),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	nmi_fromuserspace
	/*
	 * We've interrupted the kernel.  Preserve in callee-saved regs:
	 * GS.base in %r12,
	 * %cr3 in %r13,
	 * possibly the lower half of MSR_IA32_SPEC_CTRL in %r14d,
	 * %cr2 in %r15.
	 */
	lfence
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	nmi_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	nmi_calltrap
nmi_fromuserspace:
	incl	%ebx
	swapgs
	lfence
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	testq	%rdi,%rdi
	jz	3f
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	2f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
2:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	3f
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:
/* Note: this label is also used by ddb and gdb: */
nmi_calltrap:
	KMSAN_ENTER
	movq	%rsp,%rdi
	call	trap
	KMSAN_LEAVE
#ifdef HWPMC_HOOKS
	/*
	 * Capture a userspace callchain if needed.
	 *
	 * - Check if the current trap was from user mode.
	 * - Check if the current thread is valid.
	 * - Check if the thread requires a user call chain to be
	 *   captured.
	 *
	 * We are still in NMI mode at this point.
	 */
	testl	%ebx,%ebx
	jz	nocallchain	/* not from userspace */
	movq	PCPU(CURTHREAD),%rax
	orq	%rax,%rax	/* curthread present? */
	jz	nocallchain
	/*
	 * Move execution to the regular kernel stack, because we
	 * committed to return through doreti.
	 */
	movq	%rsp,%rsi	/* source stack pointer */
	movq	$TF_SIZE,%rcx
	movq	PCPU(RSP0),%rdx
	subq	%rcx,%rdx
	movq	%rdx,%rdi	/* destination stack pointer */
	shrq	$3,%rcx		/* trap frame size in long words */
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	rep
	movsq			/* copy trapframe */
	movq	%rdx,%rsp	/* we are on the regular kstack */

	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
	jz	nocallchain
	/*
	 * A user callchain is to be captured, so:
	 * - Take the processor out of "NMI" mode by faking an "iret",
	 *   to allow for nested NMI interrupts.
	 * - Enable interrupts, so that copyin() can work.
	 */
	movl	%ss,%eax
	pushq	%rax		/* tf_ss */
	pushq	%rdx		/* tf_rsp (on kernel stack) */
	pushfq			/* tf_rflags */
	movl	%cs,%eax
	pushq	%rax		/* tf_cs */
	pushq	$outofnmi	/* tf_rip */
	iretq
outofnmi:
	/*
	 * At this point the processor has exited NMI mode and is running
	 * with interrupts turned off on the normal kernel stack.
	 *
	 * If a pending NMI gets recognized at or after this point, it
	 * will cause a kernel callchain to be traced.
	 *
	 * We turn interrupts back on, and call the user callchain capture hook.
	 */
	movq	pmc_hook,%rax
	orq	%rax,%rax
	jz	nocallchain
	movq	PCPU(CURTHREAD),%rdi		/* thread */
	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
	movq	%rsp,%rdx			/* frame */
	sti
	call	*%rax
	cli
nocallchain:
#endif
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	cmpb	$0, nmi_flush_l1d_sw(%rip)
	je	2f
	call	flush_l1d_sw		/* bhyve L1TF assist */
2:	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

/*
 * MC# handling is similar to NMI.
 *
 * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
 * can occur at any time with a GS.base value that does not correspond
 * to the privilege level in CS.
 *
 * Machine checks are not unblocked by iretq, but it is best to run
 * the handler with interrupts disabled since the exception may have
 * interrupted a critical section.
 *
 * The MC# handler runs on its own stack (tss_ist3).  The canonical
 * GS.base value for the processor is stored just above the bottom of
 * its MC# stack.  For exceptions taken from kernel mode, the current
 * value in the processor's GS.base is saved at entry to C-preserved
 * register %r12, the canonical value for GS.base is then loaded into
 * the processor, and the saved value is restored at exit time.  For
 * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
 * are used for swapping GS.base.
 */

IDTVEC(mchk)
	subq	$TF_RIP,%rsp
	movl	$(T_MCHK),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	mchk_fromuserspace
	/*
	 * We've interrupted the kernel.  See the comment in the NMI
	 * handler about register use.
	 */
	movq	%cr2,%r15
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	mchk_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	mchk_calltrap
mchk_fromuserspace:
	incl	%ebx
	swapgs
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
/* Note: this label is also used by ddb and gdb: */
mchk_calltrap:
	KMSAN_ENTER
	movq	%rsp,%rdi
	call	mca_intr
	KMSAN_LEAVE
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	movq	%r15,%cr2
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

ENTRY(fork_trampoline)
	movq	%r12,%rdi		/* function */
	movq	%rbx,%rsi		/* arg1 */
	movq	%rsp,%rdx		/* trapframe pointer */
	call	fork_exit
	jmp	doreti			/* Handle any ASTs */

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

#ifdef COMPAT_FREEBSD32
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/ia32/ia32_exception.S>
#endif

	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT
#include <amd64/amd64/apic_vector.S>

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/amd64/atpic_vector.S>
#endif

/*
 * void doreti(struct trapframe)
 *
 * Handle return from interrupts, traps and syscalls.
 */
	.text
	SUPERALIGN_TEXT
	.type	doreti,@function
	.globl	doreti
doreti:
	/*
	 * Check if ASTs can be handled now.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
	jz	doreti_exit		/* can't handle ASTs now if not */

doreti_ast:
	/*
	 * Check for ASTs atomically with returning.  Disabling CPU
	 * interrupts provides sufficient locking even in the SMP case,
	 * since we will be informed of any new ASTs by an IPI.
	 */
	cli
	movq	PCPU(CURTHREAD),%rax
	cmpl	$0,TD_AST(%rax)
	je	doreti_exit
	sti
	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
	call	ast
	jmp	doreti_ast

	/*
	 * doreti_exit:	pop registers, iret.
	 *
	 *	The segment register pop is a special case, since it may
	 *	fault if (for example) a sigreturn specifies bad segment
	 *	registers.  The fault is handled in trap.c.
	 */
doreti_exit:
	movq	PCPU(CURPCB),%r8

	/*
	 * Do not reload segment registers for the kernel.
	 * Since we do not reload segment registers with sane
	 * values on kernel entry, descriptors referenced by the
	 * segment registers might not be valid.  This is fatal
	 * for user mode, but is not a problem for the kernel.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	ld_regs
	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
	jz	ld_regs
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
	je	set_segs

do_segs:
	/* Restore %fs and fsbase */
	movw	TF_FS(%rsp),%ax
	.globl	ld_fs
ld_fs:
	movw	%ax,%fs
	cmpw	$KUF32SEL,%ax
	jne	1f
	movl	$MSR_FSBASE,%ecx
	movl	PCB_FSBASE(%r8),%eax
	movl	PCB_FSBASE+4(%r8),%edx
	.globl	ld_fsbase
ld_fsbase:
	wrmsr
1:
	/* Restore %gs and gsbase */
	movw	TF_GS(%rsp),%si
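	/*
	 * Loading the %gs selector below clobbers GS.base; keep
	 * interrupts disabled until the kernel base is back in place,
	 * since an interrupt taken in kernel mode would use
	 * %gs-relative PCPU accesses.
	 */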
	pushfq
	cli
	movl	$MSR_GSBASE,%ecx
	/* Save current kernel %gs base into %r12d:%r13d */
	rdmsr
	movl	%eax,%r12d
	movl	%edx,%r13d
	.globl	ld_gs
ld_gs:
	movw	%si,%gs
	/* Save user %gs base into %r14d:%r15d */
	rdmsr
	movl	%eax,%r14d
	movl	%edx,%r15d
	/* Restore kernel %gs base */
	movl	%r12d,%eax
	movl	%r13d,%edx
	wrmsr
	popfq
	/*
	 * Restore the user %gs base, either from the PCB if the selector
	 * is used for TLS, or from the previously saved MSR read.
	 */
	movl	$MSR_KGSBASE,%ecx
	cmpw	$KUG32SEL,%si
	jne	1f
	movl	PCB_GSBASE(%r8),%eax
	movl	PCB_GSBASE+4(%r8),%edx
	jmp	ld_gsbase
1:
	movl	%r14d,%eax
	movl	%r15d,%edx
	.globl	ld_gsbase
ld_gsbase:
	wrmsr	/* May trap if non-canonical, but only for TLS. */
	.globl	ld_es
ld_es:
	movw	TF_ES(%rsp),%es
	.globl	ld_ds
ld_ds:
	movw	TF_DS(%rsp),%ds
ld_regs:
	RESTORE_REGS
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	2f			/* keep running with kernel GS.base */
	cli
	call	handle_ibrs_exit_rs
	callq	*mds_handler
	cmpq	$~0,PCPU(UCR3)
	je	1f
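	/*
	 * PTI return path: build a minimal iret frame on the per-CPU
	 * trampoline stack, switch to it, load the user %cr3, and only
	 * then iretq, so the kernel stack is never touched with the
	 * user page table active.
	 */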
	pushq	%rdx
	movq	PCPU(PTI_RSP0),%rdx
	subq	$PTI_SIZE,%rdx
	movq	%rax,PTI_RAX(%rdx)
	popq	%rax
	movq	%rax,PTI_RDX(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,PTI_RIP(%rdx)
	movq	TF_CS(%rsp),%rax
	movq	%rax,PTI_CS(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,PTI_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,PTI_RSP(%rdx)
	movq	TF_SS(%rsp),%rax
	movq	%rax,PTI_SS(%rdx)
	movq	PCPU(UCR3),%rax
	andq	PCPU(UCR3_LOAD_MASK),%rax
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
	movq	%rdx,%rsp
	movq	%rax,%cr3
	popq	%rdx
	popq	%rax
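	/* Skip the error-code slot of the trampoline frame. */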
	addq	$8,%rsp
	jmp	doreti_iret
1:	swapgs
2:	addq	$TF_RIP,%rsp
	.globl	doreti_iret
doreti_iret:
	iretq

set_segs:
	movw	$KUDSEL,%ax
	movw	%ax,TF_DS(%rsp)
	movw	%ax,TF_ES(%rsp)
	movw	$KUF32SEL,TF_FS(%rsp)
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	do_segs

	/*
	 * doreti_iret_fault.  Alternative return code for
	 * the case where we get a fault in the doreti_exit code
	 * above.  trap() (amd64/amd64/trap.c) catches this specific
	 * case, sends the process a signal and continues in the
	 * corresponding place in the code below.
	 */
	ALIGN_TEXT
	.globl	doreti_iret_fault
doreti_iret_fault:
	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	call	handle_ibrs_entry
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
	movq	$0,TF_ADDR(%rsp)
	jmp	calltrap

	ALIGN_TEXT
	.globl	ds_load_fault
ds_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_DS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	es_load_fault
es_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_ES(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fs_load_fault
fs_load_fault:
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	%rsp,%rdi
	call	trap
	movw	$KUF32SEL,TF_FS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	gs_load_fault
gs_load_fault:
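	/*
	 * The faulting ld_gs sits between the pushfq and popfq in
	 * doreti_exit above, so the saved %rflags image is still on
	 * the stack; discard it.
	 */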
	popfq
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fsbase_load_fault
fsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_FSBASE(%r8)
	jmp	doreti

	ALIGN_TEXT
	.globl	gsbase_load_fault
gsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_GSBASE(%r8)
	jmp	doreti

#ifdef HWPMC_HOOKS
	ENTRY(end_exceptions)
#endif
