/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007-2018 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_atpic.h"
#include "opt_hwpmc_hooks.h"

#include "assym.inc"

#include <machine/psl.h>
#include <machine/asmacros.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <machine/pmap.h>

#ifdef KDTRACE_HOOKS
	.bss
	.globl	dtrace_invop_jump_addr
	.align	8
	.type	dtrace_invop_jump_addr,@object
	.size	dtrace_invop_jump_addr,8
dtrace_invop_jump_addr:
	.zero	8
	.globl	dtrace_invop_calltrap_addr
	.align	8
	.type	dtrace_invop_calltrap_addr,@object
	.size	dtrace_invop_calltrap_addr,8
dtrace_invop_calltrap_addr:
	.zero	8
#endif
	.text
#ifdef HWPMC_HOOKS
	ENTRY(start_exceptions)
#endif

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * our use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we automatically
 * then enable interrupts if the interrupted context had them enabled.
 * This is equivalent to the i386 port's use of SDT_SYS386TGT.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See amd64/include/frame.h.
 * The current RFLAGS (the status register, which includes the interrupt
 * disable state prior to the trap), the code segment register, and the
 * return instruction pointer are pushed by the cpu.  The cpu will also
 * push an 'error' code for certain traps.  We push a dummy error code
 * for those traps where the cpu doesn't push one, in order to maintain
 * a consistent frame.  We also push a contrived 'trap number'.
 *
 * The CPU does not push the general registers, so we must do that, and we
 * must restore them prior to calling 'iret'.  The CPU adjusts %cs and %ss
 * but does not mess with %ds, %es, %gs or %fs.  We swap the %gs base for
 * kernel mode operation shortly, without changing the loaded selector.
 * Since superuser long mode works with any selectors loaded into segment
 * registers other than %cs, which makes them mostly unused in long mode,
 * and the kernel does not reference %fs, we leave them alone.  The segment
 * registers are reloaded on return to usermode.
 */
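/*
 * Note on the prologue idiom used below: the hardware has already pushed
 * %ss, %rsp, %rflags, %cs and %rip (and, for some vectors, an error code),
 * i.e. the top of 'struct trapframe'.  Subtracting TF_RIP (or TF_ERR) from
 * %rsp therefore reserves the rest of the trapframe, so that the TF_*
 * offsets from assym.inc address a complete frame.
 */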

MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)

/* Traps that we leave interrupts disabled for. */
	.macro	TRAP_NOEN	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_noen_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_noen_k
	swapgs
	lfence
	jmp	alltraps_noen_u
	.endm
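/*
 * The low two bits of the saved %cs are the RPL of the interrupted
 * context: zero means the trap came from the kernel (GS.base is already
 * the kernel's), non-zero means it came from userland and swapgs is
 * required first.  The lfence after swapgs (and on the kernel path) acts
 * as a speculation barrier, so that mispredicted execution cannot perform
 * %gs-relative accesses with the wrong GS.base.
 */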

	TRAP_NOEN	bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
#endif

/* Regular traps; the cpu does not supply tf_err for these. */
	.macro	TRAP	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u
\l\()_pti_k:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	jmp	alltraps_u

	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_RIP,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm

	TRAP	div, T_DIVIDE
	TRAP	ofl, T_OFLOW
	TRAP	bnd, T_BOUND
	TRAP	ill, T_PRIVINFLT
	TRAP	dna, T_DNA
	TRAP	fpusegm, T_FPOPFLT
	TRAP	rsvd, T_RESERVED
	TRAP	fpu, T_ARITHTRAP
	TRAP	xmm, T_XMMFLT

/* This group of traps has tf_err already pushed by the cpu. */
	.macro	TRAP_ERR	l, trapno
	PTI_ENTRY	\l,\l\()_pti_k,\l\()_pti_u,has_err=1
\l\()_pti_k:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_k
\l\()_pti_u:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	jmp	alltraps_u
	.globl	X\l
	.type	X\l,@function
X\l:
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_k
	swapgs
	lfence
	jmp	alltraps_u
	.endm
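/*
 * Because the cpu itself pushed the error code for these vectors, only
 * $TF_ERR is subtracted from %rsp and tf_err is left untouched, unlike
 * the TRAP/TRAP_NOEN prologues above.
 */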

	TRAP_ERR	tss, T_TSSFLT
	TRAP_ERR	align, T_ALIGNFLT

	/*
	 * alltraps_u/k entry points.
	 * SWAPGS must already have been performed by the prologue
	 * if this is the first entry into the kernel from userland.
	 * Reenable interrupts if they were enabled before the trap.
	 * This approximates SDT_SYS386TGT on the i386 port.
	 */
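/*
 * PCPU(member) expands to a %gs-relative operand (%gs:PC_<member>), i.e.
 * an access into this CPU's per-CPU area.  This is why the prologues above
 * must get GS.base right (swapgs) before any PCPU() access is made.  For
 * example, PCPU(CURPCB) below is roughly the assembler-level equivalent
 * of the C expression curthread->td_pcb, read through the per-CPU area.
 */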
	SUPERALIGN_TEXT
	.globl	alltraps_u
	.type	alltraps_u,@function
alltraps_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	call	handle_ibrs_entry
	jmp	alltraps_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_k
	.type	alltraps_k,@function
alltraps_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
alltraps_save_segs:
	SAVE_SEGS
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
alltraps_pushregs_no_rax:
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	FAKE_MCOUNT(TF_RIP(%rsp))
#ifdef KDTRACE_HOOKS
	/*
	 * DTrace Function Boundary Trace (fbt) probes are triggered
	 * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint
	 * interrupt. For all other trap types, just handle them in
	 * the usual way.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jnz	calltrap		/* ignore userland traps */
	cmpl	$T_BPTFLT,TF_TRAPNO(%rsp)
	jne	calltrap

	/* Check if there is no DTrace hook registered. */
	cmpq	$0,dtrace_invop_jump_addr
	je	calltrap

	/*
	 * Set our jump address for the jump back in the event that
	 * the breakpoint wasn't caused by DTrace at all.
	 */
	movq	$calltrap,dtrace_invop_calltrap_addr(%rip)

	/* Jump to the code hooked in by DTrace. */
	jmpq	*dtrace_invop_jump_addr
#endif
	.globl	calltrap
	.type	calltrap,@function
calltrap:
	movq	%rsp,%rdi
	call	trap_check
	MEXITCOUNT
	jmp	doreti			/* Handle any pending ASTs */
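/*
 * trap_check() lives in amd64/amd64/trap.c.  With KDTRACE_HOOKS it first
 * offers the frame to the registered DTrace trap hook and only falls
 * through to trap() if the hook declines; without the hooks it reduces
 * to a plain call to trap().  A simplified sketch of its shape:
 *
 *	void trap_check(struct trapframe *frame)
 *	{
 *	#ifdef KDTRACE_HOOKS
 *		if (dtrace_trap_func != NULL &&
 *		    (*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
 *			return;
 *	#endif
 *		trap(frame);
 *	}
 */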

	/*
	 * alltraps_noen_u/k entry points.
	 * Again, SWAPGS must already have been performed by the prologue,
	 * if needed.  Unlike alltraps above, we want to leave interrupts
	 * disabled.  This corresponds to SDT_SYS386IGT on the i386 port.
	 */
	SUPERALIGN_TEXT
	.globl	alltraps_noen_u
	.type	alltraps_noen_u,@function
alltraps_noen_u:
	movq	%rdi,TF_RDI(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	jmp	alltraps_noen_save_segs
	SUPERALIGN_TEXT
	.globl	alltraps_noen_k
	.type	alltraps_noen_k,@function
alltraps_noen_k:
	lfence
	movq	%rdi,TF_RDI(%rsp)
alltraps_noen_save_segs:
	SAVE_SEGS
	movq	%rdx,TF_RDX(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	alltraps_pushregs_no_rax
	call	handle_ibrs_entry
	jmp	alltraps_pushregs_no_rax

IDTVEC(dblfault)
	subq	$TF_ERR,%rsp
	movl	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
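	/*
	 * The handlers that run on their own IST stack (#DF here, and DB#,
	 * NMI and MC# below) store the canonical GS.base value for this
	 * CPU in the word just above the stack bottom, i.e. at
	 * TF_SIZE(%rsp) once the trapframe has been reserved (see the NMI
	 * comment below).  Load it into MSR_GSBASE with wrmsr, since
	 * GS.base at entry cannot be trusted.
	 */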
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	movl	$MSR_GSBASE,%ecx
	wrmsr
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	2f
	movq	%rax,%cr3
2:	movq	%rsp,%rdi
	call	dblfault_handler
3:	hlt
	jmp	3b

	ALIGN_TEXT
IDTVEC(page_pti)
	testb	$SEL_RPL_MASK,PTI_CS-PTI_ERR(%rsp)
	jz	page_k
	swapgs
	lfence
	pushq	%rax
	movq	%cr3,%rax
	movq	%rax,PCPU(SAVED_UCR3)
	cmpq	$~0,PCPU(UCR3)
	jne	1f
	popq	%rax
	jmp	page_u
1:	pushq	%rdx
	PTI_UUENTRY has_err=1
	jmp	page_u
	ALIGN_TEXT
IDTVEC(page)
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) /* Did we come from kernel? */
	jnz	page_u_swapgs		/* No: from userland, swapgs needed */
page_k:
	lfence
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	jmp	page_cr2
	ALIGN_TEXT
page_u_swapgs:
	swapgs
	lfence
page_u:
	subq	$TF_ERR,%rsp
	movq	%rdi,TF_RDI(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	PCPU(CURPCB),%rdi
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
	movq	PCPU(SAVED_UCR3),%rax
	movq	%rax,PCB_SAVED_UCR3(%rdi)
	call	handle_ibrs_entry
page_cr2:
	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
	SAVE_SEGS
	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
	 * to do a special swapgs in this case even when coming from the
	 * kernel.  XXX linux has a trap handler for their equivalent of
	 * load_gs().
	 *
	 * On the stack, we have the hardware interrupt frame to return
	 * to usermode (faulted) and another frame with error code, for
	 * fault.  For PTI, copy both frames to the main thread stack.
	 * Handle the potential 16-byte alignment adjustment incurred
	 * during the second fault by copying both frames independently
	 * while unwinding the stack in between.
	 */
	.macro PROTF_ENTRY name,trapno
\name\()_pti_doreti:
	swapgs
	lfence
	cmpq	$~0,PCPU(UCR3)
	je	1f
	pushq	%rax
	pushq	%rdx
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(RSP0),%rax
	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
	MOVE_STACKS	(PTI_SIZE / 8)
	addq	$PTI_SIZE,%rax
	movq	PTI_RSP(%rsp),%rsp
	MOVE_STACKS	(PTI_SIZE / 8 - 3)
	subq	$PTI_SIZE,%rax
	movq	%rax,%rsp
	popq	%rdx
	popq	%rax
1:	swapgs
	jmp	X\name
IDTVEC(\name\()_pti)
	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
	je	\name\()_pti_doreti
	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
	jz	X\name		/* lfence is not needed until %gs: use */
	PTI_UENTRY has_err=1
	swapgs	/* fence provided by PTI_UENTRY */
IDTVEC(\name)
	subq	$TF_ERR,%rsp
	movl	$\trapno,TF_TRAPNO(%rsp)
	jmp	prot_addrf
	.endm

	PROTF_ENTRY	missing, T_SEGNPFLT
	PROTF_ENTRY	stk, T_STKFLT
	PROTF_ENTRY	prot, T_PROTFLT

prot_addrf:
	movq	$0,TF_ADDR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	leaq	doreti_iret(%rip),%rdi
	cmpq	%rdi,TF_RIP(%rsp)
	je	5f			/* kernel but with user gsbase!! */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	6f			/* already running with kernel GS.base */
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	2f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	1f
	rdfsbase %rax
1:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	2f
	rdgsbase %rdx
2:	swapgs
	lfence
	movq	PCPU(CURPCB),%rdi
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	4f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	3f
	movq	%rax,PCB_FSBASE(%rdi)
3:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	4f
	movq	%rdx,PCB_GSBASE(%rdi)
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* full iret from user #gp */
4:	call	handle_ibrs_entry
	movw	%es,TF_ES(%rsp)
	movw	%ds,TF_DS(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	alltraps_pushregs_no_rax
	sti
	jmp	alltraps_pushregs_no_rax

5:	swapgs
6:	lfence
	movq	PCPU(CURPCB),%rdi
	jmp	4b
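	/*
	 * The rdfsbase/rdgsbase dance above handles a fault taken from
	 * user code that uses the 32-bit %fs/%gs selectors (KUF32SEL/
	 * KUG32SEL) for TLS: the current bases are read while they are
	 * still the user's (before swapgs replaces GS.base) and stashed
	 * in the PCB, and PCB_FULL_IRET makes doreti perform the full
	 * segment reload so the bases survive the return.
	 */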

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We do not support invoking this from custom segment registers,
 * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
 */
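/*
 * Register convention of the 'syscall' instruction itself: the cpu saves
 * the return %rip in %rcx and the pre-syscall %rflags in %r11, loads %rip
 * from MSR_LSTAR and masks %rflags with MSR_SF_MASK; it does not switch
 * stacks.  Hence the PCPU(SCRATCH_RSP) juggling below, and hence sysretq
 * later restores %rip and %rflags from %rcx and %r11.
 */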
	SUPERALIGN_TEXT
IDTVEC(fast_syscall_pti)
	swapgs
	lfence
	cmpq	$~0,PCPU(UCR3)
	je	fast_syscall_common
	movq	%rax,PCPU(SCRATCH_RAX)
	movq	PCPU(KCR3),%rax
	movq	%rax,%cr3
	movq	PCPU(SCRATCH_RAX),%rax
	jmp	fast_syscall_common
	SUPERALIGN_TEXT
IDTVEC(fast_syscall)
	swapgs
	lfence
fast_syscall_common:
	movq	%rsp,PCPU(SCRATCH_RSP)
	movq	PCPU(RSP0),%rsp
	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	/*
	 * Save a few arg registers early to free them for use in
	 * handle_ibrs_entry().  %r10 is especially tricky.  It is not an
	 * arg register, but it holds the arg register %rcx.  Profiling
	 * preserves %rcx, but may clobber %r10.  Profiling may also
	 * clobber %r11, but %r11 (the original %rflags) has been saved.
	 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	SAVE_SEGS
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%r11
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
	sti
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	PCPU(CURTHREAD),%rdi
	movq	%rsp,TD_FRAME(%rdi)
	movl	TF_RFLAGS(%rsp),%esi
	andl	$PSL_T,%esi
	call	amd64_syscall
1:	movq	PCPU(CURPCB),%rax
	/* Disable interrupts before testing PCB_FULL_IRET. */
	cli
	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
	jnz	4f
	/* Check for and handle AST's on return to userland. */
	movq	PCPU(CURTHREAD),%rax
	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
	jne	3f
	call	handle_ibrs_exit
	callq	*mds_handler
	/* Restore preserved registers. */
	MEXITCOUNT
	movq	TF_RDI(%rsp),%rdi	/* bonus: preserve arg 1 */
	movq	TF_RSI(%rsp),%rsi	/* bonus: preserve arg 2 */
	movq	TF_RDX(%rsp),%rdx	/* return value 2 */
	movq	TF_RAX(%rsp),%rax	/* return value 1 */
	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
	movq	TF_RIP(%rsp),%rcx	/* original %rip */
	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
	xorl	%r8d,%r8d		/* zero the rest of GPRs */
	xorl	%r10d,%r10d
	cmpq	$~0,PCPU(UCR3)
	je	2f
	movq	PCPU(UCR3),%r9
	andq	PCPU(UCR3_LOAD_MASK),%r9
	movq	%r9,%cr3
2:	xorl	%r9d,%r9d
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
	sysretq

3:	/* AST scheduled. */
	sti
	movq	%rsp,%rdi
	call	ast
	jmp	1b

4:	/* Requested full context restore, use doreti for that. */
	MEXITCOUNT
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * The DB# handler is very similar to the NMI handler, because
 * 'mov/pop %ss' delays generation of the exception until the next
 * instruction is executed, which might be a kernel entry.  So we must
 * execute the handler on an IST stack and be ready for a non-kernel
 * GSBASE.
 */
IDTVEC(dbg)
	subq	$TF_RIP,%rsp
	movl	$(T_TRCTRAP),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	dbg_fromuserspace
	lfence
	/*
	 * We've interrupted the kernel.  Preserve GS.base in %r12,
	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
	 */
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	2f
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
2:	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp,%rdi
	call	trap
	MEXITCOUNT
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	3f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
3:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret
dbg_fromuserspace:
	/*
	 * Switch to kernel GSBASE and kernel page table, and copy frame
	 * from the IST stack to the normal kernel stack, since trap()
	 * re-enables interrupts, and since we might trap on DB# while
	 * in trap().
	 */
	swapgs
	lfence
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	movq	PCPU(RSP0),%rax
	movl	$TF_SIZE,%ecx
	subq	%rcx,%rax
	movq	%rax,%rdi
	movq	%rsp,%rsi
	rep;movsb
	movq	%rax,%rsp
	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	2f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
2:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	3f
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:	jmp	calltrap

/*
 * NMI handling is special.
 *
 * First, NMIs do not respect the state of the processor's RFLAGS.IF
 * bit.  The NMI handler may be entered at any time, including when
 * the processor is in a critical section with RFLAGS.IF == 0.
 * The processor's GS.base value could be invalid on entry to the
 * handler.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We thus need to execute
 * the NMI handler with interrupts disabled, to prevent a nested interrupt
 * from executing an 'iretq' instruction and inadvertently taking the
 * processor out of NMI mode.
 *
 * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
 * GS.base value for the processor is stored just above the bottom of its
 * NMI stack.  For NMIs taken from kernel mode, the current value in
 * the processor's GS.base is saved at entry to C-preserved register %r12,
 * the canonical value for GS.base is then loaded into the processor, and
 * the saved value is restored at exit time.  For NMIs taken from user mode,
 * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
 */
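/*
 * Within the NMI (and MC#) handlers, %ebx acts as a came-from-user flag:
 * zeroed on entry, incremented on the *_fromuserspace path, and tested
 * before exit.  Nonzero (user) exits through doreti_exit; zero (kernel)
 * restores the preserved MSR_GSBASE and %cr3 values and iretqs directly.
 */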

IDTVEC(nmi)
	subq	$TF_RIP,%rsp
	movl	$(T_NMI),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	nmi_fromuserspace
	/*
	 * We've interrupted the kernel.  Preserve GS.base in %r12,
	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
	 */
	lfence
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	nmi_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	nmi_calltrap
nmi_fromuserspace:
	incl	%ebx
	swapgs
	lfence
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
	movq	PCPU(CURPCB),%rdi
	testq	%rdi,%rdi
	jz	3f
	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
	jz	3f
	cmpw	$KUF32SEL,TF_FS(%rsp)
	jne	2f
	rdfsbase %rax
	movq	%rax,PCB_FSBASE(%rdi)
2:	cmpw	$KUG32SEL,TF_GS(%rsp)
	jne	3f
	movl	$MSR_KGSBASE,%ecx
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax,PCB_GSBASE(%rdi)
3:
/* Note: this label is also used by ddb and gdb: */
nmi_calltrap:
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp,%rdi
	call	trap
	MEXITCOUNT
#ifdef HWPMC_HOOKS
	/*
	 * Capture a userspace callchain if needed.
	 *
	 * - Check if the current trap was from user mode.
	 * - Check if the current thread is valid.
	 * - Check if the thread requires a user call chain to be
	 *   captured.
	 *
	 * We are still in NMI mode at this point.
	 */
	testl	%ebx,%ebx
	jz	nocallchain	/* not from userspace */
	movq	PCPU(CURTHREAD),%rax
	orq	%rax,%rax	/* curthread present? */
	jz	nocallchain
	/*
	 * Move execution to the regular kernel stack, because we
	 * committed to return through doreti.
	 */
	movq	%rsp,%rsi	/* source stack pointer */
	movq	$TF_SIZE,%rcx
	movq	PCPU(RSP0),%rdx
	subq	%rcx,%rdx
	movq	%rdx,%rdi	/* destination stack pointer */
	shrq	$3,%rcx		/* trap frame size in long words */
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	rep
	movsq			/* copy trapframe */
	movq	%rdx,%rsp	/* we are on the regular kstack */

	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
	jz	nocallchain
	/*
	 * A user callchain is to be captured, so:
	 * - Take the processor out of "NMI" mode by faking an "iret",
	 *   to allow for nested NMI interrupts.
	 * - Enable interrupts, so that copyin() can work.
	 */
	movl	%ss,%eax
	pushq	%rax		/* tf_ss */
	pushq	%rdx		/* tf_rsp (on kernel stack) */
	pushfq			/* tf_rflags */
	movl	%cs,%eax
	pushq	%rax		/* tf_cs */
	pushq	$outofnmi	/* tf_rip */
	iretq
outofnmi:
	/*
	 * At this point the processor has exited NMI mode and is running
	 * with interrupts turned off on the normal kernel stack.
	 *
	 * If a pending NMI gets recognized at or after this point, it
	 * will cause a kernel callchain to be traced.
	 *
	 * We turn interrupts back on, and call the user callchain capture hook.
	 */
	movq	pmc_hook,%rax
	orq	%rax,%rax
	jz	nocallchain
	movq	PCPU(CURTHREAD),%rdi		/* thread */
	movq	$PMC_FN_USER_CALLCHAIN,%rsi	/* command */
	movq	%rsp,%rdx			/* frame */
	sti
	call	*%rax
	cli
nocallchain:
#endif
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	cmpb	$0, nmi_flush_l1d_sw(%rip)
	je	2f
	call	flush_l1d_sw		/* bhyve L1TF assist */
2:	movq	%r13,%cr3
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

/*
 * MC# handling is similar to NMI.
 *
 * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
 * can occur at any time with a GS.base value that does not correspond
 * to the privilege level in CS.
 *
 * Machine checks are not unblocked by iretq, but it is best to run
 * the handler with interrupts disabled since the exception may have
 * interrupted a critical section.
 *
 * The MC# handler runs on its own stack (tss_ist3).  The canonical
 * GS.base value for the processor is stored just above the bottom of
 * its MC# stack.  For exceptions taken from kernel mode, the current
 * value in the processor's GS.base is saved at entry to C-preserved
 * register %r12, the canonical value for GS.base is then loaded into
 * the processor, and the saved value is restored at exit time.  For
 * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
 * are used for swapping GS.base.
 */

IDTVEC(mchk)
	subq	$TF_RIP,%rsp
	movl	$(T_MCHK),TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	pushfq
	andq	$~(PSL_D | PSL_AC),(%rsp)
	popfq
	xorl	%ebx,%ebx
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jnz	mchk_fromuserspace
	/*
	 * We've interrupted the kernel.  Preserve GS.base in %r12,
	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d.
	 */
	movl	$MSR_GSBASE,%ecx
	rdmsr
	movq	%rax,%r12
	shlq	$32,%rdx
	orq	%rdx,%r12
	/* Retrieve and load the canonical value for GS.base. */
	movq	TF_SIZE(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	mchk_calltrap
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	rdmsr
	movl	%eax,%r14d
	call	handle_ibrs_entry
	jmp	mchk_calltrap
mchk_fromuserspace:
	incl	%ebx
	swapgs
	movq	%cr3,%r13
	movq	PCPU(KCR3),%rax
	cmpq	$~0,%rax
	je	1f
	movq	%rax,%cr3
1:	call	handle_ibrs_entry
/* Note: this label is also used by ddb and gdb: */
mchk_calltrap:
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp,%rdi
	call	mca_intr
	MEXITCOUNT
	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
	jnz	doreti_exit
	/*
	 * Restore speculation control MSR, if preserved.
	 */
	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
	je	1f
	movl	%r14d,%eax
	xorl	%edx,%edx
	movl	$MSR_IA32_SPEC_CTRL,%ecx
	wrmsr
	/*
	 * Put back the preserved MSR_GSBASE value.
	 */
1:	movl	$MSR_GSBASE,%ecx
	movq	%r12,%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	%r13,%cr3
	RESTORE_REGS
	addq	$TF_RIP,%rsp
	jmp	doreti_iret

ENTRY(fork_trampoline)
	movq	%r12,%rdi		/* function */
	movq	%rbx,%rsi		/* arg1 */
	movq	%rsp,%rdx		/* trapframe pointer */
	call	fork_exit
	MEXITCOUNT
	jmp	doreti			/* Handle any ASTs */
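/*
 * New threads begin execution here: cpu_fork()/cpu_copy_thread() build a
 * switch frame whose saved %r12 and %rbx carry the start function and its
 * argument, so the moves above turn them into the first two arguments of
 * fork_exit() (see kern_fork.c), roughly
 *
 *	fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
 *	    struct trapframe *frame);
 *
 * after which the thread returns to usermode through doreti.
 */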

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

#ifdef COMPAT_FREEBSD32
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/ia32/ia32_exception.S>
#endif

	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT
MCOUNT_LABEL(bintr)

#include <amd64/amd64/apic_vector.S>

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <amd64/amd64/atpic_vector.S>
#endif

	.text
MCOUNT_LABEL(eintr)

/*
 * void doreti(struct trapframe)
 *
 * Handle return from interrupts, traps and syscalls.
 */
	.text
	SUPERALIGN_TEXT
	.type	doreti,@function
	.globl	doreti
doreti:
	FAKE_MCOUNT($bintr)		/* init "from" bintr -> doreti */
	/*
	 * Check if ASTs can be handled now.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */
	jz	doreti_exit		/* can't handle ASTs now if not */

doreti_ast:
	/*
	 * Check for ASTs atomically with returning.  Disabling CPU
	 * interrupts provides sufficient locking even in the SMP case,
	 * since we will be informed of any new ASTs by an IPI.
	 */
	cli
	movq	PCPU(CURTHREAD),%rax
	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
	je	doreti_exit
	sti
	movq	%rsp,%rdi	/* pass a pointer to the trapframe */
	call	ast
	jmp	doreti_ast

	/*
	 * doreti_exit:	pop registers, iret.
	 *
	 *	The segment register pop is a special case, since it may
	 *	fault if (for example) a sigreturn specifies bad segment
	 *	registers.  The fault is handled in trap.c.
	 */
doreti_exit:
	MEXITCOUNT
	movq	PCPU(CURPCB),%r8

	/*
	 * Do not reload segment registers for the kernel.
	 * Since we do not reload segment registers with sane
	 * values on kernel entry, descriptors referenced by
	 * segment registers might not be valid.  This is fatal
	 * for user mode, but is not a problem for the kernel.
	 */
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	ld_regs
	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
	jz	ld_regs
	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
	testl	$TF_HASSEGS,TF_FLAGS(%rsp)
	je	set_segs

do_segs:
	/* Restore %fs and fsbase */
	movw	TF_FS(%rsp),%ax
	.globl	ld_fs
ld_fs:
	movw	%ax,%fs
	cmpw	$KUF32SEL,%ax
	jne	1f
	movl	$MSR_FSBASE,%ecx
	movl	PCB_FSBASE(%r8),%eax
	movl	PCB_FSBASE+4(%r8),%edx
	.globl	ld_fsbase
ld_fsbase:
	wrmsr
1:
	/* Restore %gs and gsbase */
	movw	TF_GS(%rsp),%si
	pushfq
	cli
	movl	$MSR_GSBASE,%ecx
	/* Save current kernel %gs base into %r12d:%r13d */
	rdmsr
	movl	%eax,%r12d
	movl	%edx,%r13d
	.globl	ld_gs
ld_gs:
	movw	%si,%gs
	/* Save user %gs base into %r14d:%r15d */
	rdmsr
	movl	%eax,%r14d
	movl	%edx,%r15d
	/* Restore kernel %gs base */
	movl	%r12d,%eax
	movl	%r13d,%edx
	wrmsr
	popfq
	/*
	 * Restore user %gs base, either from PCB if used for TLS, or
	 * from the previously saved msr read.
	 */
	movl	$MSR_KGSBASE,%ecx
	cmpw	$KUG32SEL,%si
	jne	1f
	movl	PCB_GSBASE(%r8),%eax
	movl	PCB_GSBASE+4(%r8),%edx
	jmp	ld_gsbase
1:
	movl	%r14d,%eax
	movl	%r15d,%edx
	.globl	ld_gsbase
ld_gsbase:
	wrmsr	/* May trap if non-canonical, but only for TLS. */
	.globl	ld_es
ld_es:
	movw	TF_ES(%rsp),%es
	.globl	ld_ds
ld_ds:
	movw	TF_DS(%rsp),%ds
ld_regs:
	RESTORE_REGS
	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
	jz	2f			/* keep running with kernel GS.base */
	cli
	call	handle_ibrs_exit_rs
	callq	*mds_handler
	cmpq	$~0,PCPU(UCR3)
	je	1f
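	/*
	 * PTI return path: the hardware frame (%rip, %cs, %rflags, %rsp,
	 * %ss) plus the %rax/%rdx scratch values are copied onto the
	 * per-CPU PTI trampoline stack, because the final pops and iretq
	 * run after the switch to the user %cr3 and must use memory that
	 * is still mapped at that point.
	 */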
	pushq	%rdx
	movq	PCPU(PTI_RSP0),%rdx
	subq	$PTI_SIZE,%rdx
	movq	%rax,PTI_RAX(%rdx)
	popq	%rax
	movq	%rax,PTI_RDX(%rdx)
	movq	TF_RIP(%rsp),%rax
	movq	%rax,PTI_RIP(%rdx)
	movq	TF_CS(%rsp),%rax
	movq	%rax,PTI_CS(%rdx)
	movq	TF_RFLAGS(%rsp),%rax
	movq	%rax,PTI_RFLAGS(%rdx)
	movq	TF_RSP(%rsp),%rax
	movq	%rax,PTI_RSP(%rdx)
	movq	TF_SS(%rsp),%rax
	movq	%rax,PTI_SS(%rdx)
	movq	PCPU(UCR3),%rax
	andq	PCPU(UCR3_LOAD_MASK),%rax
	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
	swapgs
	movq	%rdx,%rsp
	movq	%rax,%cr3
	popq	%rdx
	popq	%rax
	addq	$8,%rsp
	jmp	doreti_iret
1:	swapgs
2:	addq	$TF_RIP,%rsp
	.globl	doreti_iret
doreti_iret:
	iretq

set_segs:
	movw	$KUDSEL,%ax
	movw	%ax,TF_DS(%rsp)
	movw	%ax,TF_ES(%rsp)
	movw	$KUF32SEL,TF_FS(%rsp)
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	do_segs

	/*
	 * doreti_iret_fault.  Alternative return code for
	 * the case where we get a fault in the doreti_exit code
	 * above.  trap() (amd64/amd64/trap.c) catches this specific
	 * case, sends the process a signal and continues in the
	 * corresponding place in the code below.
	 */
	ALIGN_TEXT
	.globl	doreti_iret_fault
doreti_iret_fault:
	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
	movq	%rax,TF_RAX(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	call	handle_ibrs_entry
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	SAVE_SEGS
	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
	movq	%rdi,TF_RDI(%rsp)
	movq	%rsi,TF_RSI(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ERR(%rsp)	/* XXX should be the error code */
	movq	$0,TF_ADDR(%rsp)
	FAKE_MCOUNT(TF_RIP(%rsp))
	jmp	calltrap

	ALIGN_TEXT
	.globl	ds_load_fault
ds_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testb	$SEL_RPL_MASK,TF_CS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_DS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	es_load_fault
es_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUDSEL,TF_ES(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fs_load_fault
fs_load_fault:
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	%rsp,%rdi
	call	trap
	movw	$KUF32SEL,TF_FS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	gs_load_fault
gs_load_fault:
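	/*
	 * The ld_gs sequence in doreti_exit runs between pushfq/popfq with
	 * interrupts disabled; if the %gs load faults, that saved %rflags
	 * word is still on the stack when trap() redirects execution here,
	 * so pop it before handling the fault like the other segment-load
	 * faults.
	 */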
	popfq
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movw	$KUG32SEL,TF_GS(%rsp)
	jmp	doreti

	ALIGN_TEXT
	.globl	fsbase_load_fault
fsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_FSBASE(%r8)
	jmp	doreti

	ALIGN_TEXT
	.globl	gsbase_load_fault
gsbase_load_fault:
	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
	testl	$PSL_I,TF_RFLAGS(%rsp)
	jz	1f
	sti
1:
	movq	%rsp,%rdi
	call	trap
	movq	PCPU(CURTHREAD),%r8
	movq	TD_PCB(%r8),%r8
	movq	$0,PCB_GSBASE(%r8)
	jmp	doreti

#ifdef HWPMC_HOOKS
	ENTRY(end_exceptions)
#endif