/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007 The FreeBSD Foundation
 * Copyright (c) 2008 The DragonFly Project.
 * Copyright (c) 2008 Jordan Gordeev.
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if 0 /* JG */
#include "opt_atpic.h"
#endif

#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <machine/segments.h>

#include "assym.s"

	.text

	.globl	lwkt_switch_return

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * our use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we then automatically
 * re-enable interrupts if the interrupted context had them enabled.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See x86_64/include/frame.h.
 * The cpu pushes the current RFLAGS (the status register, which records
 * the interrupt-disable state prior to the trap), the code segment register,
 * and the return instruction pointer.  The cpu
 * will also push an 'error' code for certain traps.  We push a dummy
 * error code for those traps where the cpu doesn't, in order to maintain
 * a consistent frame.  We also push a contrived 'trap number'.
 *
 * The cpu does not push the general registers; we must do that, and we
 * must restore them prior to calling 'iret'.  The cpu adjusts the %cs and
 * %ss segment registers, but does not mess with %ds, %es, or %fs.  Thus we
 * must load them with appropriate values for supervisor mode operation.
 */

MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)

/*
 * Interrupts must be disabled for all traps, otherwise horrible %gs
 * issues will occur.
 */

/* Regular traps; the cpu does not supply tf_err for these. */
#define	TRAP(a)	 \
	PUSH_FRAME_TFRIP ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_ERR(%rsp) ;			\
	jmp alltraps

/* This group of traps has tf_err already pushed by the cpu */
#define	TRAP_ERR(a)				\
	PUSH_FRAME_TFERR ;			\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	jmp alltraps

/*
 * Due to a historical artifact, it is possible for a #DB exception
 * to occur in certain bad places that would normally be protected by
 * the interrupt gate's interrupt disablement.
 *
 * Because this can occur in the system call entry code, we also
 * run #DB on an ist2 stack to force the cpu to load a new %rsp, otherwise
 * it might push the cpu exception frame onto the user stack.  To make things
 * easier we just point ist2 at our trampoline area.
 */
IDTVEC(dbg)
#ifdef DIRECT_DISALLOW_SS_CPUBUG
	/*
	 * Directly disallow #DB faults which can occur at critical points
	 * in the code due to a historical artifact of how the cpu operates.
	 * %gs state might not match RPL.  Test the %rip and iretq immediately
	 * (valid %gs and %cr3 state not needed).  If we don't need kernel
	 * reporting we can enable this; it's a bit safer against unintended
	 * consequences.
	 *
	 * If this is not enabled the kernel still catches the problem.  It
	 * will report the problem and continue properly.
	 */
	cmpq	$Xbpt,0(%rsp)
	je	200f
	cmpq	$Xfast_syscall,0(%rsp)
	je	200f
#endif

	/*
	 * Ok, regardless of the RPL mask in the trap frame, we took
	 * the trap on a separate stack via ist2.  This means we
	 * must copy it appropriately.
	 *
	 * If coming from userland we can skip directly to the normal
	 * TRAP code because it will handle the fact that we are on an
	 * alternative stack (dbgstack, set by ist2), even though it isn't
	 * the trampoline stack.  The frame will be moved to the correct
	 * kernel stack.
	 */
	testb   $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)
	jnz	210f				/* jnz from userland */

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore after return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
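	/*
	 * (note) ist2 points at the trampoline area (see above), so backing
	 * %rsp down by TR_RIP lines the TR_* trampframe offsets up with the
	 * hardware frame the cpu just pushed, giving us scratch save slots
	 * and access to the pre-loaded TR_PCB_* fields.
	 */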
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
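	/* rdmsr/wrmsr move the 64-bit MSR value split across %edx:%eax */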
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * We are coming from the kernel.
	 *
	 * We are on the IST2 stack and, in fact, we have to *STAY* on this
	 * stack, so we no longer try to shift our frame to the kernel %rsp
	 * in the trap frame, since this %rsp might actually be a user %rsp
	 * in the mov mem,%ss + syscall DBG trap case.
	 *
	 * Run the normal trap.  Because TF_CS is at a kernel RPL, the
	 * normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_TRCTRAP,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
#ifdef DIRECT_DISALLOW_SS_CPUBUG
200:
#endif
	iretq

	/*
	 * From userland (normal trap path)
	 */
210:
	TRAP(T_TRCTRAP)
	/* NOT REACHED */

IDTVEC(bpt)
	TRAP(T_BPTFLT)
IDTVEC(div)
	TRAP(T_DIVIDE)
IDTVEC(ofl)
	TRAP(T_OFLOW)
IDTVEC(bnd)
	TRAP(T_BOUND)
IDTVEC(ill)
	TRAP(T_PRIVINFLT)
IDTVEC(dna)
	TRAP(T_DNA)
IDTVEC(fpusegm)
	TRAP(T_FPOPFLT)
IDTVEC(mchk)
	TRAP(T_MCHK)
IDTVEC(fpu)
	TRAP(T_ARITHTRAP)
IDTVEC(xmm)
	TRAP(T_XMMFLT)

IDTVEC(tss)
	TRAP_ERR(T_TSSFLT)
IDTVEC(missing)
	TRAP_ERR(T_SEGNPFLT)
IDTVEC(stk)
	TRAP_ERR(T_STKFLT)
IDTVEC(align)
	TRAP_ERR(T_ALIGNFLT)

	/*
	 * alltraps entry point.  Use swapgs if this is the first time in the
	 * kernel from userland.  Reenable interrupts if they were enabled
	 * before the trap.
	 *
	 * WARNING!  %gs not available until after our swapgs code
	 */
	SUPERALIGN_TEXT
	.globl	alltraps
	.type	alltraps,@function
alltraps:

#if 0
alltraps_pushregs:
	movq	%rdi,TF_RDI(%rsp)
alltraps_pushregs_no_rdi:
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
#endif
	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	.globl	calltrap
	.type	calltrap,@function
calltrap:
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT
	jmp	doreti			/* Handle any pending ASTs */

IDTVEC(dblfault)
	PUSH_FRAME_TFERR
	movq	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)

	cld
	movq	%rsp, %rdi
	call	dblfault_handler
2:	hlt
	jmp	2b

	/*
	 * We need to save the contents of %cr2 before PUSH_FRAME* messes
	 * with %cr3.
	 */
IDTVEC(page)
	PUSH_FRAME_TFERR_SAVECR2
	movq	$T_PAGEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter as a kernel exception with the
	 * wrong gs and isolation state.  We have to act as though we came
	 * in from userland.
	 */
IDTVEC(prot)
	pushq	%r10
	leaq	doreti_iret(%rip),%r10
	cmpq	%r10,TF_RIP-TF_ERR+8(%rsp)		/* +8 due to pushq */
	jne	prot_normal
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp)      /* +8 due to pushq */
	jnz	prot_normal

	/*
	 * Special fault during iretq
	 */
	popq	%r10
	swapgs
	KMMUENTER_TFERR
	subq	$TF_ERR,%rsp
	PUSH_FRAME_REGS
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

prot_normal:
	popq	%r10
	PUSH_FRAME_TFERR
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
 */
IDTVEC(fast_syscall)
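	/*
	 * (note) the syscall instruction has left the user return %rip in
	 * %rcx and the user %rflags in %r11; both get saved into the
	 * emulated trapframe built below.
	 */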
	swapgs					/* get kernel %gs */
	movq	%rsp,PCPU(trampoline)+TR_CR2	/* save user %rsp */
	movq	PCPU(common_tss)+TSS_RSP0,%rsp

	/*
	 * NOTE: KMMUENTER_SYSCALL does not actually use the stack, but we
	 *	 adjust the stack pointer for correctness in case it does
	 *	 in the future.
	 */
	subq	$TR_PCB_RSP,%rsp
	KMMUENTER_SYSCALL
	movq	PCPU(trampoline)+TR_PCB_RSP,%rsp

	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(trampoline)+TR_CR2,%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	orl	$RQF_QUICKRET,PCPU(reqflags)
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	$T_FAST_SYSCALL,TF_TRAPNO(%rsp)	/* for the vkernel */
	movq	$0,TF_XFLAGS(%rsp)	/* note: used in signal frame */
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */

	xorq	%rax,%rax		/* SECURITY CLEAR REGS */
	movq	%rax,%rbx
	movq	%rax,%rcx
	movq	%rax,%rdx
	movq	%rax,%rsi
	movq	%rax,%rdi
	movq	%rax,%rbp
	movq	%rax,%r8
	movq	%rax,%r9
	movq	%rax,%r10
	movq	%rax,%r11
	movq	%rax,%r12
	movq	%rax,%r13
	movq	%rax,%r14
	movq	%rax,%r15

	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp, %rdi
	call	syscall2

	/*
	 * Fast return from system call
	 */
	cli
	testl	$RQF_IPIQ|RQF_TIMER|RQF_INTPEND|RQF_AST_MASK,PCPU(reqflags)
	jnz	1f
	testl	$RQF_QUICKRET,PCPU(reqflags)
	jz	1f
	MEXITCOUNT

	movq	TF_RBX(%rsp),%rbx	/* SECURITY RESTORE */
	movq	TF_RCX(%rsp),%rcx
	movq	TF_RBP(%rsp),%rbp
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	xorq	%r10,%r10		/* (security - clear scratch) */
	movq	%r10,%r11
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15

	movq	TF_RDI(%rsp),%rdi	/* NORMAL RESTORE */
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RAX(%rsp),%rax
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RIP(%rsp),%rcx
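	/* (note) sysretq returns to %rcx with %rflags reloaded from %r11 */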
	movq	TF_RSP(%rsp),%rsp
	KMMUEXIT_SYSCALL
	swapgs
	sysretq

	/*
	 * Normal slow / full iret
	 */
1:
	MEXITCOUNT
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * NMI handling is special.
 *
 * First, an NMI is taken on its own pcpu stack.  RFLAGS.IF, %gs, and %cr3
 * may be inconsistent when the NMI interrupts supervisor mode.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We therefore need to
 * execute the NMI handler with interrupts disabled to prevent a
 * nested interrupt from executing an 'iretq' instruction and
 * inadvertently taking the processor out of NMI mode.
 */
IDTVEC(nmi)
	/*
	 * We don't need to special-case entry from userland; %gs will
	 * be consistent with expectations.
	 */
	testb   $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)	/* from userland? */
	jnz	200f

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore on return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
#if 0
	movq	TR_PCB_CR3(%rsp),%rax		/* retrieve kernel %cr3 */
#endif
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * Ok, run the normal trap.  Because TF_CS is at a kernel RPL,
	 * the normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
	iretq

	/*
	 * From userland (normal trap path)
	 */
200:
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	POP_FRAME(jmp doreti_iret)

/*
 * Reserved (unconfigured) traps rsvd00 - rsvdff
 */
.macro reservetrap a b
IDTVEC(rsvd\a\b)
	TRAP(T_RESERVED + 0x\a\b)
.endm
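/*
 * For example, "reservetrap 0 1" expands to IDTVEC(rsvd01) invoking
 * TRAP(T_RESERVED + 0x01); the reservegrp invocations below stamp out
 * all 256 reserved vectors rsvd00-rsvdff.
 */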

.macro reservegrp a
reservetrap \a 0
reservetrap \a 1
reservetrap \a 2
reservetrap \a 3
reservetrap \a 4
reservetrap \a 5
reservetrap \a 6
reservetrap \a 7
reservetrap \a 8
reservetrap \a 9
reservetrap \a a
reservetrap \a b
reservetrap \a c
reservetrap \a d
reservetrap \a e
reservetrap \a f
.endm

reservegrp 0
reservegrp 1
reservegrp 2
reservegrp 3
reservegrp 4
reservegrp 5
reservegrp 6
reservegrp 7
reservegrp 8
reservegrp 9
reservegrp a
reservegrp b
reservegrp c
reservegrp d
reservegrp e
reservegrp f

/*
 * This function is what cpu_heavy_restore jumps to after a new process
 * is created.  The LWKT subsystem switches while holding a critical
 * section and we maintain that abstraction here (e.g. because
 * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of
 * it before calling the initial function (typically fork_return()) and/or
 * returning to user mode.
 *
 * The MP lock is not held at any point but the critcount is bumped
 * on entry to prevent interruption of the trampoline at a bad point.
 *
 * This is effectively what td->td_switch() returns to.  It 'returns' the
 * old thread in %rax and, since this is not returning to a td->td_switch()
 * call from lwkt_switch(), we must handle the cleanup for the old thread
 * by calling lwkt_switch_return().
 *
 * fork_trampoline(%rax:otd, %rbx:func, %r12:arg)
 */
ENTRY(fork_trampoline)
	movq	%rax,%rdi
	call	lwkt_switch_return
	movq	PCPU(curthread),%rax
	decl	TD_CRITCOUNT(%rax)

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 *
	 * initproc has its own fork handler, start_init(), which DOES
	 * return.
	 *
	 * %rbx - chaining function (typically fork_return)
	 * %r12 -> %rdi (argument)
	 * frame-> %rsi (trap frame)
	 *
	 *   void (func:rbx)(arg:rdi, trapframe:rsi)
	 */
	movq	%rsp, %rsi		/* pass trapframe by reference */
	movq	%r12, %rdi		/* arg1 */
	call	*%rbx			/* function */

	/* cut from syscall */

	sti
	call	splz

	/*
	 * Return via doreti to handle ASTs.
	 *
	 * trapframe is at the top of the stack.
	 */
	MEXITCOUNT
	jmp	doreti

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

	.data
	.p2align 4

	.text
	SUPERALIGN_TEXT
MCOUNT_LABEL(bintr)

#if 0 /* JG */
#include <x86_64/x86_64/apic_vector.S>
#endif

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <x86_64/isa/atpic_vector.S>
#endif

	.text
MCOUNT_LABEL(eintr)
756