/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007 The FreeBSD Foundation
 * Copyright (c) 2008 The DragonFly Project.
 * Copyright (c) 2008 Jordan Gordeev.
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if 0 /* JG */
#include "opt_atpic.h"
#endif

#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <machine/segments.h>

#include "assym.s"

	.text

	.globl	lwkt_switch_return

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * us for the use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we automatically
 * then enable interrupts if the interrupted context had them enabled.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See x86_64/include/frame.h.
 * This includes the current RFLAGS (which holds the interrupt enable
 * state prior to the trap), the code segment register, and the return
 * instruction pointer.  The cpu will also push an 'error' code for
 * certain traps.  We push a dummy error code for those traps where the
 * cpu doesn't, in order to maintain a consistent frame.  We also push a
 * contrived 'trap number'.
 *
 * The cpu does not push the general registers; we must do that, and we
 * must restore them prior to calling 'iret'.  The cpu adjusts the %cs and
 * %ss segment registers, but does not mess with %ds, %es, or %fs.  Thus we
 * must load them with appropriate values for supervisor mode operation.
 */
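
/*
 * For reference: on an inter-privilege-level trap the hardware-pushed
 * portion of the frame is, from higher to lower addresses, %ss, %rsp,
 * %rflags, %cs, %rip, and (for the vectors which supply one) the error
 * code.
 */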

MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)

/*
 * Interrupts must be disabled for all traps, otherwise horrible %gs
 * issues will occur.
 */

/* Regular traps; The cpu does not supply tf_err for these. */
#define	TRAP(a)	 \
	PUSH_FRAME_TFRIP ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_ERR(%rsp) ;			\
	jmp alltraps

/* This group of traps has tf_err already pushed by the cpu */
#define	TRAP_ERR(a)				\
	PUSH_FRAME_TFERR ;			\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	jmp alltraps
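
/*
 * (Of the classic exception vectors only #DF, #TS, #NP, #SS, #GP, #PF
 *  and #AC supply a hardware error code; the others use TRAP() with a
 *  dummy tf_err.)
 */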

/*
 * Due to a historical artifact, it is possible for a #DB exception
 * to occur in certain bad places that would normally be protected by
 * the interrupt gate's interrupt disablement.
 *
 * Because this can occur in the system call entry code, we also
 * run #DB on an ist2 stack to force the cpu to load a new %rsp, otherwise
 * it might push the cpu exception frame onto the user stack.  To make things
 * easier we just point ist2 at our trampoline area.
 */
IDTVEC(dbg)
#ifdef DIRECT_DISALLOW_SS_CPUBUG
	/*
	 * Directly disallow #DB faults which can occur at critical points
	 * in the code due to a historical artifact of how the cpu operates.
	 * %gs state might not match RPL.  Test the %rip and iretq immediately
	 * (valid %gs and %cr3 state not needed).  If we don't need kernel
	 * reporting we can enable this and it's a bit safer from unintended
	 * consequences.
	 *
	 * If this is not enabled the kernel still catches the problem.  It
	 * will report the problem and continue properly.
	 */
	cmpq	$Xbpt,0(%rsp)
	je	200f
	cmpq	$Xfast_syscall,0(%rsp)
	je	200f
#endif

	/*
	 * Ok, regardless of the RPL mask in the trap frame, we took
	 * the trap on a separate stack via ist2.  This means we
	 * must copy it appropriately.
	 *
	 * If coming from userland we can skip directly to the normal
	 * TRAP code because it will handle the fact that we are on an
	 * alternative stack (dbgstack, set by ist2), even though it isn't
	 * the trampoline stack.  The frame will be moved to the correct
	 * kernel stack.
	 */
	testb   $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)
	jnz	210f				/* jnz from userland */

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore after return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
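	/*
	 * (rdmsr/wrmsr transfer the 64-bit MSR value through %edx:%eax,
	 *  hence the shift/merge above when saving GS.base and the
	 *  shift/split when loading the kernel value.)
	 */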
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * We are coming from the kernel.
	 *
	 * We are on the IST2 stack and, in fact, we have to *STAY* on this
	 * stack, so we no longer try to shift our frame to the kernel %rsp
	 * in the trap frame, since that %rsp might actually be a user %rsp
	 * in the mov mem,%ss + syscall DBG trap case.
	 *
	 * Run the normal trap.  Because TF_CS is at a kernel RPL, the
	 * normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_TRCTRAP,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
#ifdef DIRECT_DISALLOW_SS_CPUBUG
200:
#endif
	iretq

	/*
	 * From userland (normal trap path)
	 */
210:
	TRAP(T_TRCTRAP)
	/* NOT REACHED */

IDTVEC(bpt)
	TRAP(T_BPTFLT)
IDTVEC(div)
	TRAP(T_DIVIDE)
IDTVEC(ofl)
	TRAP(T_OFLOW)
IDTVEC(bnd)
	TRAP(T_BOUND)
IDTVEC(ill)
	TRAP(T_PRIVINFLT)
IDTVEC(dna)
	TRAP(T_DNA)
IDTVEC(fpusegm)
	TRAP(T_FPOPFLT)
IDTVEC(mchk)
	TRAP(T_MCHK)
IDTVEC(rsvd)
	TRAP(T_RESERVED)
IDTVEC(fpu)
	TRAP(T_ARITHTRAP)
IDTVEC(xmm)
	TRAP(T_XMMFLT)

IDTVEC(tss)
	TRAP_ERR(T_TSSFLT)
IDTVEC(missing)
	TRAP_ERR(T_SEGNPFLT)
IDTVEC(stk)
	TRAP_ERR(T_STKFLT)
IDTVEC(align)
	TRAP_ERR(T_ALIGNFLT)

	/*
	 * alltraps entry point.  Use swapgs if this is the first time in the
	 * kernel from userland.  Reenable interrupts if they were enabled
	 * before the trap.
	 *
	 * WARNING!  %gs not available until after our swapgs code
	 */
	SUPERALIGN_TEXT
	.globl	alltraps
	.type	alltraps,@function
alltraps:

#if 0
alltraps_pushregs:
	movq	%rdi,TF_RDI(%rsp)
alltraps_pushregs_no_rdi:
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
#endif
	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	.globl	calltrap
	.type	calltrap,@function
calltrap:
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT
	jmp	doreti			/* Handle any pending ASTs */

IDTVEC(dblfault)
	PUSH_FRAME_TFERR
	movq	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)

	cld
	movq	%rsp, %rdi
	call	dblfault_handler
2:	hlt
	jmp	2b

	/*
	 * We need to save the contents of %cr2 before PUSH_FRAME* messes
	 * with %cr3.
	 */
IDTVEC(page)
	PUSH_FRAME_TFERR_SAVECR2
	movq	$T_PAGEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter as a kernel exception with the
	 * wrong gs and isolation state.  We have to act as though we came
	 * in from userland.
	 */
IDTVEC(prot)
	pushq	%r10
	leaq	doreti_iret(%rip),%r10
	cmpq	%r10,TF_RIP-TF_ERR+8(%rsp)		/* +8 due to pushq */
	jne	prot_normal
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp)      /* +8 due to pushq */
	jnz	prot_normal

	/*
	 * Special fault during iretq
	 */
	popq	%r10
	swapgs
	KMMUENTER_TFERR
	subq	$TF_ERR,%rsp
	PUSH_FRAME_REGS
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

prot_normal:
	popq	%r10
	PUSH_FRAME_TFERR
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
 */
IDTVEC(fast_syscall)
	swapgs					/* get kernel %gs */
	movq	%rsp,PCPU(trampoline)+TR_CR2	/* save user %rsp */
	movq	PCPU(common_tss)+TSS_RSP0,%rsp

	/*
	 * NOTE: KMMUENTER_SYSCALL does not actually use the stack, but we
	 *	 adjust the stack pointer anyway for correctness in case it
	 *	 does in the future.
	 */
	subq	$TR_PCB_RSP,%rsp
	KMMUENTER_SYSCALL
	movq	PCPU(trampoline)+TR_PCB_RSP,%rsp

	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
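	/*
	 * (The syscall instruction left the return %rip in %rcx and the
	 *  original %rflags in %r11; the syscall ABI passes the fourth
	 *  argument in %r10 because %rcx is clobbered, which is why %r10
	 *  is stored into TF_RCX below.)
	 */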
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(trampoline)+TR_CR2,%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	orl	$RQF_QUICKRET,PCPU(reqflags)
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	$T_FAST_SYSCALL,TF_TRAPNO(%rsp)	/* for the vkernel */
	movq	$0,TF_XFLAGS(%rsp)	/* note: used in signal frame */
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */

	xorq	%rax,%rax		/* SECURITY CLEAR REGS */
	movq	%rax,%rbx
	movq	%rax,%rcx
	movq	%rax,%rdx
	movq	%rax,%rsi
	movq	%rax,%rdi
	movq	%rax,%rbp
	movq	%rax,%r8
	movq	%rax,%r9
	movq	%rax,%r10
	movq	%rax,%r11
	movq	%rax,%r12
	movq	%rax,%r13
	movq	%rax,%r14
	movq	%rax,%r15

	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp, %rdi
	call	syscall2

	/*
	 * Fast return from system call
	 */
	cli
	testl	$RQF_IPIQ|RQF_TIMER|RQF_INTPEND|RQF_AST_MASK,PCPU(reqflags)
	jnz	1f
	testl	$RQF_QUICKRET,PCPU(reqflags)
	jz	1f
	MEXITCOUNT

	movq	TF_RBX(%rsp),%rbx	/* SECURITY RESTORE */
	movq	TF_RCX(%rsp),%rcx
	movq	TF_RBP(%rsp),%rbp
	movq	TF_R8(%rsp),%r8
	movq	TF_R9(%rsp),%r9
	xorq	%r10,%r10		/* (security - clear scratch) */
	movq	%r10,%r11
	movq	TF_R12(%rsp),%r12
	movq	TF_R13(%rsp),%r13
	movq	TF_R14(%rsp),%r14
	movq	TF_R15(%rsp),%r15

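	/*
	 * (sysretq returns to the %rip in %rcx with %rflags taken from
	 *  %r11, so those two registers are loaded from the trap frame
	 *  rather than restored to their entry values.)
	 */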
	movq	TF_RDI(%rsp),%rdi	/* NORMAL RESTORE */
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RAX(%rsp),%rax
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RSP(%rsp),%rsp
	KMMUEXIT_SYSCALL
	swapgs
	sysretq

	/*
	 * Normal slow / full iret
	 */
1:
	MEXITCOUNT
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * NMI handling is special.
 *
 * First, an NMI is taken on its own pcpu stack.  RFLAGS.IF, %gs, and %cr3
 * will be inconsistent when interrupting supervisor mode.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We therefore need to
 * execute the NMI handler with interrupts disabled to prevent a
 * nested interrupt from executing an 'iretq' instruction and
 * inadvertently taking the processor out of NMI mode.
 */
IDTVEC(nmi)
	/*
	 * We don't need to special-case entry from userland, %gs will
	 * be consistent with expectations.
	 */
	testb   $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)	/* from userland? */
	jnz	200f

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore on return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
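	/*
	 * (As in the #DB path above, rdmsr/wrmsr move the 64-bit GS.base
	 *  value through %edx:%eax.)
	 */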
#if 0
	movq	TR_PCB_CR3(%rsp),%rax		/* retrieve kernel %cr3 */
#endif
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * Ok, run the normal trap.  Because TF_CS is at a kernel RPL,
	 * the normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
	iretq

	/*
	 * From userland (normal trap path)
	 */
200:
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	POP_FRAME(jmp doreti_iret)

/*
 * This function is what cpu_heavy_restore jumps to after a new process
 * is created.  The LWKT subsystem switches while holding a critical
 * section and we maintain that abstraction here (e.g. because
 * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of
 * it before calling the initial function (typically fork_return()) and/or
 * returning to user mode.
 *
 * The MP lock is not held at any point but the critcount is bumped
 * on entry to prevent interruption of the trampoline at a bad point.
 *
 * This is effectively what td->td_switch() returns to.  It 'returns' the
 * old thread in %rax and since this is not returning to a td->td_switch()
 * call from lwkt_switch() we must handle the cleanup for the old thread
 * by calling lwkt_switch_return().
 *
 * fork_trampoline(%rax:otd, %rbx:func, %r12:arg)
 */
ENTRY(fork_trampoline)
	movq	%rax,%rdi
	call	lwkt_switch_return
	movq	PCPU(curthread),%rax
	decl	TD_CRITCOUNT(%rax)

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 *
	 * initproc has its own fork handler, start_init(), which DOES
	 * return.
	 *
	 * %rbx - chaining function (typically fork_return)
	 * %r12 -> %rdi (argument)
	 * frame-> %rsi (trap frame)
	 *
	 *   void (func:rbx)(arg:rdi, trapframe:rsi)
	 */
	movq	%rsp, %rsi		/* pass trapframe by reference */
	movq	%r12, %rdi		/* arg1 */
	call	*%rbx			/* function */

	/* cut from syscall */

	sti
	call	splz

	/*
	 * Return via doreti to handle ASTs.
	 *
	 * trapframe is at the top of the stack.
	 */
	MEXITCOUNT
	jmp	doreti

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

	.data
	.p2align 4

	.text
	SUPERALIGN_TEXT
MCOUNT_LABEL(bintr)

#if 0 /* JG */
#include <x86_64/x86_64/apic_vector.S>
#endif

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <x86_64/isa/atpic_vector.S>
#endif

	.text
MCOUNT_LABEL(eintr)
714