/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * Copyright (c) 2007 The FreeBSD Foundation
 * Copyright (c) 2008 The DragonFly Project.
 * Copyright (c) 2008 Jordan Gordeev.
 * All rights reserved.
 *
 * Portions of this software were developed by A. Joseph Koshy under
 * sponsorship from the FreeBSD Foundation and Google, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if 0 /* JG */
#include "opt_atpic.h"
#include "opt_compat.h"
#endif

#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <machine/segments.h>

#include "assym.s"

	.text

	.globl	lwkt_switch_return

/*****************************************************************************/
/* Trap handling                                                             */
/*****************************************************************************/
/*
 * Trap and fault vector routines.
 *
 * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
 * state on the stack but also disables interrupts.  This is important for
 * us for the use of the swapgs instruction.  We cannot be interrupted
 * until the GS.base value is correct.  For most traps, we automatically
 * then enable interrupts if the interrupted context had them enabled.
 *
 * The cpu will push a certain amount of state onto the kernel stack for
 * the current process.  See x86_64/include/frame.h.  The current RFLAGS
 * (the status register, which includes the interrupt disable state prior
 * to the trap), the code segment register, and the return instruction
 * pointer are pushed by the cpu.  The cpu will also push an 'error' code
 * for certain traps.  We push a dummy error code for those traps where
 * the cpu doesn't, in order to maintain a consistent frame.  We also push
 * a contrived 'trap number'.
 *
 * The cpu does not push the general registers, we must do that, and we
 * must restore them prior to calling 'iret'.  The cpu adjusts the %cs and
 * %ss segment registers, but does not mess with %ds, %es, or %fs.  Thus we
 * must load them with appropriate values for supervisor mode operation.
 */
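
/*
 * Rough sketch of the resulting frame, from low to high addresses (the
 * authoritative layout is struct trapframe in x86_64/include/frame.h):
 *
 *	tf_rdi ... tf_r15	general registers, saved by software
 *	tf_xflags, tf_trapno,
 *	tf_addr			contrived by the entry code
 *	tf_err			pushed by the cpu, or a dummy
 *	tf_rip, tf_cs, tf_rflags,
 *	tf_rsp, tf_ss		always pushed by the cpu
 *
 * %rsp points at the base of the frame when 'trap' is called.
 */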

MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)

/*
 * Interrupts must be disabled for all traps, otherwise horrible %gs
 * issues will occur.
 */

/* Regular traps; the cpu does not supply tf_err for these. */
#define	TRAP(a)	 \
	PUSH_FRAME_TFRIP ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_ERR(%rsp) ;			\
	jmp alltraps
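
/*
 * As an illustration, the IDTVEC(bpt) entry below uses TRAP(T_BPTFLT),
 * which expands to:
 *
 *	PUSH_FRAME_TFRIP
 *	movq	$0,TF_XFLAGS(%rsp)
 *	movq	$T_BPTFLT,TF_TRAPNO(%rsp)
 *	movq	$0,TF_ADDR(%rsp)
 *	movq	$0,TF_ERR(%rsp)
 *	jmp	alltraps
 */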

/* This group of traps has tf_err already pushed by the cpu */
#define	TRAP_ERR(a)				\
	PUSH_FRAME_TFERR ;			\
	movq $(a),TF_TRAPNO(%rsp) ;		\
	movq $0,TF_ADDR(%rsp) ;			\
	movq $0,TF_XFLAGS(%rsp) ;		\
	jmp alltraps

/*
 * Due to a historical artifact, it is possible for a #DB exception
 * to occur in certain bad places that would normally be protected by
 * the interrupt gate's interrupt disablement.
 *
 * Because this can possibly occur in the system call entry code, we also
 * run #DB on an ist2 stack to force the cpu to load a new %rsp, otherwise
 * it might push the cpu exception frame onto the user stack.  To make things
 * easier we just point ist2 at our trampoline area.
 */
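
/*
 * (ist2 is an Interrupt Stack Table slot in the TSS; when an IDT entry
 * specifies an IST index the cpu unconditionally loads %rsp from that
 * slot on delivery, regardless of the privilege level it came from.)
 */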
IDTVEC(dbg)
#ifdef DIRECT_DISALLOW_SS_CPUBUG
	/*
	 * Directly disallow #DB faults which can occur at critical points
	 * in the code due to a historical artifact of how the cpu operates.
	 * %gs state might not match RPL.  Test the %rip and iretq immediately
	 * (valid %gs and %cr3 state not needed).  If we don't need kernel
	 * reporting we can enable this and it's a bit safer from unintended
	 * consequences.
	 *
	 * If this is not enabled the kernel still catches the problem.  It
	 * will report the problem and continue properly.
	 */
	cmpq	$Xbpt,0(%rsp)
	je	200f
	cmpq	$Xfast_syscall,0(%rsp)
	je	200f
#endif

	/*
	 * Ok, regardless of the RPL mask in the trap frame, we took
	 * the trap on a separate stack via ist2.  This means we
	 * must copy it appropriately.
	 *
	 * If coming from userland we can skip directly to the normal
	 * TRAP code because it will handle the fact that we are on an
	 * alternative stack (dbgstack, set by ist2, even though it isn't
	 * the trampoline stack).  The frame will be moved to the correct
	 * kernel stack.
	 */
	testb	$SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)
	jnz	210f				/* jnz from userland */

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore after return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
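	/* rdmsr returns the 64-bit MSR value split across %edx:%eax */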
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * We are coming from the kernel.
	 *
	 * We are on the IST2 stack and, in fact, we have to *STAY* on this
	 * stack, so we no longer try to shift our frame to the kernel %rsp
	 * in the trap frame, since that %rsp might actually be a user %rsp
	 * in the mov mem,%ss + syscall DBG trap case.
	 *
	 * Run the normal trap.  Because TF_CS is at a kernel RPL, the
	 * normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_TRCTRAP,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
#ifdef DIRECT_DISALLOW_SS_CPUBUG
200:
#endif
	iretq

	/*
	 * From userland (normal trap path)
	 */
210:
	TRAP(T_TRCTRAP)
	/* NOT REACHED */

IDTVEC(bpt)
	TRAP(T_BPTFLT)
IDTVEC(div)
	TRAP(T_DIVIDE)
IDTVEC(ofl)
	TRAP(T_OFLOW)
IDTVEC(bnd)
	TRAP(T_BOUND)
IDTVEC(ill)
	TRAP(T_PRIVINFLT)
IDTVEC(dna)
	TRAP(T_DNA)
IDTVEC(fpusegm)
	TRAP(T_FPOPFLT)
IDTVEC(mchk)
	TRAP(T_MCHK)
IDTVEC(rsvd)
	TRAP(T_RESERVED)
IDTVEC(fpu)
	TRAP(T_ARITHTRAP)
IDTVEC(xmm)
	TRAP(T_XMMFLT)

IDTVEC(tss)
	TRAP_ERR(T_TSSFLT)
IDTVEC(missing)
	TRAP_ERR(T_SEGNPFLT)
IDTVEC(stk)
	TRAP_ERR(T_STKFLT)
IDTVEC(align)
	TRAP_ERR(T_ALIGNFLT)

	/*
	 * alltraps entry point.  Use swapgs if this is the first time in the
	 * kernel from userland.  Reenable interrupts if they were enabled
	 * before the trap.
	 *
	 * WARNING!  %gs not available until after our swapgs code
	 */
	SUPERALIGN_TEXT
	.globl	alltraps
	.type	alltraps,@function
alltraps:

#if 0
alltraps_pushregs:
	movq	%rdi,TF_RDI(%rsp)
alltraps_pushregs_no_rdi:
	movq	%rsi,TF_RSI(%rsp)
	movq	%rdx,TF_RDX(%rsp)
	movq	%rcx,TF_RCX(%rsp)
	movq	%r8,TF_R8(%rsp)
	movq	%r9,TF_R9(%rsp)
	movq	%rax,TF_RAX(%rsp)
	movq	%rbx,TF_RBX(%rsp)
	movq	%rbp,TF_RBP(%rsp)
	movq	%r10,TF_R10(%rsp)
	movq	%r11,TF_R11(%rsp)
	movq	%r12,TF_R12(%rsp)
	movq	%r13,TF_R13(%rsp)
	movq	%r14,TF_R14(%rsp)
	movq	%r15,TF_R15(%rsp)
#endif
	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	.globl	calltrap
	.type	calltrap,@function
calltrap:
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT
	jmp	doreti			/* Handle any pending ASTs */

IDTVEC(dblfault)
	PUSH_FRAME_TFERR
	movq	$T_DOUBLEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)

	cld
	movq	%rsp, %rdi
	call	dblfault_handler
2:	hlt
	jmp	2b

	/*
	 * We need to save the contents of %cr2 before PUSH_FRAME* messes
	 * with %cr3.
	 */
IDTVEC(page)
	PUSH_FRAME_TFERR_SAVECR2
	movq	$T_PAGEFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

	/*
	 * We have to special-case this one.  If we get a trap in doreti() at
	 * the iretq stage, we'll reenter as a kernel exception with the
	 * wrong gs and isolation state.  We have to act as though we came
	 * in from userland.
	 */
IDTVEC(prot)
	pushq	%r10
	leaq	doreti_iret(%rip),%r10
	cmpq	%r10,TF_RIP-TF_ERR+8(%rsp)	/* +8 due to pushq */
	jne	prot_normal
	testb	$SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp)	/* +8 due to pushq */
	jnz	prot_normal

	/*
	 * Special fault during iretq
	 */
	popq	%r10
	swapgs
	KMMUENTER_TFERR
	subq	$TF_ERR,%rsp
	PUSH_FRAME_REGS
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

prot_normal:
	popq	%r10
	PUSH_FRAME_TFERR
	movq	$T_PROTFLT,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_XFLAGS(%rsp)
	jmp	alltraps

/*
 * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
 * and the new privilege level.  We are still running on the old user stack
 * pointer.  We have to juggle a few things around to find our stack etc.
 * swapgs gives us access to our PCPU space only.
 *
 * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
 */
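
/*
 * (The 'syscall' instruction loads %rip from MSR_LSTAR, leaves the
 * return %rip in %rcx and the original RFLAGS in %r11, and does not
 * switch stacks, hence the %rsp juggling below.)
 */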
IDTVEC(fast_syscall)
	swapgs					/* get kernel %gs */
	movq	%rsp,PCPU(trampoline)+TR_CR2	/* save user %rsp */
	movq	PCPU(common_tss)+TSS_RSP0,%rsp

	/*
	 * NOTE: KMMUENTER_SYSCALL does not actually use the stack, but
	 *	 we adjust the stack pointer for correctness in case it
	 *	 does in the future.
	 */
	subq	$TR_PCB_RSP,%rsp
	KMMUENTER_SYSCALL
	movq	PCPU(trampoline)+TR_PCB_RSP,%rsp

	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
	subq	$TF_SIZE,%rsp
	/* defer TF_RSP till we have a spare register */
	movq	%r11,TF_RFLAGS(%rsp)
	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
	movq	PCPU(trampoline)+TR_CR2,%r11	/* %r11 already saved */
	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
	orl	$RQF_QUICKRET,PCPU(reqflags)
	movq	$KUDSEL,TF_SS(%rsp)
	movq	$KUCSEL,TF_CS(%rsp)
	movq	$2,TF_ERR(%rsp)
	movq	$T_FAST_SYSCALL,TF_TRAPNO(%rsp)	/* for the vkernel */
	movq	$0,TF_XFLAGS(%rsp)	/* note: used in signal frame */
	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
	movq	%r8,TF_R8(%rsp)		/* arg 5 */
	movq	%r9,TF_R9(%rsp)		/* arg 6 */
	movq	%rax,TF_RAX(%rsp)	/* syscall number */
	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
	movq	%r12,TF_R12(%rsp)	/* C preserved */
	movq	%r13,TF_R13(%rsp)	/* C preserved */
	movq	%r14,TF_R14(%rsp)	/* C preserved */
	movq	%r15,TF_R15(%rsp)	/* C preserved */
	sti
	FAKE_MCOUNT(TF_RIP(%rsp))
	movq	%rsp, %rdi
	call	syscall2

	/*
	 * Fast return from system call
	 */
	cli
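	/*
	 * With interrupts disabled, reqflags cannot change between the
	 * tests below and the sysretq.
	 */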
	testl	$RQF_IPIQ|RQF_TIMER|RQF_INTPEND|RQF_AST_MASK,PCPU(reqflags)
	jnz	1f
	testl	$RQF_QUICKRET,PCPU(reqflags)
	jz	1f
	MEXITCOUNT
	movq	TF_RDI(%rsp),%rdi
	movq	TF_RSI(%rsp),%rsi
	movq	TF_RDX(%rsp),%rdx
	movq	TF_RAX(%rsp),%rax
	movq	TF_RFLAGS(%rsp),%r11
	movq	TF_RIP(%rsp),%rcx
	movq	TF_RSP(%rsp),%rsp
	KMMUEXIT_SYSCALL
	swapgs
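	/*
	 * sysretq restores %rip from %rcx and RFLAGS from %r11, which is
	 * why those registers were reloaded from the frame above.
	 */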
	sysretq

	/*
	 * Normal slow / full iret
	 */
1:
	MEXITCOUNT
	jmp	doreti

/*
 * Here for CYA insurance, in case a "syscall" instruction gets
 * issued from 32 bit compatibility mode. MSR_CSTAR has to point
 * to *something* if EFER_SCE is enabled.
 */
IDTVEC(fast_syscall32)
	sysret

/*
 * NMI handling is special.
 *
 * First, an NMI is taken on its own pcpu stack.  RFLAGS.IF, %gs, and %cr3
 * will be inconsistent when interrupting supervisor mode.
 *
 * Second, the processor treats NMIs specially, blocking further NMIs
 * until an 'iretq' instruction is executed.  We therefore need to
 * execute the NMI handler with interrupts disabled to prevent a
 * nested interrupt from executing an 'iretq' instruction and
 * inadvertently taking the processor out of NMI mode.
 */
IDTVEC(nmi)
	/*
	 * We don't need to special-case entry from userland, %gs will
	 * be consistent with expectations.
	 */
	testb	$SEL_RPL_MASK,TF_CS-TF_RIP(%rsp)	/* from userland? */
	jnz	200f

	/*
	 * From kernel - %gs and %cr3 may be inconsistent.  Save original
	 * values and load consistent values, restore on return.
	 *
	 * The trap handler is NOT allowed to block for this case.
	 */
	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	cld
	movq	%cr3,%rax			/* save CR3 */
	movq	%rax, TR_PCB_CR3_SAVED(%rsp)
	movl	$MSR_GSBASE,%ecx		/* save %gs */
	rdmsr
	shlq	$32,%rdx
	orq	%rdx,%rax
	movq	%rax, TR_PCB_GS_SAVED(%rsp)
	movq	TR_PCB_GS_KERNEL(%rsp),%rdx	/* retrieve kernel %gs */
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr
#if 0
	movq	TR_PCB_CR3(%rsp),%rax		/* retrieve kernel %cr3 */
#endif
	movq	PCPU(trampoline)+TR_PCB_CR3,%rax
	movq	%rax,%cr3

	movq	TR_RDX(%rsp), %rdx
	movq	TR_RCX(%rsp), %rcx
	movq	TR_RAX(%rsp), %rax
	addq	$TR_RIP, %rsp

	/*
	 * Ok, run the normal trap.  Because TF_CS is at a kernel RPL,
	 * the normal code will skip the usual swapgs and KMMU (trampoline)
	 * code.  We've handled the rest.
	 *
	 * NOTE: at this point the trampframe is above the normal stack
	 *	 frame.  The trap code will be ignorant of the special
	 *	 TR_* registers above the cpu hardware frame portion,
	 *	 and the TR_* registers below it will be overwritten.
	 */
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	/*
	 * Pop the frame (since we're coming from kernel mode, this will
	 * not mess with %cr3 or %gs), then restore %cr3 and %gs for our
	 * iretq.  Not optimal but more readable and this is not a
	 * critical path.
	 */
	POP_FRAME(nop)

	subq	$TR_RIP, %rsp
	movq	%rax, TR_RAX(%rsp)
	movq	%rcx, TR_RCX(%rsp)
	movq	%rdx, TR_RDX(%rsp)

	movl	$MSR_GSBASE,%ecx		/* restore %gs */
	movq	TR_PCB_GS_SAVED(%rsp),%rdx
	movl	%edx,%eax
	shrq	$32,%rdx
	wrmsr

	movq	TR_PCB_CR3_SAVED(%rsp),%rax	/* restore %cr3 */
	movq	%rax,%cr3

	movq	TR_RAX(%rsp),%rax
	movq	TR_RCX(%rsp),%rcx
	movq	TR_RDX(%rsp),%rdx
	addq	$TR_RIP, %rsp

	/*
	 * Direct iretq. No point jumping to doreti because the
	 * exception code that deals with iretq faults can't handle
	 * non-deterministic %gs/%cr3 state.
	 */
	iretq

	/*
	 * From userland (normal trap path)
	 */
200:
	PUSH_FRAME_TFRIP
	movq	$0,TF_XFLAGS(%rsp)
	movq	$T_NMI,TF_TRAPNO(%rsp)
	movq	$0,TF_ADDR(%rsp)
	movq	$0,TF_ERR(%rsp)

	FAKE_MCOUNT(TF_RIP(%rsp))
	cld
	movq	%rsp, %rdi
	call	trap
	MEXITCOUNT

	POP_FRAME(jmp doreti_iret)

/*
 * This function is what cpu_heavy_restore jumps to after a new process
 * is created.  The LWKT subsystem switches while holding a critical
 * section and we maintain that abstraction here (e.g. because
 * cpu_heavy_restore needs it due to PCB_*() manipulation), then get out of
 * it before calling the initial function (typically fork_return()) and/or
 * returning to user mode.
 *
 * The MP lock is not held at any point but the critcount is bumped
 * on entry to prevent interruption of the trampoline at a bad point.
 *
 * This is effectively what td->td_switch() returns to.  It 'returns' the
 * old thread in %rax and since this is not returning to a td->td_switch()
 * call from lwkt_switch() we must handle the cleanup for the old thread
 * by calling lwkt_switch_return().
 *
 * fork_trampoline(%rax:otd, %rbx:func, %r12:arg)
 */
ENTRY(fork_trampoline)
	movq	%rax,%rdi
	call	lwkt_switch_return
	movq	PCPU(curthread),%rax
	decl	TD_CRITCOUNT(%rax)

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 *
	 * initproc has its own fork handler, start_init(), which DOES
	 * return.
	 *
	 * %rbx - chaining function (typically fork_return)
	 * %r12 -> %rdi (argument)
	 * frame-> %rsi (trap frame)
	 *
	 *   void (func:rbx)(arg:rdi, trapframe:rsi)
	 */
	movq	%rsp, %rsi		/* pass trapframe by reference */
	movq	%r12, %rdi		/* arg1 */
	call	*%rbx			/* function */

	/* cut from syscall */

	sti
	call	splz

	/*
	 * Return via doreti to handle ASTs.
	 *
	 * trapframe is at the top of the stack.
	 */
	MEXITCOUNT
	jmp	doreti

/*
 * To efficiently implement classification of trap and interrupt handlers
 * for profiling, there must be only trap handlers between the labels btrap
 * and bintr, and only interrupt handlers between the labels bintr and
 * eintr.  This is implemented (partly) by including files that contain
 * some of the handlers.  Before including the files, set up a normal asm
 * environment so that the included files don't need to know that they are
 * included.
 */

#if 0 /* COMPAT_IA32 */
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <x86_64/ia32/ia32_exception.S>
#endif

	.data
	.p2align 4

	.text
	SUPERALIGN_TEXT
MCOUNT_LABEL(bintr)

#if 0 /* JG */
#include <x86_64/x86_64/apic_vector.S>
#endif

#ifdef DEV_ATPIC
	.data
	.p2align 4
	.text
	SUPERALIGN_TEXT

#include <x86_64/isa/atpic_vector.S>
#endif

	.text
MCOUNT_LABEL(eintr)
694