1/*	$NetBSD: locore.S,v 1.108 2016/07/25 16:03:38 maxv Exp $	*/
2
3/*
4 * Copyright-o-rama!
5 */
6
7/*
8 * Copyright (c) 1998, 2000, 2007, 2008, 2016 The NetBSD Foundation, Inc.
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to The NetBSD Foundation
12 * by Charles M. Hannum and by Maxime Villard.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/*
37 * Copyright (c) 2007 Manuel Bouyer.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58 *
59 */
60
61/*
62 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
63 *
64 * Permission to use, copy, modify, and distribute this software for any
65 * purpose with or without fee is hereby granted, provided that the above
66 * copyright notice and this permission notice appear in all copies.
67 *
68 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
69 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
70 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
71 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
72 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
73 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
74 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
75 */
76
77/*
78 * Copyright (c) 2001 Wasabi Systems, Inc.
79 * All rights reserved.
80 *
81 * Written by Frank van der Linden for Wasabi Systems, Inc.
82 *
83 * Redistribution and use in source and binary forms, with or without
84 * modification, are permitted provided that the following conditions
85 * are met:
86 * 1. Redistributions of source code must retain the above copyright
87 *    notice, this list of conditions and the following disclaimer.
88 * 2. Redistributions in binary form must reproduce the above copyright
89 *    notice, this list of conditions and the following disclaimer in the
90 *    documentation and/or other materials provided with the distribution.
91 * 3. All advertising materials mentioning features or use of this software
92 *    must display the following acknowledgement:
93 *      This product includes software developed for the NetBSD Project by
94 *      Wasabi Systems, Inc.
95 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
96 *    or promote products derived from this software without specific prior
97 *    written permission.
98 *
99 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
100 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
101 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
102 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
103 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
104 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
105 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
106 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
107 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
108 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
109 * POSSIBILITY OF SUCH DAMAGE.
110 */
111
112/*-
113 * Copyright (c) 1990 The Regents of the University of California.
114 * All rights reserved.
115 *
116 * This code is derived from software contributed to Berkeley by
117 * William Jolitz.
118 *
119 * Redistribution and use in source and binary forms, with or without
120 * modification, are permitted provided that the following conditions
121 * are met:
122 * 1. Redistributions of source code must retain the above copyright
123 *    notice, this list of conditions and the following disclaimer.
124 * 2. Redistributions in binary form must reproduce the above copyright
125 *    notice, this list of conditions and the following disclaimer in the
126 *    documentation and/or other materials provided with the distribution.
127 * 3. Neither the name of the University nor the names of its contributors
128 *    may be used to endorse or promote products derived from this software
129 *    without specific prior written permission.
130 *
131 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
132 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
133 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
134 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
135 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
136 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
137 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
138 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
139 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
140 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
141 * SUCH DAMAGE.
142 *
143 *	@(#)locore.s	7.3 (Berkeley) 5/13/91
144 */
145
146/* Override user-land alignment before including asm.h */
147#define	ALIGN_DATA	.align	8
148#define ALIGN_TEXT	.align 16,0x90
149#define _ALIGN_TEXT	ALIGN_TEXT
150
151#include <machine/asm.h>
152
153#include "opt_copy_symtab.h"
154#include "opt_ddb.h"
155#include "opt_ddbparam.h"
156#include "opt_modular.h"
157#include "opt_realmem.h"
158
159#include "opt_compat_netbsd.h"
160#include "opt_compat_netbsd32.h"
161#include "opt_compat_ibcs2.h"
162#include "opt_xen.h"
163
164#include "assym.h"
165#include "lapic.h"
166#include "ioapic.h"
167#include "ksyms.h"
168
169#include <sys/errno.h>
170#include <sys/syscall.h>
171
172#include <machine/pte.h>
173#include <machine/segments.h>
174#include <machine/specialreg.h>
175#include <machine/trap.h>
176#include <machine/bootinfo.h>
177#include <machine/frameasm.h>
178#include <machine/cputypes.h>
179
180#if NLAPIC > 0
181#include <machine/i82489reg.h>
182#endif
183
184/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
185#include <dev/isa/isareg.h>
186
/*
 * RELOC: convert a kernel virtual address into the physical (load-time)
 * address the boot code must use while paging is still disabled.
 */
#define	_RELOC(x)	((x) - KERNBASE)
#define	RELOC(x)	_RELOC(_C_LABEL(x))

/* 32bit version of PG_NX */
#define PG_NX32	0x80000000

/*
 * Number of L2/L3 pages needed for the bootstrap tables: doubled when the
 * KERNBASE slot differs from the identity slot at that level, since the
 * kernel is then mapped twice (identity + high).
 */
#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

#if L3_SLOT_KERNBASE > 0
#define TABLE_L3_ENTRIES (2 * NKL3_KIMG_ENTRIES)
#else
#define TABLE_L3_ENTRIES NKL3_KIMG_ENTRIES
#endif

/*
 * Byte offsets of each piece inside the BOOTSTRAP TABLES area, laid out
 * in this order: L4 (PML4), proc0's stack (UPAGES), L3, L2, L1.
 * TABLESIZE is the total size of that area.
 */
#define PROC0_PML4_OFF	0
#define PROC0_STK_OFF	(PROC0_PML4_OFF + 1 * PAGE_SIZE)
#define PROC0_PTP3_OFF	(PROC0_STK_OFF + UPAGES * PAGE_SIZE)
#define PROC0_PTP2_OFF	(PROC0_PTP3_OFF + NKL4_KIMG_ENTRIES * PAGE_SIZE)
#define PROC0_PTP1_OFF	(PROC0_PTP2_OFF + TABLE_L3_ENTRIES * PAGE_SIZE)
#define TABLESIZE \
  ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES) \
    * PAGE_SIZE)
213
214/*
215 * fillkpt - Fill in a kernel page table
216 *	eax = pte (page frame | control | status)
217 *	ebx = page table address
218 *	ecx = number of pages to map
219 *
220 * Each entry is 8 (PDE_SIZE) bytes long: we must set the 4 upper bytes to 0.
221 */
222#define fillkpt	\
223	cmpl	$0,%ecx			;	/* zero-sized? */	\
224	je 	2f			; \
2251:	movl	$0,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: 0 */	\
226	movl	%eax,(%ebx)		;	/* store phys addr */	\
227	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
228	addl	$PAGE_SIZE,%eax		;	/* next phys page */	\
229	loop	1b			; \
2302:					;
231
232/*
233 * fillkpt_nox - Same as fillkpt, but sets the NX/XD bit.
234 */
235#define fillkpt_nox \
236	cmpl	$0,%ecx			;	/* zero-sized? */	\
237	je 	2f			; \
238	pushl	%ebp			; \
239	movl	RELOC(nox_flag),%ebp	; \
2401:	movl	%ebp,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: NX */ \
241	movl	%eax,(%ebx)		;	/* store phys addr */	\
242	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
243	addl	$PAGE_SIZE,%eax		;	/* next phys page */	\
244	loop	1b			; \
245	popl	%ebp			; \
2462:					;
247
248/*
249 * fillkpt_blank - Fill in a kernel page table with blank entries
250 *	ebx = page table address
251 *	ecx = number of pages to map
252 */
253#define fillkpt_blank	\
254	cmpl	$0,%ecx			;	/* zero-sized? */	\
255	je 	2f			; \
2561:	movl	$0,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: 0 */	\
257	movl	$0,(%ebx)		;	/* lower 32 bits: 0 */	\
258	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
259	loop	1b			; \
2602:					;
261
262/*
263 * killkpt - Destroy a kernel page table (long mode)
264 *	rbx = page table address
265 *	rcx = number of pages to destroy
266 */
267#define killkpt \
2681:	movq	$0,(%rbx)	; \
269	addq	$PDE_SIZE,%rbx	; \
270	loop	1b		;
271
272
#ifdef XEN
#define __ASSEMBLY__
#include <xen/xen-public/elfnote.h>
#include <xen/xen-public/xen.h>
/*
 * Emit one Xen ELF note: a SHT_NOTE record (namesz, descsz, type, name,
 * desc), with each field 4-byte aligned as the ELF note format requires.
 */
#define ELFNOTE(name, type, desctype, descdata...) \
.pushsection .note.name			;	\
  .align 4				;	\
  .long 2f - 1f		/* namesz */	;	\
  .long 4f - 3f		/* descsz */	;	\
  .long type				;	\
1:.asciz #name				;	\
2:.align 4				;	\
3:desctype descdata			;	\
4:.align 4				;	\
.popsection

/*
 * Xen guest identifier and loader selection
 */
.section __xen_guest
	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "NetBSD")
	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "4.99")
	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .quad,  KERNBASE)
	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad,  KERNBASE)
	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad,  start)
	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad,  hypercall_page)
	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .quad,  HYPERVISOR_VIRT_START)
	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "")
	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "yes")
	/*
	 * Note: no line-continuation backslash here — these are separate
	 * macro invocations, one note each.  (A stray '\' after the
	 * L1_MFN_VALID line would splice the LOADER invocation onto it
	 * and corrupt both notes.)
	 */
	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .long,  PG_V, PG_V)
	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long,  0)
#if NKSYMS > 0 || defined(DDB) || defined(MODULAR)
	ELFNOTE(Xen, XEN_ELFNOTE_BSD_SYMTAB,     .asciz, "yes")
#endif
#endif	/* XEN */
310
311/*
312 * Initialization
313 */
314	.data
315
316#if NLAPIC > 0
317	.align	PAGE_SIZE
318	.globl	_C_LABEL(local_apic)
319	.globl	_C_LABEL(lapic_tpr)
320
321	.type	_C_LABEL(local_apic), @object
322LABEL(local_apic)
323	.space	LAPIC_TPRI
324END(local_apic)
325	.type	_C_LABEL(lapic_tpr), @object
326LABEL(lapic_tpr)
327	.space	PAGE_SIZE-LAPIC_TPRI
328END(lapic_tpr)
329#endif /* NLAPIC > 0 */
330
	.globl	_C_LABEL(tablesize)
	.globl	_C_LABEL(nox_flag)
	.globl	_C_LABEL(cputype)
	.globl	_C_LABEL(cpuid_level)
	.globl	_C_LABEL(esym)
	.globl	_C_LABEL(eblob)
	.globl	_C_LABEL(atdevbase)
	.globl	_C_LABEL(PDPpaddr)
	.globl	_C_LABEL(boothowto)
	.globl	_C_LABEL(bootinfo)
	.globl	_C_LABEL(biosbasemem)
	.globl	_C_LABEL(biosextmem)
	.globl	_C_LABEL(gdtstore)

	/*
	 * Bootstrap variables shared with C code (PDPpaddr is also read by
	 * libkvm).  NOTE(review): 'tablesize' uses a plain _C_LABEL colon
	 * label while its neighbours use LABEL(); presumably equivalent —
	 * confirm against <machine/asm.h>.
	 */
	.type	_C_LABEL(tablesize), @object
_C_LABEL(tablesize):	.long	TABLESIZE
END(tablesize)
	.type	_C_LABEL(nox_flag), @object
LABEL(nox_flag)		.long	0	/* 32bit NOX flag, set if supported */
END(nox_flag)
	.type	_C_LABEL(cputype), @object
LABEL(cputype)		.long	0	/* are we 80486, Pentium, or.. */
END(cputype)
	.type	_C_LABEL(cpuid_level), @object
LABEL(cpuid_level)	.long	-1	/* max. level accepted by cpuid instr */
END(cpuid_level)
	.type	_C_LABEL(esym), @object
LABEL(esym)		.quad	0	/* ptr to end of syms */
END(esym)
	.type	_C_LABEL(eblob), @object
LABEL(eblob)		.quad	0	/* ptr to end of modules */
END(eblob)
	.type	_C_LABEL(atdevbase), @object
LABEL(atdevbase)	.quad	0	/* location of start of iomem in virt */
END(atdevbase)
	.type	_C_LABEL(PDPpaddr), @object
LABEL(PDPpaddr)		.quad	0	/* paddr of PTD, for libkvm */
END(PDPpaddr)
	.type	_C_LABEL(biosbasemem), @object
#ifndef REALBASEMEM
LABEL(biosbasemem)	.long	0	/* base memory reported by BIOS */
#else
LABEL(biosbasemem)	.long	REALBASEMEM
#endif
END(biosbasemem)
	.type	_C_LABEL(biosextmem), @object
#ifndef REALEXTMEM
LABEL(biosextmem)	.long	0	/* extended memory reported by BIOS */
#else
LABEL(biosextmem)	.long	REALEXTMEM
#endif
END(biosextmem)
383
#ifndef XEN
	.globl	gdt64_lo
	.globl	gdt64_hi

#define GDT64_LIMIT gdt64_end-gdt64_start-1
/*
 * Temporary gdt64, with base address in low memory.  Each of gdt64_lo and
 * gdt64_hi is an lgdt descriptor (16-bit limit + 64-bit base) pointing at
 * the same gdt64_start table; only the base address differs (physical
 * alias vs. KERNBASE-relative virtual address).
 */
	.type	_C_LABEL(gdt64_lo), @object
LABEL(gdt64_lo)
	.word	GDT64_LIMIT
	.quad	_RELOC(gdt64_start)
END(gdt64_lo)
.align 64

/* Temporary gdt64, with base address in high memory */
	.type	_C_LABEL(gdt64_hi), @object
LABEL(gdt64_hi)
	.word	GDT64_LIMIT
	.quad	gdt64_start
END(gdt64_hi)
.align 64
#undef GDT64_LIMIT

	.type	_C_LABEL(gdt64_start), @object
_C_LABEL(gdt64_start):
	.quad 0x0000000000000000	/* always empty */
	.quad 0x00af9a000000ffff	/* kernel CS (L=1: 64-bit code) */
	.quad 0x00cf92000000ffff	/* kernel DS */
END(gdt64_start)
gdt64_end:

	/*
	 * 48-bit far-jump operand (32-bit offset + 16-bit selector) used by
	 * the ljmp in start that switches into the long-mode code segment.
	 */
	.type	_C_LABEL(farjmp64), @object
_C_LABEL(farjmp64):
	.long	_RELOC(longmode)
	.word	GSEL(GCODE_SEL, SEL_KPL)
END(farjmp64)

#endif	/* !XEN */
421
	/* Space for the temporary stack */
	/*
	 * The label is placed AFTER the .space: the stack grows down, so
	 * tmpstk is the initial stack top.  The '.size tmpstk, tmpstk - .'
	 * trick records the 512 bytes that follow as the symbol's size.
	 */
	.size	tmpstk, tmpstk - .
	.space	512
tmpstk:

	.globl _C_LABEL(cpu_private)
	.comm _C_LABEL(cpu_private),PAGE_SIZE,PAGE_SIZE

/*
 * Some hackage to deal with 64bit symbols in 32 bit mode.
 * This may not be needed if things are cleaned up a little.
 */

	.text
	.globl	_C_LABEL(kernel_text)
	.set	_C_LABEL(kernel_text),KERNTEXTOFF
438
ENTRY(start)
#ifndef XEN
	.code32

	/* Warm boot: set the BIOS reset-flag word (0x472) so the memory
	 * test is skipped on reboot. */
	movw	$0x1234,0x472

	/*
	 * Load parameters from the stack (32 bits):
	 *     boothowto, [bootdev], bootinfo, esym, biosextmem, biosbasemem
	 * We are not interested in 'bootdev'.
	 */

	/* Load 'boothowto' */
	movl	4(%esp),%eax
	movl	%eax,RELOC(boothowto)

	/* Load 'bootinfo' */
	movl	12(%esp),%eax
	testl	%eax,%eax		/* bootinfo = NULL? */
	jz	bootinfo_finished

	movl	(%eax),%ebx		/* number of entries */
	movl	$RELOC(bootinfo),%ebp
	movl	%ebp,%edx
	addl	$BOOTINFO_MAXSIZE,%ebp
	movl	%ebx,(%edx)
	addl	$4,%edx

	/*
	 * Copy each bootinfo entry into the kernel's bootinfo area.
	 * Register roles in this loop:
	 *   %eax = cursor in the boot loader's entry-address array
	 *   %ebx = number of entries left to copy
	 *   %edx = destination cursor inside RELOC(bootinfo)
	 *   %ebp = RELOC(bootinfo) + BOOTINFO_MAXSIZE (overflow limit)
	 */
bootinfo_entryloop:
	testl	%ebx,%ebx		/* no remaining entries? */
	jz	bootinfo_finished

	addl	$4,%eax
	movl	(%eax),%ecx		/* address of entry */
	pushl	%edi
	pushl	%esi
	pushl	%eax

	movl	(%ecx),%eax		/* btinfo_common::len (size of entry) */
	movl	%edx,%edi
	addl	(%ecx),%edx		/* update dest pointer */
	cmpl	%ebp,%edx		/* beyond bootinfo+BOOTINFO_MAXSIZE? */
	jg	bootinfo_overflow

	movl	%ecx,%esi
	movl	%eax,%ecx

	/*
	 * If any modules were loaded, record where they end.  We'll need to
	 * skip over them.
	 */
	cmpl	$BTINFO_MODULELIST,4(%esi) /* btinfo_common::type */
	jne	0f

	/* eblob is a 64-bit quantity: add KERNBASE with carry propagation. */
	pushl	12(%esi)		/* btinfo_modulelist::endpa */
	popl	RELOC(eblob)
	addl	$KERNBASE_LO,RELOC(eblob)
	adcl	$KERNBASE_HI,RELOC(eblob)+4

0:
	rep
	movsb				/* copy esi -> edi */
	popl	%eax
	popl	%esi
	popl	%edi
	subl	$1,%ebx			/* decrement the # of entries */
	jmp	bootinfo_entryloop

bootinfo_overflow:
	/*
	 * Cleanup for overflow case. Pop the registers, and correct the number
	 * of entries.
	 */
	popl	%eax
	popl	%esi
	popl	%edi
	movl	$RELOC(bootinfo),%ebp
	movl	%ebp,%edx
	subl	%ebx,(%edx)		/* correct the number of entries */

bootinfo_finished:
	/* Load 'esym' (stored as a 64-bit virtual address) */
	movl	16(%esp),%eax
	testl	%eax,%eax		/* esym = NULL? */
	jz	1f

	addl	$KERNBASE_LO,%eax

1:
	movl	$RELOC(esym),%ebp
	movl	%eax,(%ebp)
	movl	$KERNBASE_HI,4(%ebp)

	/* Load 'biosextmem' (unless preset via REALEXTMEM) */
	movl	$RELOC(biosextmem),%ebp
	movl	(%ebp),%eax
	testl	%eax,%eax		/* already set? */
	jnz	biosextmem_finished

	movl	20(%esp),%eax
	movl	%eax,(%ebp)

biosextmem_finished:
	/* Load 'biosbasemem' (unless preset via REALBASEMEM) */
	movl	$RELOC(biosbasemem),%ebp
	movl	(%ebp),%eax
	testl	%eax,%eax		/* already set? */
	jnz	biosbasemem_finished

	movl	24(%esp),%eax
	movl	%eax,(%ebp)

biosbasemem_finished:
	/*
	 * Done with the parameters!
	 */
556
	/* First, reset the PSL. */
	pushl	$PSL_MBO
	popfl

	/* CPUID leaf 0: %eax returns the maximum supported basic leaf. */
	xorl	%eax,%eax
	cpuid
	movl	%eax,RELOC(cpuid_level)

	/*
	 * Finished with old stack; load new %esp now instead of later so we
	 * can trace this code without having to worry about the trace trap
	 * clobbering the memory test or the zeroing of the bss+bootstrap page
	 * tables.
	 *
	 * The boot program should check:
	 *	text+data <= &stack_variable - more_space_for_stack
	 *	text+data+bss+pad+space_for_page_tables <= end_of_memory
	 *
	 * XXX: the gdt is in the carcass of the boot program so clearing
	 * the rest of memory is still not possible.
	 */
	movl	$RELOC(tmpstk),%esp

	/*
	 * Retrieve the NX/XD flag. We use the 32bit version of PG_NX.
	 * (CPUID extended leaf 0x80000001, %edx bit CPUID_NOX.)
	 */
	movl	$0x80000001,%eax
	cpuid
	andl	$CPUID_NOX,%edx
	jz	no_NOX
	movl	$PG_NX32,RELOC(nox_flag)
no_NOX:

/*
 * There are four levels of pages in amd64: PML4 -> PDP -> PD -> PT. They will
 * be referred to as: L4 -> L3 -> L2 -> L1.
 *
 * Virtual address space of the kernel:
 * +------+--------+------+-----+--------+---------------------+----------
 * | TEXT | RODATA | DATA | BSS | [SYMS] | [PRELOADED MODULES] | L4 ->
 * +------+--------+------+-----+--------+---------------------+----------
 *                             (1)      (2)                   (3)
 *
 * --------------+-----+-----+----+-------------+
 * -> PROC0 STK -> L3 -> L2 -> L1 | ISA I/O MEM |
 * --------------+-----+-----+----+-------------+
 *                               (4)
 *
 * PROC0 STK is obviously not linked as a page level. It just happens to be
 * caught between L4 and L3.
 *
 * (PROC0 STK + L4 + L3 + L2 + L1) is later referred to as BOOTSTRAP TABLES.
 *
 * Important note: the kernel segments are properly 4k-aligned
 * (see kern.ldscript), so there's no need to enforce alignment.
 */
613
614	/* Find end of kernel image; brings us on (1). */
615	movl	$RELOC(end),%edi
616
617#if (NKSYMS || defined(DDB) || defined(MODULAR)) && !defined(makeoptions_COPY_SYMTAB)
618	/* Save the symbols (if loaded); brinds us on (2). */
619	movl	RELOC(esym),%eax
620	testl	%eax,%eax
621	jz	1f
622	subl	$KERNBASE_LO,%eax	/* XXX */
623	movl	%eax,%edi
6241:
625#endif
626	/* Skip over any modules/blobs; brings us on (3). */
627	movl	RELOC(eblob),%eax
628	testl	%eax,%eax
629	jz	1f
630	subl	$KERNBASE_LO,%eax	/* XXX */
631	movl	%eax,%edi
6321:
633
634	/* We are on (3). Align up for BOOTSTRAP TABLES. */
635	movl	%edi,%esi
636	addl	$PGOFSET,%esi
637	andl	$~PGOFSET,%esi
638
639	/* We are on the BOOTSTRAP TABLES. Save L4's physical address. */
640	movl	$RELOC(PDPpaddr),%ebp
641	movl	%esi,(%ebp)
642	movl	$0,4(%ebp)
643
644	/* Now, zero out the BOOTSTRAP TABLES (before filling them in). */
645	movl	%esi,%edi
646	xorl	%eax,%eax
647	cld
648	movl	$TABLESIZE,%ecx
649	shrl	$2,%ecx
650	rep
651	stosl				/* copy eax -> edi */
652
653/*
654 * Build the page tables and levels. We go from L1 to L4, and link the levels
655 * together. Note: RELOC computes &addr - KERNBASE in 32 bits; the value can't
656 * be > 4G, or we can't deal with it anyway, since we are in 32bit mode.
657 */
658	/*
659	 * Build L1.
660	 */
661	leal	(PROC0_PTP1_OFF)(%esi),%ebx
662
663	/* Skip the first MB. */
664	movl	$(KERNTEXTOFF_LO - KERNBASE_LO),%ecx
665	shrl	$PGSHIFT,%ecx
666	fillkpt_blank
667
668	/* Map the kernel text RX. */
669	movl	$(KERNTEXTOFF_LO - KERNBASE_LO),%eax	/* start of TEXT */
670	movl	$RELOC(__rodata_start),%ecx
671	subl	%eax,%ecx
672	shrl	$PGSHIFT,%ecx
673	orl	$(PG_V|PG_KR),%eax
674	fillkpt
675
676	/* Map the kernel rodata R. */
677	movl	$RELOC(__rodata_start),%eax
678	movl	$RELOC(__data_start),%ecx
679	subl	%eax,%ecx
680	shrl	$PGSHIFT,%ecx
681	orl	$(PG_V|PG_KR),%eax
682	fillkpt_nox
683
684	/* Map the kernel data+bss RW. */
685	movl	$RELOC(__data_start),%eax
686	movl	$RELOC(__kernel_end),%ecx
687	subl	%eax,%ecx
688	shrl	$PGSHIFT,%ecx
689	orl	$(PG_V|PG_KW),%eax
690	fillkpt_nox
691
692	/* Map [SYMS]+[PRELOADED MODULES] RW. */
693	movl	$RELOC(__kernel_end),%eax
694	movl	%esi,%ecx		/* start of BOOTSTRAP TABLES */
695	subl	%eax,%ecx
696	shrl	$PGSHIFT,%ecx
697	orl	$(PG_V|PG_KW),%eax
698	fillkpt_nox
699
700	/* Map the BOOTSTRAP TABLES RW. */
701	movl	%esi,%eax		/* start of BOOTSTRAP TABLES */
702	movl	$TABLESIZE,%ecx		/* length of BOOTSTRAP TABLES */
703	shrl	$PGSHIFT,%ecx
704	orl	$(PG_V|PG_KW),%eax
705	fillkpt_nox
706
707	/* We are on (4). Map ISA I/O MEM RW. */
708	movl	$IOM_BEGIN,%eax
709	movl	$IOM_SIZE,%ecx	/* size of ISA I/O MEM */
710	shrl	$PGSHIFT,%ecx
711	orl	$(PG_V|PG_KW/*|PG_N*/),%eax
712	fillkpt_nox
713
714	/*
715	 * Build L2. Linked to L1.
716	 */
717	leal	(PROC0_PTP2_OFF)(%esi),%ebx
718	leal	(PROC0_PTP1_OFF)(%esi),%eax
719	orl	$(PG_V|PG_KW),%eax
720	movl	$(NKL2_KIMG_ENTRIES+1),%ecx
721	fillkpt
722
723#if L2_SLOT_KERNBASE > 0
724	/* If needed, set up level 2 entries for actual kernel mapping */
725	leal	(PROC0_PTP2_OFF + L2_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
726	leal	(PROC0_PTP1_OFF)(%esi),%eax
727	orl	$(PG_V|PG_KW),%eax
728	movl	$(NKL2_KIMG_ENTRIES+1),%ecx
729	fillkpt
730#endif
731
732	/*
733	 * Build L3. Linked to L2.
734	 */
735	leal	(PROC0_PTP3_OFF)(%esi),%ebx
736	leal	(PROC0_PTP2_OFF)(%esi),%eax
737	orl	$(PG_V|PG_KW),%eax
738	movl	$NKL3_KIMG_ENTRIES,%ecx
739	fillkpt
740
741#if L3_SLOT_KERNBASE > 0
742	/* If needed, set up level 3 entries for actual kernel mapping */
743	leal	(PROC0_PTP3_OFF + L3_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
744	leal	(PROC0_PTP2_OFF)(%esi),%eax
745	orl	$(PG_V|PG_KW),%eax
746	movl	$NKL3_KIMG_ENTRIES,%ecx
747	fillkpt
748#endif
749
750	/*
751	 * Build L4 for identity mapping. Linked to L3.
752	 */
753	leal	(PROC0_PML4_OFF)(%esi),%ebx
754	leal	(PROC0_PTP3_OFF)(%esi),%eax
755	orl	$(PG_V|PG_KW),%eax
756	movl	$NKL4_KIMG_ENTRIES,%ecx
757	fillkpt
758
759	/* Set up L4 entries for actual kernel mapping */
760	leal	(PROC0_PML4_OFF + L4_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
761	leal	(PROC0_PTP3_OFF)(%esi),%eax
762	orl	$(PG_V|PG_KW),%eax
763	movl	$NKL4_KIMG_ENTRIES,%ecx
764	fillkpt
765
766	/* Install recursive top level PDE (one entry) */
767	leal	(PROC0_PML4_OFF + PDIR_SLOT_PTE * PDE_SIZE)(%esi),%ebx
768	leal	(PROC0_PML4_OFF)(%esi),%eax
769	orl	$(PG_V|PG_KW),%eax
770	movl	$1,%ecx
771	fillkpt_nox
772
773	/*
774	 * Startup checklist:
775	 * 1. Enable PAE (and SSE while here).
776	 */
777	movl	%cr4,%eax
778	orl	$(CR4_PAE|CR4_OSFXSR|CR4_OSXMMEXCPT),%eax
779	movl	%eax,%cr4
780
781	/*
782	 * 2. Set Long Mode Enable in EFER. Also enable the syscall extensions,
783	 *    and NOX if available.
784	 */
785	movl	$MSR_EFER,%ecx
786	rdmsr
787	xorl	%eax,%eax	/* XXX */
788	orl	$(EFER_LME|EFER_SCE),%eax
789	movl	RELOC(nox_flag),%ebx
790	cmpl	$0,%ebx
791	je 	skip_NOX
792	orl	$(EFER_NXE),%eax
793skip_NOX:
794	wrmsr
795
796	/*
797	 * 3. Load %cr3 with pointer to PML4.
798	 */
799	movl	%esi,%eax
800	movl	%eax,%cr3
801
802	/*
803	 * 4. Enable paging and the rest of it.
804	 */
805	movl	%cr0,%eax
806	orl	$(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_MP|CR0_WP|CR0_AM),%eax
807	movl	%eax,%cr0
808	jmp	compat
809compat:
810
811	/*
812	 * 5. Not quite done yet, we're now in a compatibility segment, in
813	 *    legacy mode. We must jump to a long mode segment. Need to set up
814	 *    a temporary GDT with a long mode segment in it to do that.
815	 */
816	movl	$RELOC(gdt64_lo),%eax
817	lgdt	(%eax)
818	movl	$RELOC(farjmp64),%eax
819	ljmp	*(%eax)
820
821	.code64
822longmode:
823	/*
824	 * 6. Finally, we're in long mode. However, we're still in the identity
825	 *    mapped area (could not jump out of that earlier because it would
826	 *    have been a > 32bit jump). We can do that now, so here we go.
827	 */
828	movabsq	$longmode_hi,%rax
829	jmp	*%rax
830
831longmode_hi:
832
833	/*
834	 * We left the identity mapped area. Base address of
835	 * the temporary gdt64 should now be in high memory.
836	 */
837	movq	$RELOC(gdt64_hi),%rax
838	lgdt	(%rax)
839
840	/*
841	 * We have arrived. There's no need anymore for the identity mapping in
842	 * low memory, remove it.
843	 */
844	movq	$KERNBASE,%r8
845
846#if L2_SLOT_KERNBASE > 0
847	movq	$(NKL2_KIMG_ENTRIES+1),%rcx
848	leaq	(PROC0_PTP2_OFF)(%rsi),%rbx	/* old, phys address */
849	addq	%r8,%rbx			/* new, virt address */
850	killkpt
851#endif
852
853#if L3_SLOT_KERNBASE > 0
854	movq	$NKL3_KIMG_ENTRIES,%rcx
855	leaq	(PROC0_PTP3_OFF)(%rsi),%rbx	/* old, phys address */
856	addq	%r8,%rbx			/* new, virt address */
857	killkpt
858#endif
859
860	movq	$NKL4_KIMG_ENTRIES,%rcx
861	leaq	(PROC0_PML4_OFF)(%rsi),%rbx	/* old, phys address of PML4 */
862	addq	%r8,%rbx			/* new, virt address of PML4 */
863	killkpt
864
865	/* Relocate atdevbase. */
866	movq	$(TABLESIZE+KERNBASE),%rdx
867	addq	%rsi,%rdx
868	movq	%rdx,_C_LABEL(atdevbase)(%rip)
869
870	/* Set up bootstrap stack. */
871	leaq	(PROC0_STK_OFF)(%rsi),%rax
872	addq	%r8,%rax
873	movq	%rax,(_C_LABEL(lwp0)+L_PCB)(%rip) /* XXX L_PCB != uarea */
874	leaq	(USPACE-FRAMESIZE)(%rax),%rsp
875	movq	%rsi,PCB_CR3(%rax)		/* pcb->pcb_cr3 */
876	xorq	%rbp,%rbp			/* mark end of frames */
877
878	xorw	%ax,%ax
879	movw	%ax,%gs
880	movw	%ax,%fs
881
882	/* XXX merge these */
883	leaq	(TABLESIZE+IOM_SIZE)(%rsi),%rdi
884
#else	/* XEN */
	/* First, reset the PSL. */
	pushq	$2
	popfq

	cld

	/*
	 * Xen info:
	 * - %rsi -> start_info struct
	 * - %rsp -> stack, *theoretically* the last used page
	 *	by Xen bootstrap
	 */
	movq	%rsi, %rbx

	/* Clear BSS. */
	xorq	%rax,%rax
	movq	$_C_LABEL(__bss_start),%rdi
	movq	$_C_LABEL(_end),%rcx
	subq	%rdi,%rcx
	rep
	stosb

	/* Copy start_info to a safe place */
	/* 64 quadwords = 512 bytes; NOTE(review): assumes
	 * sizeof(struct start_info) <= 512 — confirm. */
	movq	%rbx,%rsi
	movq	$_C_LABEL(start_info_union),%rdi
	movq	$64,%rcx
	rep
	movsq

	/*
	 * Memory layout at start of the day:
	 * - Kernel image
	 * - Page frames list
	 * - start_info struct. we copied it, so it can be recycled.
	 * - xenstore
	 * - console
	 * - Xen bootstrap page tables
	 * - kernel stack. provided by Xen
	 * - guaranteed 512kB padding
	 *
	 * As we want to rebuild our page tables and place our stack
	 * in proc0 struct, all data starting from after console can be
	 * discarded after we've done a little setup.
	 */

	/*
	 * We want our own page tables, let's rebuild them
	 * We will reclaim xen space afterward INCLUDING stack
	 * so let's change it to a temporary one
	 */

	movq	$tmpstk, %rax
	subq	$8, %rax
	movq	%rax, %rsp

	/* CPUID leaf 0: %eax returns the maximum supported basic leaf. */
	xorl	%eax,%eax
	cpuid
	movl	%eax,_C_LABEL(cpuid_level)

	movq	$cpu_info_primary, %rdi
	movq	%rdi, CPU_INFO_SELF(%rdi) /* ci->ci_self = ci */
	movq	$1, %rsi
	call	cpu_init_msrs	/* cpu_init_msrs(ci, true); */

	call	xen_pmap_bootstrap

	/*
	 * First avail returned by xen_pmap_bootstrap in %rax
	 */
	movq	%rax, %rsi
	movq	%rsi,(_C_LABEL(lwp0)+L_PCB)	/* XXX L_PCB != uarea */

	/*
	 * Set new stack and clear segments
	 */
	leaq	(USPACE-FRAMESIZE)(%rsi),%rsp
	xorq	%rbp,%rbp

	xorw	%ax,%ax
	movw	%ax,%gs
	movw	%ax,%fs

	/*
	 * Set first_avail after proc0
	 */
	movq	%rsi,%rdi
	addq	$USPACE,%rdi
	subq	$KERNBASE,%rdi	/* init_x86_64 wants a physical address */
#endif	/* XEN */

	call	_C_LABEL(init_x86_64)
	call 	_C_LABEL(main)
END(start)
979
#if defined(XEN)
/* space for the hypercall call page */
/*
 * One page at a fixed offset, advertised to the hypervisor via the
 * XEN_ELFNOTE_HYPERCALL_PAGE note; presumably Xen fills it in with the
 * hypercall trampolines at boot — confirm against the Xen ABI docs.
 */
#define HYPERCALL_PAGE_OFFSET 0x1000
.org HYPERCALL_PAGE_OFFSET
ENTRY(hypercall_page)
.skip 0x1000
END(hypercall_page)
#endif /* XEN */
988
989/*
990 * int setjmp(label_t *)
991 *
992 * Used primarily by DDB.
993 */
994ENTRY(setjmp)
995	/*
996	 * Only save registers that must be preserved across function
997	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
998	 * and %rip.
999	 */
1000	movq	%rdi,%rax
1001	movq	%rbx,(%rax)
1002	movq	%rsp,8(%rax)
1003	movq	%rbp,16(%rax)
1004	movq	%r12,24(%rax)
1005	movq	%r13,32(%rax)
1006	movq	%r14,40(%rax)
1007	movq	%r15,48(%rax)
1008	movq	(%rsp),%rdx
1009	movq	%rdx,56(%rax)
1010	xorl	%eax,%eax
1011	ret
1012END(setjmp)
1013
1014/*
1015 * int longjmp(label_t *)
1016 *
1017 * Used primarily by DDB.
1018 */
1019ENTRY(longjmp)
1020	movq	%rdi,%rax
1021	movq	(%rax),%rbx
1022	movq	8(%rax),%rsp
1023	movq	16(%rax),%rbp
1024	movq	24(%rax),%r12
1025	movq	32(%rax),%r13
1026	movq	40(%rax),%r14
1027	movq	48(%rax),%r15
1028	movq	56(%rax),%rdx
1029	movq	%rdx,(%rsp)
1030	movl	$1,%eax
1031	ret
1032END(longjmp)
1033
1034/*
1035 * void dumpsys(void)
1036 *
1037 * Mimic cpu_switchto() for postmortem debugging.
1038 */
1039ENTRY(dumpsys)
1040	/* Build a fake switch frame. */
1041	pushq	%rbx
1042	pushq	%r12
1043	pushq	%r13
1044	pushq	%r14
1045	pushq	%r15
1046
1047	/* Save a context. */
1048	movq	$dumppcb, %rax
1049	movq	%rsp, PCB_RSP(%rax)
1050	movq	%rbp, PCB_RBP(%rax)
1051
1052	call	_C_LABEL(dodumpsys)
1053
1054	addq	$(5*8), %rsp	/* sizeof(switchframe) - sizeof(%rip) */
1055	ret
1056END(dumpsys)
1057
1058/*
1059 * struct lwp *cpu_switchto(struct lwp *oldlwp, struct lwp *newlwp,
1060 *     bool returning)
1061 *
1062 *	1. if (oldlwp != NULL), save its context.
1063 *	2. then, restore context of newlwp.
1064 *
1065 * Note that the stack frame layout is known to "struct switchframe" in
1066 * <machine/frame.h> and to the code in cpu_lwp_fork() which initializes
1067 * it for a new lwp.
1068 */
ENTRY(cpu_switchto)
	/* Build a switchframe (layout known to "struct switchframe"). */
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movq	%rdi,%r13	/* oldlwp */
	movq	%rsi,%r12	/* newlwp */

	testq	%r13,%r13	/* oldlwp = NULL ? */
	jz	skip_save

	/* Save old context. */
	movq	L_PCB(%r13),%rax
	movq	%rsp,PCB_RSP(%rax)
	movq	%rbp,PCB_RBP(%rax)
skip_save:

	/* Switch to newlwp's stack.  %r14 = newlwp's pcb from here on. */
	movq	L_PCB(%r12),%r14
#ifdef XEN /* XXX debug code */
	cmpq	$0,PCB_RSP(%r14)
	jne 999f
	callq _C_LABEL(cpu_Debugger);
999:
#endif
	movq	PCB_RSP(%r14),%rsp
	movq	PCB_RBP(%r14),%rbp

	/*
	 * Set curlwp.  This must be globally visible in order to permit
	 * non-interlocked mutex release.
	 */
	movq	%r12,%rcx
	xchgq	%rcx,CPUVAR(CURLWP)

	/* Skip the rest if returning to a pinned LWP. */
	testb	%dl,%dl		/* returning = true ? */
	jnz	switch_return

	/* Switch ring0 stack */
#ifndef XEN
	movq	PCB_RSP0(%r14),%rax
	movq	%rax,CPUVAR(RSP0)
#else
	movq	%r14,%rdi
	callq	_C_LABEL(x86_64_switch_context);
#endif

	/* Don't bother with the rest if switching to a system process. */
	testl	$LW_SYSTEM,L_FLAG(%r12)
	jnz	switch_return

	/* Is this process using RAS (restartable atomic sequences)? */
	movq	L_PROC(%r12),%rdi
	cmpq	$0,P_RASLIST(%rdi)
	je	no_RAS

	/* Handle restartable atomic sequences (RAS). */
	movq	L_MD_REGS(%r12),%rbx
	movq	TF_RIP(%rbx),%rsi
	call	_C_LABEL(ras_lookup)
	cmpq	$-1,%rax
	je	no_RAS
	movq	%rax,TF_RIP(%rbx)
no_RAS:

	/*
	 * Restore cr0 including FPU state (may have CR0_TS set).  Note that
	 * IPL_SCHED prevents from FPU interrupt altering the LWP's saved cr0.
	 */
#ifndef XEN
	movl	$IPL_HIGH,CPUVAR(ILEVEL)
	movl	PCB_CR0(%r14),%ecx	/* has CR0_TS clear */
	movq	%cr0,%rdx

	/*
	 * If our floating point registers are on a different CPU,
	 * set CR0_TS so we'll trap rather than reuse bogus state.
	 */
	cmpq	CPUVAR(FPCURLWP),%r12
	je	skip_TS
	orq	$CR0_TS,%rcx
skip_TS:

	/* Reloading CR0 is very expensive - avoid if possible. */
	cmpq	%rdx,%rcx
	je	skip_CR0
	movq	%rcx,%cr0
skip_CR0:

	/*
	 * The 32bit LWPs are handled differently.
	 *
	 * NOTE(review): the two labels below were previously swapped
	 * relative to the code they named; the fall-through path (null
	 * %fs/%gs selectors plus FSBASE/KERNELGSBASE MSR reload) is the
	 * native 64-bit handling, while the GDT-descriptor path is the
	 * COMPAT32 one.  Renamed so the labels match the code; control
	 * flow is unchanged.
	 */
	testl	$PCB_COMPAT32,PCB_FLAGS(%r14)
	jne	lwp_32bit

lwp_64bit:
	/* Zero out %fs/%gs segment registers (64-bit LWPs use the MSRs). */
	xorq	%rax,%rax
	movw	%ax,%fs
	CLI(cx)
	SWAPGS			/* writing %gs clobbers the hidden base */
	movw	%ax,%gs
	SWAPGS
	STI(cx)

	/* Zero out GDT descriptors. */
	movq	CPUVAR(GDT),%rcx
	movq	%rax,(GUFS_SEL*8)(%rcx)
	movq	%rax,(GUGS_SEL*8)(%rcx)

	/* Reload 64-bit %fs/%gs base MSRs from the pcb. */
	movl	$MSR_FSBASE,%ecx
	movl	PCB_FS(%r14),%eax
	movl	4+PCB_FS(%r14),%edx
	wrmsr
	movl	$MSR_KERNELGSBASE,%ecx
	movl	PCB_GS(%r14),%eax
	movl	4+PCB_GS(%r14),%edx
	wrmsr

	jmp	switch_return

lwp_32bit:
	/* Reload %fs/%gs GDT descriptors from the pcb. */
	movq	CPUVAR(GDT),%rcx
	movq	PCB_FS(%r14),%rax
	movq	%rax,(GUFS_SEL*8)(%rcx)
	movq	PCB_GS(%r14),%rax
	movq	%rax,(GUGS_SEL*8)(%rcx)

	/* Reload %fs and %gs selectors from the trapframe */
	movq	L_MD_REGS(%r12),%rbx
	movw	TF_FS(%rbx),%fs
	CLI(ax)
	SWAPGS			/* protect the kernel %gs base */
	movw	TF_GS(%rbx),%gs
	SWAPGS
	STI(ax)
#else
	movq	%r12,%rdi
	callq	_C_LABEL(x86_64_tls_switch)
#endif

switch_return:
	/* Return to the new LWP, returning 'oldlwp' in %rax. */
	movq	%r13,%rax
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
END(cpu_switchto)
1223
1224/*
1225 * void savectx(struct pcb *pcb);
1226 *
1227 * Update pcb, saving current processor state.
1228 */
ENTRY(savectx)
	/* Save stack pointers into the caller-supplied pcb (arg0). */
	movq	%rsp,PCB_RSP(%rdi)
	movq	%rbp,PCB_RBP(%rdi)
	ret
END(savectx)
1235
IDTVEC(syscall32)
	/*
	 * Entry for the 'syscall' instruction executed from 32-bit
	 * compat mode.  Nothing is handled here: we return to user
	 * mode immediately without doing anything.
	 */
	sysret		/* go away please */
IDTVEC_END(syscall32)
1239
1240/*
1241 * syscall()
1242 *
1243 * syscall insn entry.
1244 * This currently isn't much faster, but it can be made faster in the future.
1245 * (Actually we've already saved a few 100 clocks by not loading the trap gate)
1246 */
IDTVEC(syscall)
#ifndef XEN
	/*
	 * The user %rip is in %rcx and the user %flags in %r11. The kernel %cs
	 * and %ss are loaded, but nothing else is.
	 *
	 * The 'swapgs' instruction gives us access to cpu-specific memory where
	 * we can save a user register and then read the LWP's kernel stack
	 * pointer.
	 *
	 * This code doesn't seem to set %ds, this may not matter since it is
	 * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that
	 * is ignored as well.
	 */
	swapgs
	movq	%r15,CPUVAR(SCRATCH)	/* free up %r15 as scratch */
	movq	CPUVAR(CURLWP),%r15
	movq	L_PCB(%r15),%r15
	movq	PCB_RSP0(%r15),%r15	/* LWP's kernel stack pointer */

	/*
	 * Make the stack look like an 'int nn' frame.  SP(x) addresses
	 * slot x of the frame being built at the top of the kernel
	 * stack, while %rsp still points at the user stack.
	 */
#define SP(x)	(x)-(TF_SS+8)(%r15)
	movq	$(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS) /* user %ss */
	movq	%rsp,SP(TF_RSP)		/* User space rsp */

	movq	%r11,SP(TF_RFLAGS)	/* old rflags from syscall insn */
	movq	$(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS)
	movq	%rcx,SP(TF_RIP)		/* syscall saves rip in rcx */

	leaq	SP(0),%rsp		/* %rsp now valid after frame */
	movq	CPUVAR(SCRATCH),%r15	/* user %r15 back, to be saved below */
#undef SP

	movq	$2,TF_ERR(%rsp)		/* syscall instruction size */
	movq	$T_ASTFLT,TF_TRAPNO(%rsp)

	movw	%es,TF_ES(%rsp)
	sti				/* frame is safe; interrupts back on */
	INTR_SAVE_GPRS
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	movw	$(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
#else
	/* Xen already switched to kernel stack */
	pushq	%rsi
	STI(si)
	popq	%rsi
	addq	$0x10,%rsp	/* gap to match cs:rip */
	pushq	$2		/* error code */
	pushq	$T_ASTFLT
	subq	$TF_REGSIZE,%rsp
	INTR_SAVE_GPRS
	movw	%fs,TF_FS(%rsp)
	movw	%gs,TF_GS(%rsp)
	movw	%es,TF_ES(%rsp)
	movw	$(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
#endif

do_syscall:
	/* %r14 = curlwp, %r15 = curproc, for the rest of this path. */
	movq	CPUVAR(CURLWP),%r14
	incq	CPUVAR(NSYSCALL)	/* count it atomically */
	movq	%rsp,L_MD_REGS(%r14)	/* save pointer to frame */
	movq	L_PROC(%r14),%r15
	andl	$~MDL_IRET,L_MD_FLAGS(%r14)   /* Allow sysret return */
	movq	%rsp,%rdi		/* Pass frame as arg0 */
	call	*P_MD_SYSCALL(%r15)	/* process's syscall dispatcher */
.Lsyscall_checkast:
	/*
	 * Disable interrupts to avoid new ASTs (etc) being added and
	 * to ensure we don't take an interrupt with some of the user
	 * registers loaded.
	 */
	CLI(si)
	/* Check for ASTs on exit to user mode. */
	movl	L_MD_ASTPENDING(%r14),%eax
	orl	CPUVAR(WANT_PMAPLOAD),%eax
	jnz	9f

#ifdef DIAGNOSTIC
	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
	jne	spl_error
#endif

	/*
	 * NOTE(review): the ZF produced by this test is consumed by the
	 * 'jnz 2f' below, after the register restore -- this relies on
	 * INTR_RESTORE_GPRS and SWAPGS leaving %rflags untouched.
	 */
	testl	$(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
	INTR_RESTORE_GPRS
	movw	TF_ES(%rsp),%es
	SWAPGS
	jnz	2f			/* must return via iretq */
#ifndef XEN
	movq	TF_RIP(%rsp),%rcx	/* %rip for sysret */
	movq	TF_RFLAGS(%rsp),%r11	/* %flags for sysret */
	movw	TF_DS(%rsp),%ds
	movq	TF_RSP(%rsp),%rsp
	sysretq
#else
	movw	TF_DS(%rsp),%ds
	addq	$TF_RIP,%rsp
	pushq	$256	/* VGCF_IN_SYSCALL */
	jmp	HYPERVISOR_iret
#endif

/*
 * If the syscall might have modified some registers, or we are a 32bit
 * process we must return to user with an 'iret' instruction.
 * If the iret faults in kernel (assumed due to illegal register values)
 * then a SIGSEGV will be signalled.
 */
2:
	movw	TF_DS(%rsp),%ds
	addq	$TF_RIP,%rsp		/* point at the iret frame */
	iretq

#ifdef DIAGNOSTIC
	/* Report SPL error: warn, force IPL back down, then retry exit. */
spl_error:
	movabsq	$4f,%rdi
	movl	TF_RAX(%rsp),%esi
	movl	TF_RDI(%rsp),%edx
	movl	%ebx,%ecx
	movl	CPUVAR(ILEVEL),%r8d
	xorq	%rax,%rax		/* no vector args to printf */
	call	_C_LABEL(printf)
	movl	$IPL_NONE,%edi
	call	_C_LABEL(spllower)
	jmp	.Lsyscall_checkast
4:	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n"
#endif

/* AST pending or pmap load needed */
9:
	cmpl	$0,CPUVAR(WANT_PMAPLOAD)
	jz	10f
	STI(si)
	call	_C_LABEL(do_pmap_load)
	jmp	.Lsyscall_checkast	/* re-check ASTs */
10:
	CLEAR_ASTPENDING(%r14)
	STI(si)
	/* Pushed T_ASTFLT into tf_trapno on entry. */
	movq	%rsp,%rdi
	call	_C_LABEL(trap)
	jmp	.Lsyscall_checkast	/* re-check ASTs */
IDTVEC_END(syscall)
1390
1391/*
1392 * void lwp_trampoline(void);
1393 *
 * This is a trampoline function pushed onto the stack of, and run by,
 * newly created LWPs in order to do additional setup in their context.
1396 */
NENTRY(lwp_trampoline)
	/*
	 * We arrive here from cpu_switchto(), which returns the previous
	 * lwp in %rax.  %rbp, %r12 and %r13 come from the switchframe
	 * set up by cpu_lwp_fork(): presumably %rbp = this lwp,
	 * %r12 = function to run, %r13 = its argument -- confirm against
	 * cpu_lwp_fork().
	 */
	movq	%rbp,%rsi		/* arg1 of lwp_startup() */
	movq	%rbp,%r14	/* for .Lsyscall_checkast */
	movq	%rax,%rdi		/* arg0: previous lwp */
	xorq	%rbp,%rbp		/* terminate frame-pointer chain */
	call	_C_LABEL(lwp_startup)
	movq	%r13,%rdi		/* arg0 for the lwp's function */
	call	*%r12
	jmp	.Lsyscall_checkast	/* exit to user via syscall path */
END(lwp_trampoline)
1407
1408/*
1409 * oosyscall()
1410 *
 * Old call gate entry for syscall.  Only needed if we're going to
 * support running old i386 NetBSD 1.0 or ibcs2 binaries, etc,
 * on NetBSD/amd64.
1414 * The 64bit call gate can't request that arguments be copied from the
1415 * user stack (which the i386 code uses to get a gap for the flags).
1416 * push/pop are <read>:<modify_sp>:<write> cycles.
1417 */
IDTVEC(oosyscall)
	/*
	 * Set rflags in trap frame.  A 64-bit call gate pushes
	 * %ss:%rsp and %cs:%rip but no %rflags; the push/pop shuffle
	 * below moves %rip and %cs down one slot and inserts %rflags
	 * where an interrupt frame would have it.
	 */
	pushq	(%rsp)		/* move user's %eip */
	pushq	16(%rsp)	/* and %cs */
	popq	8(%rsp)
	pushfq
	popq	16(%rsp)
	pushq	$7		/* size of instruction for restart */
	jmp	osyscall1
IDTVEC_END(oosyscall)
1428
1429/*
1430 * osyscall()
1431 *
1432 * Trap gate entry for int $80 syscall, also used by sigreturn.
1433 */
IDTVEC(osyscall)
#ifdef XEN
	/*
	 * Recover what look like the user's %rcx and %r11 from the top
	 * two stack slots and drop them, leaving a plain iret-style
	 * frame -- NOTE(review): confirm against the Xen event-frame
	 * layout.
	 */
	movq (%rsp),%rcx
	movq 8(%rsp),%r11
	addq $0x10,%rsp
#endif
	pushq	$2		/* size of instruction for restart */
osyscall1:
	pushq	$T_ASTFLT	/* trap # for doing ASTs */
	INTRENTRY
	STI(si)
	jmp	do_syscall	/* join the common syscall path */
IDTVEC_END(osyscall)
1447
1448/*
1449 * bool sse2_idlezero_page(void *pg)
1450 *
1451 * Zero a page without polluting the cache.  Preemption must be
1452 * disabled by the caller. Abort if a preemption is pending.
1453 * Returns true if the page is zeroed, false if not.
1454 */
ENTRY(sse2_idlezero_page)
	pushq	%rbp
	movq	%rsp,%rbp
	movl	$(PAGE_SIZE/64), %ecx	/* iterations: 64 bytes each */
	xorq	%rax, %rax		/* zero source; also return 0 */
	.align	16
1:
	/* Abort mid-page if a kernel preemption has been requested. */
	testl	$RESCHED_KPREEMPT, CPUVAR(RESCHED)
	jnz	2f
	/* Eight non-temporal stores: zero 64 bytes bypassing the cache. */
	movnti	%rax, 0(%rdi)
	movnti	%rax, 8(%rdi)
	movnti	%rax, 16(%rdi)
	movnti	%rax, 24(%rdi)
	movnti	%rax, 32(%rdi)
	movnti	%rax, 40(%rdi)
	movnti	%rax, 48(%rdi)
	movnti	%rax, 56(%rdi)
	addq	$64, %rdi
	decl	%ecx
	jnz	1b
	sfence				/* order the non-temporal stores */
	incl	%eax			/* page fully zeroed: return 1 */
	popq	%rbp
	ret
2:
	sfence				/* aborted: return 0 (%eax still 0) */
	popq	%rbp
	ret
END(sse2_idlezero_page)
1484
1485/*
1486 * void pagezero(vaddr_t va)
1487 *
1488 * Zero a page without polluting the cache.
1489 */
1490
ENTRY(pagezero)
	/*
	 * Count %rdx up from -PAGE_SIZE to 0 while addressing
	 * (%rdi,%rdx), with %rdi pre-biased one page past 'va': the
	 * loop condition is then just the flags from the addq.
	 */
	movq	$-PAGE_SIZE,%rdx
	subq	%rdx,%rdi		/* %rdi = va + PAGE_SIZE */
	xorq	%rax,%rax
1:
	/* Eight non-temporal stores: zero 64 bytes bypassing the cache. */
	movnti	%rax,(%rdi,%rdx)
	movnti	%rax,8(%rdi,%rdx)
	movnti	%rax,16(%rdi,%rdx)
	movnti	%rax,24(%rdi,%rdx)
	movnti	%rax,32(%rdi,%rdx)
	movnti	%rax,40(%rdi,%rdx)
	movnti	%rax,48(%rdi,%rdx)
	movnti	%rax,56(%rdi,%rdx)
	addq	$64,%rdx
	jne	1b
	sfence				/* order the non-temporal stores */
	ret
END(pagezero)
1509