1/* $OpenBSD: locore.S,v 1.148 2024/08/02 22:24:51 guenther Exp $ */ 2/* $NetBSD: locore.S,v 1.13 2004/03/25 18:33:17 drochner Exp $ */ 3 4/* 5 * Copyright-o-rama! 6 */ 7 8/* 9 * Copyright (c) 2001 Wasabi Systems, Inc. 10 * All rights reserved. 11 * 12 * Written by Frank van der Linden for Wasabi Systems, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed for the NetBSD Project by 25 * Wasabi Systems, Inc. 26 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 27 * or promote products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 32 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 33 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 34 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 35 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 36 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 37 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 38 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 39 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 40 * POSSIBILITY OF SUCH DAMAGE. 41 */ 42 43 44/*- 45 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc. 46 * All rights reserved. 47 * 48 * This code is derived from software contributed to The NetBSD Foundation 49 * by Charles M. Hannum. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions 53 * are met: 54 * 1. Redistributions of source code must retain the above copyright 55 * notice, this list of conditions and the following disclaimer. 56 * 2. Redistributions in binary form must reproduce the above copyright 57 * notice, this list of conditions and the following disclaimer in the 58 * documentation and/or other materials provided with the distribution. 59 * 60 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 61 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 62 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 63 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 64 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 65 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 66 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 67 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 68 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 70 * POSSIBILITY OF SUCH DAMAGE. 71 */ 72 73/*- 74 * Copyright (c) 1990 The Regents of the University of California. 75 * All rights reserved. 76 * 77 * This code is derived from software contributed to Berkeley by 78 * William Jolitz. 79 * 80 * Redistribution and use in source and binary forms, with or without 81 * modification, are permitted provided that the following conditions 82 * are met: 83 * 1. Redistributions of source code must retain the above copyright 84 * notice, this list of conditions and the following disclaimer. 85 * 2. Redistributions in binary form must reproduce the above copyright 86 * notice, this list of conditions and the following disclaimer in the 87 * documentation and/or other materials provided with the distribution. 88 * 3. Neither the name of the University nor the names of its contributors 89 * may be used to endorse or promote products derived from this software 90 * without specific prior written permission. 91 * 92 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 93 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 95 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 96 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 97 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 98 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 99 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 100 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 101 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 102 * SUCH DAMAGE. 
103 * 104 * @(#)locore.s 7.3 (Berkeley) 5/13/91 105 */ 106 107#include "assym.h" 108#include "efi.h" 109#include "lapic.h" 110#include "ksyms.h" 111#include "xen.h" 112#include "hyperv.h" 113 114#include <sys/syscall.h> 115 116#include <machine/param.h> 117#include <machine/codepatch.h> 118#include <machine/psl.h> 119#include <machine/segments.h> 120#include <machine/specialreg.h> 121#include <machine/trap.h> /* T_PROTFLT */ 122#include <machine/frameasm.h> 123 124#if NLAPIC > 0 125#include <machine/i82489reg.h> 126#endif 127 128/* 129 * override user-land alignment before including asm.h 130 */ 131#define ALIGN_DATA .align 8,0xcc 132 133#include <machine/asm.h> 134 135#define SET_CURPROC(proc,cpu) \ 136 movq CPUVAR(SELF),cpu ; \ 137 movq proc,CPUVAR(CURPROC) ; \ 138 movq cpu,P_CPU(proc) 139 140#define GET_CURPCB(reg) movq CPUVAR(CURPCB),reg 141#define SET_CURPCB(reg) movq reg,CPUVAR(CURPCB) 142 143 144/* 145 * Initialization 146 */ 147 .data 148 149#if NLAPIC > 0 150 .align NBPG, 0xcc 151 .globl local_apic, lapic_id, lapic_tpr 152local_apic: 153 .space LAPIC_ID 154lapic_id: 155 .long 0x00000000 156 .space LAPIC_TPRI-(LAPIC_ID+4) 157lapic_tpr: 158 .space LAPIC_PPRI-LAPIC_TPRI 159lapic_ppr: 160 .space LAPIC_ISR-LAPIC_PPRI 161lapic_isr: 162 .space NBPG-LAPIC_ISR 163#endif 164 165/*****************************************************************************/ 166 167/* 168 * Signal trampoline; copied to a page mapped into userspace. 169 * gdb's backtrace logic matches against the instructions in this. 170 */ 171 .section .rodata 172 .globl sigcode 173sigcode: 174 endbr64 175 call 1f 176 movq %rsp,%rdi 177 pushq %rdi /* fake return address */ 178 movq $SYS_sigreturn,%rax 179 .globl sigcodecall 180sigcodecall: 181 syscall 182 .globl sigcoderet 183sigcoderet: 184 int3 1851: CODEPATCH_START 186 JMP_RETPOLINE(rax) 187 CODEPATCH_END(CPTAG_RETPOLINE_RAX) 188 .globl esigcode 189esigcode: 190 .globl sigfill 191sigfill: 192 int3 193esigfill: 194 .globl sigfillsiz 195sigfillsiz: 196 .long esigfill - sigfill 197 198 .text 199/* 200 * void lgdt(struct region_descriptor *rdp); 201 * Change the global descriptor table. 202 */ 203NENTRY(lgdt) 204 RETGUARD_SETUP(lgdt, r11) 205 /* Reload the descriptor table. */ 206 movq %rdi,%rax 207 lgdt (%rax) 208 /* Flush the prefetch q. */ 209 jmp 1f 210 nop 2111: /* Reload "stale" selectors. */ 212 movl $GSEL(GDATA_SEL, SEL_KPL),%eax 213 movl %eax,%ds 214 movl %eax,%es 215 movl %eax,%ss 216 /* Reload code selector by doing intersegment return. */ 217 popq %rax 218 pushq $GSEL(GCODE_SEL, SEL_KPL) 219 pushq %rax 220 RETGUARD_CHECK(lgdt, r11) 221 lretq 222END(lgdt) 223 224#if defined(DDB) || NEFI > 0 225ENTRY(setjmp) 226 RETGUARD_SETUP(setjmp, r11) 227 /* 228 * Only save registers that must be preserved across function 229 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15) 230 * and %rip. 
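	 *
	 * The buffer in %rdi is laid out as eight quadwords, matching the
	 * stores below:
	 *   0 %rbx    8 %rsp   16 %rbp   24 %r12
	 *  32 %r13   40 %r14   48 %r15   56 saved return %rip
	 * setjmp() returns 0; longjmp() reloads the same slots and returns 1.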
231 */ 232 movq %rdi,%rax 233 movq %rbx,(%rax) 234 movq %rsp,8(%rax) 235 movq %rbp,16(%rax) 236 movq %r12,24(%rax) 237 movq %r13,32(%rax) 238 movq %r14,40(%rax) 239 movq %r15,48(%rax) 240 movq (%rsp),%rdx 241 movq %rdx,56(%rax) 242 xorl %eax,%eax 243 RETGUARD_CHECK(setjmp, r11) 244 ret 245 lfence 246END(setjmp) 247 248ENTRY(longjmp) 249 movq %rdi,%rax 250 movq 8(%rax),%rsp 251 movq 56(%rax),%rdx 252 movq %rdx,(%rsp) 253 RETGUARD_SETUP(longjmp, r11) 254 movq (%rax),%rbx 255 movq 16(%rax),%rbp 256 movq 24(%rax),%r12 257 movq 32(%rax),%r13 258 movq 40(%rax),%r14 259 movq 48(%rax),%r15 260 xorl %eax,%eax 261 incl %eax 262 RETGUARD_CHECK(longjmp, r11) 263 ret 264 lfence 265END(longjmp) 266#endif /* DDB || NEFI > 0 */ 267 268/*****************************************************************************/ 269 270/* 271 * int cpu_switchto(struct proc *old, struct proc *new) 272 * Switch from "old" proc to "new". 273 */ 274ENTRY(cpu_switchto) 275 pushq %rbx 276 pushq %rbp 277 pushq %r12 278 pushq %r13 279 pushq %r14 280 pushq %r15 281 282 movq %rdi, %r13 283 movq %rsi, %r12 284 285 /* Record new proc. */ 286 movb $SONPROC,P_STAT(%r12) # p->p_stat = SONPROC 287 SET_CURPROC(%r12,%rcx) 288 289 movl CPUVAR(CPUID),%r9d 290 291 /* for the FPU/"extended CPU state" handling below */ 292 movq xsave_mask(%rip),%rdx 293 movl %edx,%eax 294 shrq $32,%rdx 295 296 /* If old proc exited, don't bother. */ 297 xorl %ecx,%ecx 298 testq %r13,%r13 299 jz switch_exited 300 301 /* 302 * Save old context. 303 * 304 * Registers: 305 * %rax - scratch 306 * %r13 - old proc, then old pcb 307 * %rcx - old pmap if not P_SYSTEM 308 * %r12 - new proc 309 * %r9d - cpuid 310 */ 311 312 /* remember the pmap if not P_SYSTEM */ 313 testl $P_SYSTEM,P_FLAG(%r13) 314 movq P_ADDR(%r13),%r13 315 jnz 0f 316 movq PCB_PMAP(%r13),%rcx 3170: 318 319 /* Save stack pointers. */ 320 movq %rsp,PCB_RSP(%r13) 321 movq %rbp,PCB_RBP(%r13) 322 323 /* 324 * If the old proc ran in userspace then save the 325 * floating-point/"extended state" registers 326 */ 327 testl $CPUPF_USERXSTATE,CPUVAR(PFLAGS) 328 jz .Lxstate_reset 329 330 movq %r13, %rdi 331#if PCB_SAVEFPU != 0 332 addq $PCB_SAVEFPU,%rdi 333#endif 334 CODEPATCH_START 335 fxsave64 (%rdi) 336 CODEPATCH_END(CPTAG_XSAVE) 337 338switch_exited: 339 /* now clear the xstate */ 340 movq proc0paddr(%rip),%rdi 341#if PCB_SAVEFPU != 0 342 addq $PCB_SAVEFPU,%rdi 343#endif 344 CODEPATCH_START 345 fxrstor64 (%rdi) 346 CODEPATCH_END(CPTAG_XRSTORS) 347 andl $~CPUPF_USERXSTATE,CPUVAR(PFLAGS) 348 349.Lxstate_reset: 350 /* 351 * If the segment registers haven't been reset since the old proc 352 * ran in userspace then reset them now 353 */ 354 testl $CPUPF_USERSEGS,CPUVAR(PFLAGS) 355 jz restore_saved 356 andl $~CPUPF_USERSEGS,CPUVAR(PFLAGS) 357 358 /* set %ds, %es, %fs, and %gs to expected value to prevent info leak */ 359 movw $(GSEL(GUDATA_SEL, SEL_UPL)),%ax 360 movw %ax,%ds 361 movw %ax,%es 362 movw %ax,%fs 363 cli /* block interrupts when on user GS.base */ 364 swapgs /* switch from kernel to user GS.base */ 365 movw %ax,%gs /* set %gs to UDATA and GS.base to 0 */ 366 swapgs /* back to kernel GS.base */ 367 368restore_saved: 369 /* 370 * Restore saved context. 
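	 * (pick up the new proc's pcb and kernel stack, and only reload
	 * %cr3 when the new pmap differs from the one already active)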
371 * 372 * Registers: 373 * %rax, %rdx - scratch 374 * %rcx - old pmap if not P_SYSTEM 375 * %r12 - new process 376 * %r13 - new pcb 377 * %rbx - new pmap if not P_SYSTEM 378 */ 379 380 movq P_ADDR(%r12),%r13 381 382 /* remember the pmap if not P_SYSTEM */ 383 xorl %ebx,%ebx 384 testl $P_SYSTEM,P_FLAG(%r12) 385 jnz 1f 386 movq PCB_PMAP(%r13),%rbx 3871: 388 389 /* No interrupts while loading new state. */ 390 cli 391 392 /* Restore stack pointers. */ 393 movq PCB_RSP(%r13),%rsp 394 movq PCB_RBP(%r13),%rbp 395 396 /* Stack pivot done, setup RETGUARD */ 397 RETGUARD_SETUP_OFF(cpu_switchto, r11, 6*8) 398 399 /* don't switch cr3 to the same thing it already was */ 400 movq PCB_CR3(%r13),%rax 401 movq %cr3,%rdi 402 xorq %rax,%rdi 403 btrq $63,%rdi /* ignore CR3_REUSE_PCID */ 404 testq %rdi,%rdi 405 jz .Lsame_cr3 406 407#ifdef DIAGNOSTIC 408 /* verify ci_proc_pmap had been updated properly */ 409 cmpq %rcx,CPUVAR(PROC_PMAP) 410 jnz .Lbogus_proc_pmap 411#endif 412 /* record which pmap this CPU should get IPIs for */ 413 movq %rbx,CPUVAR(PROC_PMAP) 414 415.Lset_cr3: 416 movq %rax,%cr3 /* %rax used below too */ 417 418.Lsame_cr3: 419 /* 420 * If we switched from a userland thread with a shallow call stack 421 * (e.g interrupt->ast->mi_ast->prempt->mi_switch->cpu_switchto) 422 * then the RSB may have attacker controlled entries when we switch 423 * to a deeper call stack in the new thread. Refill the RSB with 424 * entries safe to speculate into/through. 425 */ 426 RET_STACK_REFILL_WITH_RCX 427 428 /* Don't bother with the rest if switching to a system process. */ 429 testq %rbx,%rbx 430 jz switch_restored 431 432 /* record the bits needed for future U-->K transition */ 433 movq PCB_KSTACK(%r13),%rdx 434 subq $FRAMESIZE,%rdx 435 movq %rdx,CPUVAR(KERN_RSP) 436 437 CODEPATCH_START 438 /* 439 * Meltdown: iff we're doing separate U+K and U-K page tables, 440 * then record them in cpu_info for easy access in syscall and 441 * interrupt trampolines. 442 */ 443 movq PM_PDIRPA_INTEL(%rbx),%rdx 444 orq cr3_reuse_pcid,%rax 445 orq cr3_pcid_proc_intel,%rdx 446 movq %rax,CPUVAR(KERN_CR3) 447 movq %rdx,CPUVAR(USER_CR3) 448 CODEPATCH_END(CPTAG_MELTDOWN_NOP) 449 450switch_restored: 451 SET_CURPCB(%r13) 452 453 /* Interrupts are okay again. */ 454 sti 455 popq %r15 456 popq %r14 457 popq %r13 458 popq %r12 459 popq %rbp 460 popq %rbx 461 RETGUARD_CHECK(cpu_switchto, r11) 462 ret 463 lfence 464 465#ifdef DIAGNOSTIC 466.Lbogus_proc_pmap: 467 leaq bogus_proc_pmap,%rdi 468 call panic 469 int3 /* NOTREACHED */ 470 .pushsection .rodata 471bogus_proc_pmap: 472 .asciz "curcpu->ci_proc_pmap didn't point to previous pmap" 473 .popsection 474#endif /* DIAGNOSTIC */ 475END(cpu_switchto) 476 477NENTRY(retpoline_rax) 478 CODEPATCH_START 479 JMP_RETPOLINE(rax) 480 CODEPATCH_END(CPTAG_RETPOLINE_RAX) 481END(retpoline_rax) 482 483NENTRY(__x86_indirect_thunk_r11) 484 CODEPATCH_START 485 JMP_RETPOLINE(r11) 486 CODEPATCH_END(CPTAG_RETPOLINE_R11) 487END(__x86_indirect_thunk_r11) 488 489ENTRY(cpu_idle_cycle_hlt) 490 RETGUARD_SETUP(cpu_idle_cycle_hlt, r11) 491 sti 492 hlt 493 RETGUARD_CHECK(cpu_idle_cycle_hlt, r11) 494 ret 495 lfence 496END(cpu_idle_cycle_hlt) 497 498/* 499 * savectx(struct pcb *pcb); 500 * Update pcb, saving current processor state. 501 */ 502ENTRY(savectx) 503 RETGUARD_SETUP(savectx, r11) 504 /* Save stack pointers. */ 505 movq %rsp,PCB_RSP(%rdi) 506 movq %rbp,PCB_RBP(%rdi) 507 RETGUARD_CHECK(savectx, r11) 508 ret 509 lfence 510END(savectx) 511 512/* 513 * syscall insn entry. 
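 * (Reached via the 'syscall' instruction: MSR_LSTAR points at Xsyscall,
 * or at Xsyscall_meltdown when separate U-K page tables are in use.)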
514 * Enter here with interrupts blocked; %rcx contains the caller's 515 * %rip and the original rflags has been copied to %r11. %cs and 516 * %ss have been updated to the kernel segments, but %rsp is still 517 * the user-space value. 518 * First order of business is to swap to the kernel GS.base so that 519 * we can access our struct cpu_info. After possibly mucking with 520 * pagetables, we switch to our kernel stack. Once that's in place 521 * we can save the rest of the syscall frame and unblock interrupts. 522 */ 523KUTEXT_PAGE_START 524 .align NBPG, 0xcc 525XUsyscall_meltdown: 526 /* 527 * This is the real Xsyscall_meltdown page, which is mapped into 528 * the U-K page tables at the same location as Xsyscall_meltdown 529 * below. For this, the Meltdown case, we use the scratch space 530 * in cpu_info so we can switch to the kernel page tables 531 * (thank you, Intel), at which point we'll continue at the 532 * "SYSCALL_ENTRY" after Xsyscall below. 533 * In case the CPU speculates past the mov to cr3, we put a 534 * retpoline-style pause-lfence-jmp-to-pause loop. 535 */ 536 endbr64 537 swapgs 538 movq %rax,CPUVAR(SCRATCH) 539 movq CPUVAR(KERN_CR3),%rax 540 movq %rax,%cr3 5410: pause 542 lfence 543 jmp 0b 544KUTEXT_PAGE_END 545 546KTEXT_PAGE_START 547 .align NBPG, 0xcc 548GENTRY(Xsyscall_meltdown) 549 /* pad to match real Xsyscall_meltdown positioning above */ 550 movq CPUVAR(KERN_CR3),%rax 551 movq %rax,%cr3 552GENTRY(Xsyscall) 553 endbr64 554 swapgs 555 movq %rax,CPUVAR(SCRATCH) 556 SYSCALL_ENTRY /* create trapframe */ 557 sti 558 559 movq CPUVAR(CURPROC),%r14 560 movq %rsp,P_MD_REGS(%r14) # save pointer to frame 561 andl $~MDP_IRET,P_MD_FLAGS(%r14) 562 movq %rsp,%rdi 563 call syscall 564 565.Lsyscall_check_asts: 566 /* Check for ASTs on exit to user mode. */ 567 cli 568 CHECK_ASTPENDING(%r11) 569 je 2f 570 CLEAR_ASTPENDING(%r11) 571 sti 572 movq %rsp,%rdi 573 call ast 574 jmp .Lsyscall_check_asts 575 5762: 577#ifdef DIAGNOSTIC 578 cmpl $IPL_NONE,CPUVAR(ILEVEL) 579 jne .Lsyscall_spl_not_lowered 580#endif /* DIAGNOSTIC */ 581 582 /* Could registers have been changed that require an iretq? */ 583 testl $MDP_IRET, P_MD_FLAGS(%r14) 584 jne intr_user_exit_post_ast 585 586 /* Restore FPU/"extended CPU state" if it's not already in the CPU */ 587 testl $CPUPF_USERXSTATE,CPUVAR(PFLAGS) 588 jz .Lsyscall_restore_xstate 589 590 /* Restore FS.base if it's not already in the CPU */ 591 testl $CPUPF_USERSEGS,CPUVAR(PFLAGS) 592 jz .Lsyscall_restore_fsbase 593 594.Lsyscall_restore_registers: 595 /* 596 * If the pmap we're now on isn't the same as the one we 597 * were on last time we were in userspace, then use IBPB 598 * to prevent cross-process branch-target injection. 
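	 * The barrier itself is a wrmsr of PRED_CMD_IBPB to MSR_PRED_CMD;
	 * the whole block is patched to NOPs (CPTAG_IBPB_NOP) when the
	 * mitigation isn't required.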
599 */ 600 CODEPATCH_START 601 movq CPUVAR(PROC_PMAP),%rbx 602 cmpq CPUVAR(USER_PMAP),%rbx 603 je 1f 604 xorl %edx,%edx 605 movl $PRED_CMD_IBPB,%eax 606 movl $MSR_PRED_CMD,%ecx 607 wrmsr 608 movq %rbx,CPUVAR(USER_PMAP) 6091: 610 CODEPATCH_END(CPTAG_IBPB_NOP) 611 call pku_xonly 612 RET_STACK_REFILL_WITH_RCX 613 614 movq TF_R8(%rsp),%r8 615 movq TF_R9(%rsp),%r9 616 movq TF_R10(%rsp),%r10 617 movq TF_R12(%rsp),%r12 618 movq TF_R13(%rsp),%r13 619 movq TF_R14(%rsp),%r14 620 movq TF_R15(%rsp),%r15 621 movq TF_RBX(%rsp),%rbx 622 movq TF_RDX(%rsp),%rdx 623 624 CODEPATCH_START 625 xorl %edi,%edi 626 xorl %esi,%esi 627 xorl %r11d,%r11d 628 xorl %eax,%eax 629 xorl %ecx,%ecx 630 movw %ds,TF_R8(%rsp) 631 verw TF_R8(%rsp) 632 CODEPATCH_END(CPTAG_MDS) 633 634 movq TF_RDI(%rsp),%rdi 635 movq TF_RSI(%rsp),%rsi 636 movq TF_RBP(%rsp),%rbp 637 638 /* 639 * We need to finish reading from the trapframe, then switch 640 * to the user page tables, swapgs, and return. We need 641 * to get the final value for the register that was used 642 * for the mov to %cr3 from somewhere accessible on the 643 * user page tables, so save it in CPUVAR(SCRATCH) across 644 * the switch. 645 */ 646 movq TF_RAX(%rsp),%rax 647 movq TF_RIP(%rsp),%rcx 648 movq TF_RFLAGS(%rsp),%r11 649 movq TF_RSP(%rsp),%rsp 650 CODEPATCH_START 651 movq %rax,CPUVAR(SCRATCH) 652 movq CPUVAR(USER_CR3),%rax 653 PCID_SET_REUSE_NOP 654 movq %rax,%cr3 655Xsyscall_trampback: 6560: pause 657 lfence 658 jmp 0b 659 CODEPATCH_END(CPTAG_MELTDOWN_NOP) 660 swapgs 661 sysretq 662END(Xsyscall) 663END(Xsyscall_meltdown) 664KTEXT_PAGE_END 665 666KUTEXT_PAGE_START 667 .space (Xsyscall_trampback - Xsyscall_meltdown) - \ 668 (. - XUsyscall_meltdown), 0xcc 669 movq %rax,%cr3 670 movq CPUVAR(SCRATCH),%rax 671 swapgs 672 sysretq 673KUTEXT_PAGE_END 674 675 .text 676 _ALIGN_TRAPS 677 /* in this case, need FS.base but not xstate, rarely happens */ 678.Lsyscall_restore_fsbase: /* CPU doesn't have curproc's FS.base */ 679 orl $CPUPF_USERSEGS,CPUVAR(PFLAGS) 680 movq CPUVAR(CURPCB),%rdi 681 jmp .Lsyscall_restore_fsbase_real 682 683 _ALIGN_TRAPS 684.Lsyscall_restore_xstate: /* CPU doesn't have curproc's xstate */ 685 orl $(CPUPF_USERXSTATE|CPUPF_USERSEGS),CPUVAR(PFLAGS) 686 movq CPUVAR(CURPCB),%rdi 687 movq xsave_mask(%rip),%rdx 688 movl %edx,%eax 689 shrq $32,%rdx 690#if PCB_SAVEFPU != 0 691 addq $PCB_SAVEFPU,%rdi 692#endif 693 /* untouched state so can't fault */ 694 CODEPATCH_START 695 fxrstor64 (%rdi) 696 CODEPATCH_END(CPTAG_XRSTORS) 697#if PCB_SAVEFPU != 0 698 subq $PCB_SAVEFPU,%rdi 699#endif 700.Lsyscall_restore_fsbase_real: 701 movq PCB_FSBASE(%rdi),%rdx 702 movl %edx,%eax 703 shrq $32,%rdx 704 movl $MSR_FSBASE,%ecx 705 wrmsr 706 jmp .Lsyscall_restore_registers 707 708#ifdef DIAGNOSTIC 709.Lsyscall_spl_not_lowered: 710 leaq spl_lowered(%rip), %rdi 711 movl TF_ERR(%rsp),%esi /* syscall # stashed above */ 712 movl TF_RDI(%rsp),%edx 713 movl %ebx,%ecx 714 movl CPUVAR(ILEVEL),%r8d 715 xorq %rax,%rax 716 call printf 717#ifdef DDB 718 int $3 719#endif /* DDB */ 720 movl $IPL_NONE,CPUVAR(ILEVEL) 721 jmp .Lsyscall_check_asts 722 723 .section .rodata 724spl_lowered: 725 .asciz "WARNING: SPL NOT LOWERED ON SYSCALL %d %d EXIT %x %x\n" 726 .text 727#endif 728 729NENTRY(proc_trampoline) 730 call proc_trampoline_mi 731 movq %r13,%rdi 732 movq %r12,%rax 733 call retpoline_rax 734 movq CPUVAR(CURPROC),%r14 735 jmp .Lsyscall_check_asts 736END(proc_trampoline) 737 738 739/* 740 * Returning to userspace via iretq. 
We do things in this order: 741 * - check for ASTs 742 * - restore FPU/"extended CPU state" if it's not already in the CPU 743 * - DIAGNOSTIC: no more C calls after this, so check the SPL 744 * - restore FS.base if it's not already in the CPU 745 * - restore most registers 746 * - update the iret frame from the trapframe 747 * - finish reading from the trapframe 748 * - switch to the trampoline stack \ 749 * - jump to the .kutext segment |-- Meltdown workaround 750 * - switch to the user page tables / 751 * - swapgs 752 * - iretq 753 */ 754KTEXT_PAGE_START 755 _ALIGN_TRAPS 756GENTRY(intr_user_exit) 757#ifdef DIAGNOSTIC 758 pushfq 759 popq %rdx 760 testq $PSL_I,%rdx 761 jnz .Lintr_user_exit_not_blocked 762#endif /* DIAGNOSTIC */ 763 764 /* Check for ASTs */ 765 CHECK_ASTPENDING(%r11) 766 je intr_user_exit_post_ast 767 CLEAR_ASTPENDING(%r11) 768 sti 769 movq %rsp,%rdi 770 call ast 771 cli 772 jmp intr_user_exit 773 774intr_user_exit_post_ast: 775 /* Restore FPU/"extended CPU state" if it's not already in the CPU */ 776 testl $CPUPF_USERXSTATE,CPUVAR(PFLAGS) 777 jz .Lintr_restore_xstate 778 779 /* Restore FS.base if it's not already in the CPU */ 780 testl $CPUPF_USERSEGS,CPUVAR(PFLAGS) 781 jz .Lintr_restore_fsbase 782 783.Lintr_restore_registers: 784#ifdef DIAGNOSTIC 785 /* no more C calls after this, so check the SPL */ 786 cmpl $0,CPUVAR(ILEVEL) 787 jne .Luser_spl_not_lowered 788#endif /* DIAGNOSTIC */ 789 790 /* 791 * If the pmap we're now on isn't the same as the one we 792 * were on last time we were in userspace, then use IBPB 793 * to prevent cross-process branch-target injection. 794 */ 795 CODEPATCH_START 796 movq CPUVAR(PROC_PMAP),%rbx 797 cmpq CPUVAR(USER_PMAP),%rbx 798 je 1f 799 xorl %edx,%edx 800 movl $PRED_CMD_IBPB,%eax 801 movl $MSR_PRED_CMD,%ecx 802 wrmsr 803 movq %rbx,CPUVAR(USER_PMAP) 8041: 805 CODEPATCH_END(CPTAG_IBPB_NOP) 806 call pku_xonly 807 RET_STACK_REFILL_WITH_RCX 808 809 movq TF_R8(%rsp),%r8 810 movq TF_R9(%rsp),%r9 811 movq TF_R10(%rsp),%r10 812 movq TF_R12(%rsp),%r12 813 movq TF_R13(%rsp),%r13 814 movq TF_R14(%rsp),%r14 815 movq TF_R15(%rsp),%r15 816 movq TF_RBX(%rsp),%rbx 817 818 CODEPATCH_START 819 xorl %edi,%edi 820 xorl %esi,%esi 821 xorl %r11d,%r11d 822 xorl %eax,%eax 823 xorl %edx,%edx 824 xorl %ecx,%ecx 825 movw %ds,TF_R8(%rsp) 826 verw TF_R8(%rsp) 827 CODEPATCH_END(CPTAG_MDS) 828 829 movq TF_RDI(%rsp),%rdi 830 movq TF_RSI(%rsp),%rsi 831 movq TF_RBP(%rsp),%rbp 832 833 /* 834 * To get the final value for the register that was used 835 * for the mov to %cr3, we need access to somewhere accessible 836 * on the user page tables, so we save it in CPUVAR(SCRATCH) 837 * across the switch. 838 */ 839 /* update iret frame */ 840 movq CPUVAR(INTR_RSP),%rdx 841 movq $(GSEL(GUCODE_SEL,SEL_UPL)),IRETQ_CS(%rdx) 842 movq TF_RIP(%rsp),%rax 843 movq %rax,IRETQ_RIP(%rdx) 844 movq TF_RFLAGS(%rsp),%rax 845 movq %rax,IRETQ_RFLAGS(%rdx) 846 movq TF_RSP(%rsp),%rax 847 movq %rax,IRETQ_RSP(%rdx) 848 movq $(GSEL(GUDATA_SEL,SEL_UPL)),IRETQ_SS(%rdx) 849 /* finish with the trap frame */ 850 movq TF_RAX(%rsp),%rax 851 movq TF_RCX(%rsp),%rcx 852 movq TF_R11(%rsp),%r11 853 /* switch to the trampoline stack */ 854 xchgq %rdx,%rsp 855 movq TF_RDX(%rdx),%rdx 856 CODEPATCH_START 857 movq %rax,CPUVAR(SCRATCH) 858 movq CPUVAR(USER_CR3),%rax 859 PCID_SET_REUSE_NOP 860 movq %rax,%cr3 861Xiretq_trampback: 862KTEXT_PAGE_END 863/* the movq %cr3 switches to this "KUTEXT" page */ 864KUTEXT_PAGE_START 865 .space (Xiretq_trampback - Xsyscall_meltdown) - \ 866 (. 
- XUsyscall_meltdown), 0xcc 867 movq CPUVAR(SCRATCH),%rax 868.Liretq_swapgs: 869 swapgs 870doreti_iret_meltdown: 871 iretq 872KUTEXT_PAGE_END 873/* 874 * Back to the "KTEXT" page to fill in the speculation trap and the 875 * swapgs+iretq used for non-Meltdown kernels. This switching back 876 * and forth between segments is so that we can do the .space 877 * calculation below to guarantee the iretq's above and below line 878 * up, so the 'doreti_iret' label lines up with the iretq whether 879 * the CPU is affected by Meltdown or not. 880 */ 881KTEXT_PAGE_START 8820: pause 883 lfence 884 jmp 0b 885 .space (.Liretq_swapgs - XUsyscall_meltdown) - \ 886 (. - Xsyscall_meltdown), 0xcc 887 CODEPATCH_END(CPTAG_MELTDOWN_NOP) 888 swapgs 889 890 .globl doreti_iret 891doreti_iret: 892 iretq 893KTEXT_PAGE_END 894 895 .text 896 _ALIGN_TRAPS 897.Lintr_restore_xstate: /* CPU doesn't have curproc's xstate */ 898 orl $CPUPF_USERXSTATE,CPUVAR(PFLAGS) 899 movq CPUVAR(CURPCB),%rdi 900#if PCB_SAVEFPU != 0 901 addq $PCB_SAVEFPU,%rdi 902#endif 903 movq xsave_mask(%rip),%rdx 904 movl %edx,%eax 905 shrq $32, %rdx 906 CODEPATCH_START 907 fxrstor64 (%rdi) 908 CODEPATCH_END(CPTAG_XRSTORS) 909 //testl %eax,%eax 910 //jnz .Lintr_xrstor_faulted 911.Lintr_restore_fsbase: /* CPU doesn't have curproc's FS.base */ 912 orl $CPUPF_USERSEGS,CPUVAR(PFLAGS) 913 movq CPUVAR(CURPCB),%rdx 914 movq PCB_FSBASE(%rdx),%rdx 915 movl %edx,%eax 916 shrq $32,%rdx 917 movl $MSR_FSBASE,%ecx 918 wrmsr 919 jmp .Lintr_restore_registers 920 921.Lintr_xrstor_faulted: 922 /* 923 * xrstor faulted; we need to reset the FPU state and call trap() 924 * to post a signal, which requires interrupts be enabled. 925 */ 926 sti 927 movq proc0paddr(%rip),%rdi 928#if PCB_SAVEFPU != 0 929 addq $PCB_SAVEFPU,%rdi 930#endif 931 CODEPATCH_START 932 fxrstor64 (%rdi) 933 CODEPATCH_END(CPTAG_XRSTORS) 934 movq $T_PROTFLT,TF_TRAPNO(%rsp) 935 jmp recall_trap 936 937#ifdef DIAGNOSTIC 938.Lintr_user_exit_not_blocked: 939 movl warn_once(%rip),%edi 940 testl %edi,%edi 941 jnz 1f 942 incl %edi 943 movl %edi,warn_once(%rip) 944 leaq .Lnot_blocked(%rip),%rdi 945 call printf 946#ifdef DDB 947 int $3 948#endif /* DDB */ 9491: cli 950 jmp intr_user_exit 951 952.Luser_spl_not_lowered: 953 sti 954 leaq intr_spl_lowered(%rip),%rdi 955 movl CPUVAR(ILEVEL),%esi 956 xorl %edx,%edx /* always SPL zero for userspace */ 957 xorl %eax,%eax 958 call printf 959#ifdef DDB 960 int $3 961#endif /* DDB */ 962 movl $0,CPUVAR(ILEVEL) 963 cli 964 jmp intr_user_exit 965 966 .section .rodata 967intr_spl_lowered: 968 .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT %x %x\n" 969 .text 970#endif /* DIAGNOSTIC */ 971END(Xintr_user_exit) 972 973 974/* 975 * Return to supervisor mode from trap or interrupt 976 */ 977NENTRY(intr_fast_exit) 978#ifdef DIAGNOSTIC 979 pushfq 980 popq %rdx 981 testq $PSL_I,%rdx 982 jnz .Lintr_exit_not_blocked 983#endif /* DIAGNOSTIC */ 984 movq TF_RDI(%rsp),%rdi 985 movq TF_RSI(%rsp),%rsi 986 movq TF_R8(%rsp),%r8 987 movq TF_R9(%rsp),%r9 988 movq TF_R10(%rsp),%r10 989 movq TF_R12(%rsp),%r12 990 movq TF_R13(%rsp),%r13 991 movq TF_R14(%rsp),%r14 992 movq TF_R15(%rsp),%r15 993 movq TF_RBP(%rsp),%rbp 994 movq TF_RBX(%rsp),%rbx 995 movq TF_RDX(%rsp),%rdx 996 movq TF_RCX(%rsp),%rcx 997 movq TF_R11(%rsp),%r11 998 movq TF_RAX(%rsp),%rax 999 addq $TF_RIP,%rsp 1000 iretq 1001 1002#ifdef DIAGNOSTIC 1003.Lintr_exit_not_blocked: 1004 movl warn_once(%rip),%edi 1005 testl %edi,%edi 1006 jnz 1f 1007 incl %edi 1008 movl %edi,warn_once(%rip) 1009 leaq .Lnot_blocked(%rip),%rdi 1010 call printf 
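	/* warned once; optionally enter ddb, then re-block interrupts and retry */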
1011#ifdef DDB 1012 int $3 1013#endif /* DDB */ 10141: cli 1015 jmp intr_fast_exit 1016 1017 .data 1018.global warn_once 1019warn_once: 1020 .long 0 1021 .section .rodata 1022.Lnot_blocked: 1023 .asciz "WARNING: INTERRUPTS NOT BLOCKED ON INTERRUPT RETURN: 0x%x 0x%x\n" 1024 .text 1025#endif 1026END(intr_fast_exit) 1027 1028/* 1029 * FPU/"extended CPU state" handling 1030 * void xrstor_kern(sfp, mask) 1031 * using first of xrstors/xrstor/fxrstor, load given state 1032 * which is assumed to be trusted: i.e., unaltered from 1033 * xsaves/xsaveopt/xsave/fxsave by kernel 1034 * int xrstor_user(sfp, mask) 1035 * using first of xrstor/fxrstor, load given state which might 1036 * not be trustable: #GP faults will be caught; returns 0/1 if 1037 * okay/it trapped. 1038 * void fpusave(sfp) 1039 * save current state, but retain it in the FPU 1040 * void fpusavereset(sfp) 1041 * save current state and reset FPU to initial/kernel state 1042 * int xsetbv_user(reg, mask) 1043 * load specified %xcr# register, returns 0/1 if okay/it trapped 1044 */ 1045 1046ENTRY(xrstor_kern) 1047 RETGUARD_SETUP(xrstor_kern, r11) 1048 movq %rsi, %rdx 1049 movl %esi, %eax 1050 shrq $32, %rdx 1051 CODEPATCH_START 1052 fxrstor64 (%rdi) 1053 CODEPATCH_END(CPTAG_XRSTORS) 1054 RETGUARD_CHECK(xrstor_kern, r11) 1055 ret 1056 lfence 1057END(xrstor_kern) 1058 1059ENTRY(xrstor_user) 1060 RETGUARD_SETUP(xrstor_user, r11) 1061 movq %rsi, %rdx 1062 movl %esi, %eax 1063 shrq $32, %rdx 1064 .globl xrstor_fault 1065xrstor_fault: 1066 CODEPATCH_START 1067 fxrstor64 (%rdi) 1068 CODEPATCH_END(CPTAG_XRSTOR) 1069 xorl %eax, %eax 1070 RETGUARD_CHECK(xrstor_user, r11) 1071 ret 1072 lfence 1073NENTRY(xrstor_resume) 1074 movl $1, %eax 1075 RETGUARD_CHECK(xrstor_user, r11) 1076 ret 1077 lfence 1078END(xrstor_user) 1079 1080ENTRY(fpusave) 1081 RETGUARD_SETUP(fpusave, r11) 1082 movq xsave_mask(%rip),%rdx 1083 movl %edx,%eax 1084 shrq $32,%rdx 1085 CODEPATCH_START 1086 fxsave64 (%rdi) 1087 CODEPATCH_END(CPTAG_XSAVE) 1088 RETGUARD_CHECK(fpusave, r11) 1089 ret 1090 lfence 1091END(fpusave) 1092 1093ENTRY(fpusavereset) 1094 RETGUARD_SETUP(fpusavereset, r11) 1095 movq xsave_mask(%rip),%rdx 1096 movl %edx,%eax 1097 shrq $32,%rdx 1098 CODEPATCH_START 1099 fxsave64 (%rdi) 1100 CODEPATCH_END(CPTAG_XSAVE) 1101 movq proc0paddr(%rip),%rdi 1102#if PCB_SAVEFPU != 0 1103 addq $PCB_SAVEFPU,%rdi 1104#endif 1105 CODEPATCH_START 1106 fxrstor64 (%rdi) 1107 CODEPATCH_END(CPTAG_XRSTORS) 1108 RETGUARD_CHECK(fpusavereset, r11) 1109 ret 1110 lfence 1111END(fpusavereset) 1112 1113ENTRY(xsetbv_user) 1114 RETGUARD_SETUP(xsetbv_user, r11) 1115 movl %edi, %ecx 1116 movq %rsi, %rdx 1117 movl %esi, %eax 1118 shrq $32, %rdx 1119 .globl xsetbv_fault 1120xsetbv_fault: 1121 xsetbv 1122 xorl %eax, %eax 1123 RETGUARD_CHECK(xsetbv_user, r11) 1124 ret 1125 lfence 1126NENTRY(xsetbv_resume) 1127 movl $1, %eax 1128 RETGUARD_CHECK(xsetbv_user, r11) 1129 ret 1130 lfence 1131END(xsetbv_user) 1132 1133CODEPATCH_CODE(_xrstor, xrstor64 (%rdi)) 1134CODEPATCH_CODE(_xrstors, xrstors64 (%rdi)) 1135CODEPATCH_CODE(_xsave, xsave64 (%rdi)) 1136CODEPATCH_CODE(_xsaves, xsaves64 (%rdi)) 1137CODEPATCH_CODE(_xsaveopt, xsaveopt64 (%rdi)) 1138CODEPATCH_CODE(_pcid_set_reuse, 1139 orl $(CR3_REUSE_PCID >> 32),CPUVAR(USER_CR3 + 4)) 1140CODEPATCH_CODE_LEN(_jmprax, jmp *%rax; int3) 1141CODEPATCH_CODE_LEN(_jmpr11, jmp *%r11; int3) 1142CODEPATCH_CODE_LEN(_jmpr13, jmp *%r13; int3) 1143 1144ENTRY(pagezero) 1145 RETGUARD_SETUP(pagezero, r11) 1146 movq $-PAGE_SIZE,%rdx 1147 subq %rdx,%rdi 1148 xorq %rax,%rax 11491: 1150 movnti 
%rax,(%rdi,%rdx) 1151 movnti %rax,8(%rdi,%rdx) 1152 movnti %rax,16(%rdi,%rdx) 1153 movnti %rax,24(%rdi,%rdx) 1154 addq $32,%rdx 1155 jne 1b 1156 sfence 1157 RETGUARD_CHECK(pagezero, r11) 1158 ret 1159 lfence 1160END(pagezero) 1161 1162/* void pku_xonly(void) */ 1163ENTRY(pku_xonly) 1164 movq pg_xo,%rax /* have PKU support? */ 1165 cmpq $0,%rax 1166 je 1f 1167 movl $0,%ecx /* force PKRU for xonly restriction */ 1168 movl $0,%edx 1169 movl $PGK_VALUE,%eax /* key0 normal, key1 is exec without read */ 1170 wrpkru 11711: ret 1172 lfence 1173END(pku_xonly) 1174 1175/* int rdmsr_safe(u_int msr, uint64_t *data) */ 1176ENTRY(rdmsr_safe) 1177 RETGUARD_SETUP(rdmsr_safe, r10) 1178 1179 movl %edi, %ecx /* u_int msr */ 1180 .globl rdmsr_safe_fault 1181rdmsr_safe_fault: 1182 rdmsr 1183 salq $32, %rdx 1184 movl %eax, %eax 1185 orq %rdx, %rax 1186 movq %rax, (%rsi) /* *data */ 1187 xorq %rax, %rax 1188 1189 RETGUARD_CHECK(rdmsr_safe, r10) 1190 ret 1191 lfence 1192 1193NENTRY(rdmsr_resume) 1194 movl $0x1, %eax 1195 RETGUARD_CHECK(rdmsr_safe, r10) 1196 ret 1197 lfence 1198END(rdmsr_safe) 1199 1200#if NHYPERV > 0 1201/* uint64_t hv_hypercall_trampoline(uint64_t control, paddr_t input, paddr_t output) */ 1202NENTRY(hv_hypercall_trampoline) 1203 endbr64 1204 mov %rdx, %r8 1205 mov %rsi, %rdx 1206 mov %rdi, %rcx 1207 jmp hv_hypercall_page 1208END(hv_hypercall_trampoline) 1209 /* Hypercall page needs to be page aligned */ 1210 .text 1211 .align NBPG, 0xcc 1212 .globl hv_hypercall_page 1213hv_hypercall_page: 1214 .skip 0x1000, 0xcc 1215#endif /* NHYPERV > 0 */ 1216 1217#if NXEN > 0 1218 /* Hypercall page needs to be page aligned */ 1219 .text 1220 .align NBPG, 0xcc 1221 .globl xen_hypercall_page 1222xen_hypercall_page: 1223 .skip 0x1000, 0xcc 1224#endif /* NXEN > 0 */ 1225
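/*
 * The hypercall pages above (when configured) start out as int3 filler
 * (0xcc); the hypervisor writes the real call stubs into them once the
 * guest registers each page's physical address through the hypervisor's
 * MSR interface.
 */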