// (C) Copyright Giovanni P. Deretta 2005.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

/*
 * Stack-switching primitives for x86-64, GNU as / AT&T syntax.
 *
 * Register interface (identical for every routine in this file):
 *   RAX = &from.sp   address where the outgoing stack pointer is stored
 *   RDX = to.sp      stack pointer of the context being resumed
 * Note that these are not the System V integer argument registers
 * (rdi, rsi, ...); the routines expect their caller to load RAX/RDX
 * explicitly.
 *
 * What a switch does:
 *   1. push rbp, rbx, rsi, rdi on the outgoing stack,
 *   2. store the resulting stack pointer through RAX,
 *   3. load the incoming stack pointer from RDX,
 *   4. pop rdi, rsi, rbx, rbp from the incoming stack,
 *   5. continue at the resume address stored just above them.
 *
 * Frame found at a saved stack pointer `sp` (all slots are 8 bytes):
 *    0(sp)  rdi
 *    8(sp)  rsi
 *   16(sp)  rbx
 *   24(sp)  rbp
 *   32(sp)  resume address (the return address of the call that
 *           entered the swap routine)
 *
 * RAX is never modified, so it is simply passed to the code that gets
 * resumed.  The first time a context runs, RAX is the first parameter
 * of the trampoline; otherwise it is simply discarded.
 *
 * Clobbers: flags; additionally RCX in the jmp-based variants.
 * NOTE(review): r12-r15 are not saved by these routines.  If callers
 *   follow the System V AMD64 ABI, those registers are callee-saved
 *   and must be preserved (or declared clobbered) at the call site -
 *   confirm against the code that invokes these routines.
 *
 * Performance notes carried over from the original IA32 version:
 *   - The biggest penalty is the final jump, which will always be
 *     mispredicted (~50 cycles on P4).  The resume address is loaded
 *     first thing to make it available as early as possible.
 *   - Doing a `ret` instead of
 *         add $8, %rsp
 *         jmp *%rcx
 *     really kills performance.
 *   - pop is slightly better than mov+add to restore registers, and
 *     so is push rather than mov+sub.
 */

        /* Bytes occupied by the four saved registers (4 * 8). */
        .set    SAVED_REGS_SIZE, 32
        /* Bytes occupied by the resume address above them. */
        .set    RET_ADDR_SIZE, 8

/*
 * SWAPCONTEXT_JMP name
 *
 * Emits one global, 16-byte aligned swap routine called `name` that
 * finishes with an indirect jmp.  Every expansion is a separate copy
 * of the code and therefore owns a separate jmp site.
 */
        .macro  SWAPCONTEXT_JMP name
        .align  16
        .globl  \name
        .type   \name, @function
\name:
        /* Fetch the resume address early; it sits above the four
           saved registers of the incoming frame. */
        movq    SAVED_REGS_SIZE(%rdx), %rcx

        /* Save the outgoing context on its own stack. */
        pushq   %rbp
        pushq   %rbx
        pushq   %rsi
        pushq   %rdi
        movq    %rsp, (%rax)            /* from.sp = rsp */

        /* Switch stacks and restore the incoming context. */
        movq    %rdx, %rsp              /* rsp = to.sp */
        popq    %rdi
        popq    %rsi
        popq    %rbx
        popq    %rbp

        /* Drop the resume-address slot (already held in rcx), leaving
           rsp exactly where a `ret` would have left it. */
        addq    $RET_ADDR_SIZE, %rsp
        jmp     *%rcx
        ud2                             /* never reached */
        .size   \name, .-\name
        .endm

        .text

/*
 * swapcontext_stack: the simplest jmp-based swap; intended for
 * invocations (entering a context).
 */
        SWAPCONTEXT_JMP swapcontext_stack

/*
 * swapcontext_stack2 is exactly the same as swapcontext_stack, but
 * while swapcontext_stack should be used for invocations, this one
 * should be used for yielding.  That way there are two jmp sites
 * that, in the common invoke+yield case, each always jump to the same
 * target and can be predicted (this is very important on P4).
 * This optimization gives a 50% performance bonus on a plain
 * 'invoke and yield' test.
 * NOTE: both subroutines work even if they are used in the wrong
 * place.
 */
        SWAPCONTEXT_JMP swapcontext_stack2

/*
 * swapcontext_stack3: a third copy of the same routine with its own
 * jmp site.
 */
        SWAPCONTEXT_JMP swapcontext_stack3

/*
 * swapcontext_stack_orig: same register interface and frame layout as
 * the routines above, but resumes with a plain `ret`, which pops the
 * resume address directly off the incoming stack.  RCX is not touched.
 */
        .align  16
        .globl  swapcontext_stack_orig
        .type   swapcontext_stack_orig, @function
swapcontext_stack_orig:
        /* Save the outgoing context on its own stack. */
        pushq   %rbp
        pushq   %rbx
        pushq   %rsi
        pushq   %rdi
        movq    %rsp, (%rax)            /* from.sp = rsp */

        /* Switch stacks and restore the incoming context. */
        movq    %rdx, %rsp              /* rsp = to.sp */
        popq    %rdi
        popq    %rsi
        popq    %rbx
        popq    %rbp
        ret                             /* resume address is now on top */
        .size   swapcontext_stack_orig, .-swapcontext_stack_orig