//  (C) Copyright Giovanni P. Deretta 2005.
//  Distributed under the Boost Software License, Version 1.0.
//  (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

/*
    RAX is &from.sp
    RDX is to.sp
    This is the simplest version of swapcontext.
    It saves the registers on the old stack,
    saves the old stack pointer,
    loads the new stack pointer,
    pops the registers from the new stack
    and returns to the new caller.
    RAX is simply passed to the function it returns to.
    The first time, RAX is the first parameter of the trampoline.
    Otherwise it is simply discarded.
    NOTE: This function should work on any x86-64 CPU.
    NOTE: The biggest penalty is the last jump, which
    is always mispredicted (~50 cycles on P4).
    We try to make its target address available as soon as possible
    to reduce the penalty. Doing a ret instead of
    'add $8, %rsp'
    'jmp *%rcx'
    really kills performance.
    NOTE: popq is slightly better than mov+add for popping the
    registers, and likewise pushq rather than mov+sub.
    */
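/*
    A minimal C sketch of the stack layout this routine expects, under
    the assumptions implied by the code below (the helper name
    make_initial_stack is illustrative, not part of the library): RAX
    holds the address where the old RSP is stored, RDX holds the new
    stack pointer, and the new stack carries, from RDX upwards, the
    initial RDI, RSI, RBX, RBP and, at offset 32, the address that is
    loaded into RCX and jumped to.

        #include <stddef.h>

        // Prepare a fresh stack so that the first switch into it jumps to
        // 'entry_point' (e.g. a trampoline).  'stack_top' points one past
        // the highest usable, suitably aligned slot.  The returned pointer
        // is the value to pass as to.sp (i.e. in RDX).
        static void** make_initial_stack(void** stack_top, void* entry_point)
        {
            void** sp = stack_top;
            *--sp = entry_point;  // +32: loaded into RCX, then jumped to
            *--sp = NULL;         // +24: initial RBP
            *--sp = NULL;         // +16: initial RBX
            *--sp = NULL;         // +8:  initial RSI
            *--sp = NULL;         // +0:  initial RDI
            return sp;            // becomes to.sp
        }
*/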

    .text
    .align 16
    .globl swapcontext_stack
    .type swapcontext_stack, @function

swapcontext_stack:
    movq  32(%rdx), %rcx   // fetch the jump target early (same slot skipped below)
    pushq %rbp
    pushq %rbx
    pushq %rsi
    pushq %rdi
    movq  %rsp, (%rax)
    movq  %rdx, %rsp
    popq  %rdi
    popq  %rsi
    popq  %rbx
    popq  %rbp
    add   $8, %rsp         // skip the 8-byte target slot instead of popping it
//  popq %rcx
    jmp   *%rcx
    ud2

/*
   This is exactly the same as swapcontext_stack,
   but while swapcontext_stack should be used
   for invocations, this one should be used for yielding.
   There are thus two 'jmp' sites, and in the common
   invoke+yield case each of them always jumps to the same
   target and can be predicted (this is very important on P4).
   This optimization gives a 50% performance bonus on a plain
   'invoke and yield' test.
   NOTE: these subroutines work even if they are used in the
   wrong place.
  */
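/*
    A sketch of the intended pairing, under stated assumptions (the
    'context' struct, the SWAP macro and the wrapper names are
    illustrative, not part of this file): invocations always go through
    swapcontext_stack and yields always through swapcontext_stack2, so
    each 'jmp *%rcx' keeps a single, predictable target in the steady
    invoke/yield loop.  The routines take &from.sp in RAX and to.sp in
    RDX rather than the usual C argument registers, hence the
    extended-asm call.  The sketch assumes -mno-red-zone (the callee
    pushes below RSP) and ignores the FP/SSE state; R12-R15 are listed
    as clobbers because the routines do not preserve them.

        typedef struct { void* sp; } context;

        #define SWAP(routine, from, to)                              \
            do {                                                     \
                void* rax_ = &(from)->sp;                            \
                void* rdx_ = (to)->sp;                               \
                __asm__ volatile ("call " #routine                   \
                                  : "+a" (rax_), "+d" (rdx_)         \
                                  :                                  \
                                  : "rcx", "r8", "r9", "r10", "r11", \
                                    "r12", "r13", "r14", "r15",      \
                                    "memory", "cc");                 \
            } while (0)

        // Invocation site: caller -> coroutine.
        static inline void invoke(context* caller, context* coro)
        { SWAP(swapcontext_stack, caller, coro); }

        // Yield site: coroutine -> caller.
        static inline void yield(context* coro, context* caller)
        { SWAP(swapcontext_stack2, coro, caller); }
*/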

    .align 16
    .globl swapcontext_stack2
    .type swapcontext_stack2, @function
swapcontext_stack2:
    movq  32(%rdx), %rcx
    pushq %rbp
    pushq %rbx
    pushq %rsi
    pushq %rdi
    movq  %rsp, (%rax)
    movq  %rdx, %rsp
    popq  %rdi
    popq  %rsi
    popq  %rbx
    popq  %rbp
    add   $8, %rsp
//  popq %rcx
    jmp   *%rcx
    ud2

/*
   A third, identical entry point, presumably giving yet another call
   path its own predictable 'jmp' site.
 */
    .align 16
    .globl swapcontext_stack3
    .type swapcontext_stack3, @function
swapcontext_stack3:
    movq  32(%rdx), %rcx
    pushq %rbp
    pushq %rbx
    pushq %rsi
    pushq %rdi
    movq  %rsp, (%rax)
    movq  %rdx, %rsp
    popq  %rdi
    popq  %rsi
    popq  %rbx
    popq  %rbp
    add   $8, %rsp
//  popq  %rcx
    jmp   *%rcx
    ud2

/*
   Original variant: instead of loading the jump target early and
   doing 'add + jmp', it returns with a plain 'ret' (see the note
   above about the mispredicted indirect branch).
 */
    .align 16
    .globl swapcontext_stack_orig
    .type swapcontext_stack_orig, @function
swapcontext_stack_orig:
    pushq %rbp
    pushq %rbx
    pushq %rsi
    pushq %rdi
    movq  %rsp, (%rax)
    movq  %rdx, %rsp
    popq  %rdi
    popq  %rsi
    popq  %rbx
    popq  %rbp
    ret