xref: /reactos/sdk/lib/crt/math/libm_sse2/expf.asm (revision 4afb647c)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25; expf.asm
26;
27; An implementation of the expf libm function.
28;
29; Prototype:
30;
31;     float expf(float x);
32;
33
34;
35;   Algorithm:
36;       Similar to one presnted in exp.asm
37;
38; If FMA3 hardware is available, an FMA3 implementation of expf will be used.
39
40
41.const
42ALIGN 16
43
44__real_inf                      DD 7f800000h
45                                DD 0
46                                DQ 0
47
48__real_ninf                     DD 0ff800000h
49                                DD 0
50                                DQ 0
51
52__real_qnanbit                  DD 00400000h
53                                DD 0
54                                DQ 0
55
56__real_zero                     DD 00000000h
57                                DD 0
58                                DQ 0
59
60__real_p8192                    DQ 40c0000000000000h
61                                DQ 0
62__real_m9600                    DQ 0c0c2c00000000000h
63                                DQ 0
64
65__real_64_by_log2               DQ 40571547652b82feh ; 64/ln(2)
66                                DQ 0
67__real_log2_by_64               DQ 3f862e42fefa39efh ; log2_by_64
68                                DQ 0
69
70__real_1_by_6                   DQ 3fc5555555555555h ; 1/6
71                                DQ 0
72__real_1_by_2                   DQ 3fe0000000000000h ; 1/2
73                                DQ 0
74
75; these codes and the ones in the corresponding .c file have to match
76__flag_x_nan            DD 00000001
77__flag_y_zero           DD 00000002
78__flag_y_inf            DD 00000003
79
80EXTRN __two_to_jby64_table:QWORD
81EXTRN __use_fma3_lib:DWORD
82
83fname           TEXTEQU <expf>
84fname_special   TEXTEQU <_expf_special>
85
86; define local variable storage offsets
87
88; make room for fname_special to save things
89dummy_space     EQU    020h
90stack_size      EQU    038h
91
92include fm.inc
93
94; external function
95EXTERN fname_special:PROC
96
97.code
98
99ALIGN 16
100PUBLIC fname
101fname PROC FRAME
102    StackAllocate stack_size
103    .ENDPROLOG
104
105    ; Do this to avoid possible exceptions from a NaN argument.
106    movd        edx, xmm0
107    btr         edx,31
108    cmp         edx, DWORD PTR __real_inf
109    jge         Lexpf_x_is_inf_or_nan
110
111    cmp          DWORD PTR __use_fma3_lib, 0
112    jne          Lexpf_fma3
113
114Lexpf_sse2:
115
116    cvtss2sd    xmm0, xmm0
117
118    ; x * (64/ln(2))
119    movsd       xmm3, QWORD PTR __real_64_by_log2
120    mulsd       xmm3, xmm0
121
122    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
123    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
124    comisd      xmm3, QWORD PTR __real_p8192
125    jae         Lexpf_y_is_inf
126
127    comisd      xmm3, QWORD PTR __real_m9600
128    jb          Lexpf_y_is_zero
129
130    ; n = int( x * (64/ln(2)) )
131    cvtpd2dq    xmm4, xmm3
132    lea         r10, __two_to_jby64_table
133    cvtdq2pd    xmm1, xmm4
134
135    ; r = x - n * ln(2)/64
136    movsd       xmm2, QWORD PTR __real_log2_by_64
137    mulsd       xmm2, xmm1
138    movd        ecx, xmm4
139    mov         rax, 3fh
140    and         eax, ecx
141    subsd       xmm0, xmm2
142    movsd       xmm1, xmm0
143
144    ; m = (n - j) / 64
145    sub         ecx, eax
146    sar         ecx, 6
147
148    ; q
149    movsd       xmm3, QWORD PTR __real_1_by_6
150    mulsd       xmm3, xmm0
151    mulsd       xmm0, xmm0
152    addsd       xmm3, QWORD PTR __real_1_by_2
153    mulsd       xmm0, xmm3
154    addsd       xmm0, xmm1
155
156    add         rcx, 1023
157    shl         rcx, 52
158
159    ; (f)*(1+q)
160    movsd       xmm2, QWORD PTR [r10+rax*8]
161    mulsd       xmm0, xmm2
162    addsd       xmm0, xmm2
163
164    movd        xmm1, rcx
165    mulsd       xmm0, xmm1
166    cvtsd2ss    xmm0, xmm0
167
168Lexpf_final_check:
169    StackDeallocate stack_size
170    ret
171
172ALIGN 16
173Lexpf_y_is_zero:
174
175    movss       xmm1, DWORD PTR __real_zero
176    movd        xmm0, edx
177    mov         r8d, DWORD PTR __flag_y_zero
178
179    call        fname_special
180    jmp         Lexpf_finish
181
182ALIGN 16
183Lexpf_y_is_inf:
184
185    movss       xmm1, DWORD PTR __real_inf
186    movd        xmm0, edx
187    mov         r8d, DWORD PTR __flag_y_inf
188
189    call        fname_special
190    jmp         Lexpf_finish
191
192ALIGN 16
193Lexpf_x_is_inf_or_nan:
194
195    cmp         edx, DWORD PTR __real_inf
196    je          Lexpf_finish
197
198    cmp         edx, DWORD PTR __real_ninf
199    je          Lexpf_process_zero
200
201    or          edx, DWORD PTR __real_qnanbit
202    movd        xmm1, edx
203    mov         r8d, DWORD PTR __flag_x_nan
204    call        fname_special
205    jmp         Lexpf_finish
206
207ALIGN 16
208Lexpf_process_zero:
209    movss       xmm0, DWORD PTR __real_zero
210    jmp         Lexpf_final_check
211
212ALIGN 16
213Lexpf_finish:
214    StackDeallocate stack_size
215    ret
216
217
218ALIGN 16
219Lexpf_fma3:
220
221    vcvtss2sd    xmm0, xmm0, xmm0
222
223    ; x * (64/ln(2))
224    vmulsd      xmm3, xmm0, QWORD PTR __real_64_by_log2
225
226    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
227    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
228    vcomisd     xmm3, QWORD PTR __real_p8192
229    jae         Lexpf_fma3_y_is_inf
230
231    vucomisd    xmm3, QWORD PTR __real_m9600
232    jb          Lexpf_fma3_y_is_zero
233
234    ; n = int( x * (64/ln(2)) )
235    vcvtpd2dq   xmm4, xmm3
236    lea         r10, __two_to_jby64_table
237    vcvtdq2pd   xmm1, xmm4
238
239    ; r = x - n * ln(2)/64
240    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
241    vmovd        ecx, xmm4
242    mov          rax, 3fh
243    and          eax, ecx
244    vmovapd      xmm1, xmm0               ; xmm1 <-- copy of r
245
246    ; m = (n - j) / 64
247    sub          ecx, eax
248    sar          ecx, 6
249
250    ; q
251    vmovsd       xmm3, QWORD PTR __real_1_by_6
252    vmulsd       xmm0, xmm0, xmm0         ; xmm0 <-- r^2
253    vfmadd213sd  xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2
254    vfmadd213sd  xmm0, xmm3, xmm1         ; xmm0 <-- q = r^2*(r/6 + 1/2) + r
255
256    add         rcx, 1023
257    shl         rcx, 52
258
259    ; (f)*(1+q)
260    vmovsd       xmm2, QWORD PTR [r10+rax*8]
261    vfmadd213sd  xmm0, xmm2, xmm2
262
263    vmovq        xmm2,rcx
264    vmulsd       xmm0, xmm0, xmm2
265    vcvtsd2ss    xmm0, xmm0, xmm0
266
267Lexpf_fma3_final_check:
268    StackDeallocate stack_size
269    ret
270
271ALIGN 16
272Lexpf_fma3_y_is_zero:
273
274    vmovss       xmm1, DWORD PTR __real_zero
275    vmovd        xmm0, edx
276    mov          r8d, DWORD PTR __flag_y_zero
277
278    call         fname_special
279    jmp          Lexpf_fma3_finish
280
281ALIGN 16
282Lexpf_fma3_y_is_inf:
283
284    vmovss       xmm1, DWORD PTR __real_inf
285    vmovd        xmm0, edx
286    mov          r8d, DWORD PTR __flag_y_inf
287
288    call         fname_special
289    jmp          Lexpf_fma3_finish
290
291ALIGN 16
292Lexpf_fma3_process_zero:
293    vmovss       xmm0, DWORD PTR __real_zero
294    jmp          Lexpf_fma3_final_check
295
296ALIGN 16
297Lexpf_fma3_finish:
298    StackDeallocate stack_size
299    ret
300
301fname endp
302
303END
304