1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; expf.asm 26; 27; An implementation of the expf libm function. 28; 29; Prototype: 30; 31; float expf(float x); 32; 33 34; 35; Algorithm: 36; Similar to one presnted in exp.asm 37; 38; If FMA3 hardware is available, an FMA3 implementation of expf will be used. 39 40 41.const 42ALIGN 16 43 44__real_inf DD 7f800000h 45 DD 0 46 DQ 0 47 48__real_ninf DD 0ff800000h 49 DD 0 50 DQ 0 51 52__real_qnanbit DD 00400000h 53 DD 0 54 DQ 0 55 56__real_zero DD 00000000h 57 DD 0 58 DQ 0 59 60__real_p8192 DQ 40c0000000000000h 61 DQ 0 62__real_m9600 DQ 0c0c2c00000000000h 63 DQ 0 64 65__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2) 66 DQ 0 67__real_log2_by_64 DQ 3f862e42fefa39efh ; log2_by_64 68 DQ 0 69 70__real_1_by_6 DQ 3fc5555555555555h ; 1/6 71 DQ 0 72__real_1_by_2 DQ 3fe0000000000000h ; 1/2 73 DQ 0 74 75; these codes and the ones in the corresponding .c file have to match 76__flag_x_nan DD 00000001 77__flag_y_zero DD 00000002 78__flag_y_inf DD 00000003 79 80EXTRN __two_to_jby64_table:QWORD 81EXTRN __use_fma3_lib:DWORD 82 83fname TEXTEQU <expf> 84fname_special TEXTEQU <_expf_special> 85 86; define local variable storage offsets 87 88; make room for fname_special to save things 89dummy_space EQU 020h 90stack_size EQU 038h 91 92include fm.inc 93 94; external function 95EXTERN fname_special:PROC 96 97.code 98 99ALIGN 16 100PUBLIC fname 101fname PROC FRAME 102 StackAllocate stack_size 103 .ENDPROLOG 104 105 ; Do this to avoid possible exceptions from a NaN argument. 106 movd edx, xmm0 107 btr edx,31 108 cmp edx, DWORD PTR __real_inf 109 jge Lexpf_x_is_inf_or_nan 110 111 cmp DWORD PTR __use_fma3_lib, 0 112 jne Lexpf_fma3 113 114Lexpf_sse2: 115 116 cvtss2sd xmm0, xmm0 117 118 ; x * (64/ln(2)) 119 movsd xmm3, QWORD PTR __real_64_by_log2 120 mulsd xmm3, xmm0 121 122 ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128 123 ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150) 124 comisd xmm3, QWORD PTR __real_p8192 125 jae Lexpf_y_is_inf 126 127 comisd xmm3, QWORD PTR __real_m9600 128 jb Lexpf_y_is_zero 129 130 ; n = int( x * (64/ln(2)) ) 131 cvtpd2dq xmm4, xmm3 132 lea r10, __two_to_jby64_table 133 cvtdq2pd xmm1, xmm4 134 135 ; r = x - n * ln(2)/64 136 movsd xmm2, QWORD PTR __real_log2_by_64 137 mulsd xmm2, xmm1 138 movd ecx, xmm4 139 mov rax, 3fh 140 and eax, ecx 141 subsd xmm0, xmm2 142 movsd xmm1, xmm0 143 144 ; m = (n - j) / 64 145 sub ecx, eax 146 sar ecx, 6 147 148 ; q 149 movsd xmm3, QWORD PTR __real_1_by_6 150 mulsd xmm3, xmm0 151 mulsd xmm0, xmm0 152 addsd xmm3, QWORD PTR __real_1_by_2 153 mulsd xmm0, xmm3 154 addsd xmm0, xmm1 155 156 add rcx, 1023 157 shl rcx, 52 158 159 ; (f)*(1+q) 160 movsd xmm2, QWORD PTR [r10+rax*8] 161 mulsd xmm0, xmm2 162 addsd xmm0, xmm2 163 164 movd xmm1, rcx 165 mulsd xmm0, xmm1 166 cvtsd2ss xmm0, xmm0 167 168Lexpf_final_check: 169 StackDeallocate stack_size 170 ret 171 172ALIGN 16 173Lexpf_y_is_zero: 174 175 movss xmm1, DWORD PTR __real_zero 176 movd xmm0, edx 177 mov r8d, DWORD PTR __flag_y_zero 178 179 call fname_special 180 jmp Lexpf_finish 181 182ALIGN 16 183Lexpf_y_is_inf: 184 185 movss xmm1, DWORD PTR __real_inf 186 movd xmm0, edx 187 mov r8d, DWORD PTR __flag_y_inf 188 189 call fname_special 190 jmp Lexpf_finish 191 192ALIGN 16 193Lexpf_x_is_inf_or_nan: 194 195 cmp edx, DWORD PTR __real_inf 196 je Lexpf_finish 197 198 cmp edx, DWORD PTR __real_ninf 199 je Lexpf_process_zero 200 201 or edx, DWORD PTR __real_qnanbit 202 movd xmm1, edx 203 mov r8d, DWORD PTR __flag_x_nan 204 call fname_special 205 jmp Lexpf_finish 206 207ALIGN 16 208Lexpf_process_zero: 209 movss xmm0, DWORD PTR __real_zero 210 jmp Lexpf_final_check 211 212ALIGN 16 213Lexpf_finish: 214 StackDeallocate stack_size 215 ret 216 217 218ALIGN 16 219Lexpf_fma3: 220 221 vcvtss2sd xmm0, xmm0, xmm0 222 223 ; x * (64/ln(2)) 224 vmulsd xmm3, xmm0, QWORD PTR __real_64_by_log2 225 226 ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128 227 ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150) 228 vcomisd xmm3, QWORD PTR __real_p8192 229 jae Lexpf_fma3_y_is_inf 230 231 vucomisd xmm3, QWORD PTR __real_m9600 232 jb Lexpf_fma3_y_is_zero 233 234 ; n = int( x * (64/ln(2)) ) 235 vcvtpd2dq xmm4, xmm3 236 lea r10, __two_to_jby64_table 237 vcvtdq2pd xmm1, xmm4 238 239 ; r = x - n * ln(2)/64 240 vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64 241 vmovd ecx, xmm4 242 mov rax, 3fh 243 and eax, ecx 244 vmovapd xmm1, xmm0 ; xmm1 <-- copy of r 245 246 ; m = (n - j) / 64 247 sub ecx, eax 248 sar ecx, 6 249 250 ; q 251 vmovsd xmm3, QWORD PTR __real_1_by_6 252 vmulsd xmm0, xmm0, xmm0 ; xmm0 <-- r^2 253 vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2 254 vfmadd213sd xmm0, xmm3, xmm1 ; xmm0 <-- q = r^2*(r/6 + 1/2) + r 255 256 add rcx, 1023 257 shl rcx, 52 258 259 ; (f)*(1+q) 260 vmovsd xmm2, QWORD PTR [r10+rax*8] 261 vfmadd213sd xmm0, xmm2, xmm2 262 263 vmovq xmm2,rcx 264 vmulsd xmm0, xmm0, xmm2 265 vcvtsd2ss xmm0, xmm0, xmm0 266 267Lexpf_fma3_final_check: 268 StackDeallocate stack_size 269 ret 270 271ALIGN 16 272Lexpf_fma3_y_is_zero: 273 274 vmovss xmm1, DWORD PTR __real_zero 275 vmovd xmm0, edx 276 mov r8d, DWORD PTR __flag_y_zero 277 278 call fname_special 279 jmp Lexpf_fma3_finish 280 281ALIGN 16 282Lexpf_fma3_y_is_inf: 283 284 vmovss xmm1, DWORD PTR __real_inf 285 vmovd xmm0, edx 286 mov r8d, DWORD PTR __flag_y_inf 287 288 call fname_special 289 jmp Lexpf_fma3_finish 290 291ALIGN 16 292Lexpf_fma3_process_zero: 293 vmovss xmm0, DWORD PTR __real_zero 294 jmp Lexpf_fma3_final_check 295 296ALIGN 16 297Lexpf_fma3_finish: 298 StackDeallocate stack_size 299 ret 300 301fname endp 302 303END 304