1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; exp.asm 26; 27; An implementation of the exp libm function. 28; 29; Prototype: 30; 31; double exp(double x); 32; 33 34; 35; Algorithm: 36; 37; e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 38; 39; x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 40; n = 64*m + j, 0 <= j < 64 41; 42; e^x = 2^((64*m + j + f)/64) 43; = (2^m) * (2^(j/64)) * 2^(f/64) 44; = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 45; 46; f = x*(64/ln(2)) - n 47; r = f*(ln(2)/64) = x - n*(ln(2)/64) 48; 49; e^x = (2^m) * (2^(j/64)) * e^r 50; 51; (2^(j/64)) is precomputed 52; 53; e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5! 54; e^r = 1 + q 55; 56; q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5! 57; 58 59.const 60ALIGN 16 61; these codes and the ones in the corresponding .c file have to match 62__flag_x_nan DD 00000001 63__flag_y_zero DD 00000002 64__flag_y_inf DD 00000003 65 66ALIGN 16 67 68L__real_1_by_720 DQ 03f56c16c16c16c17h 69 DQ 03f56c16c16c16c17h ; 1/720 70L__real_1_by_120 DQ 03f81111111111111h 71 DQ 03f81111111111111h ; 1/120 72L__real_1_by_6 DQ 03fc5555555555555h 73 DQ 03fc5555555555555h ; 1/6 74L__real_1_by_2 DQ 03fe0000000000000h 75 DQ 03fe0000000000000h ; 1/2 76L__real_1_by_24 DQ 03fa5555555555555h 77 DQ 03fa5555555555555h ; 1/24 78 79ALIGN 16 80L__log2_by_64_mtail_mhead DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h 81L__ln_of_smallest_normal DQ 0C086232BDD7ABCD2h 82L__zero DQ 00000000000000000h 83L__max_exp_arg DQ 040862e42fefa39efh ; 709.78271289338397 84L__denormal_tiny_threshold DQ 0c0874046dfefd9d0h ; -744.03460681327306 85L__min_exp_arg DQ 0c0874910d52d3051h ; -745.13321910194111 86L__real_64_by_log2 DQ 040571547652b82feh ; 64/ln(2) 87L__positive_infinity DQ 07ff0000000000000h 88L__negative_infinity DQ 0fff0000000000000h 89L__real_qnanbit DQ 0008000000000000h ; qnan set bit 90L__real_x_near0_threshold DQ 3c00000000000000h 91L__log2_by_64_mhead DQ 0bf862e42fefa0000h 92L__log2_by_64_mtail DQ 0bd1cf79abc9e3b39h 93L__real_smallest_denormal DQ 00000000000000001h 94L__real_one DQ 03ff0000000000000h 95L__2_to_neg_26 DQ 03E50000000000000h ; 2^-26 96L__min_normal DQ 00010000000000000h ; smallest normal 97 98 99EXTRN __two_to_jby64_table:QWORD 100EXTRN __two_to_jby64_head_table:QWORD 101EXTRN __two_to_jby64_tail_table:QWORD 102EXTRN __use_fma3_lib:DWORD 103 104; make room for fname_special to save things 105dummy_space EQU 020h 106stack_size EQU 038h 107 108include fm.inc 109 110fname TEXTEQU <exp> 111fname_special TEXTEQU <_exp_special> 112 113;Define name and any external functions being called 114EXTERN fname_special : PROC 115 116.code 117PUBLIC fname 118fname PROC FRAME 119 StackAllocate stack_size 120 .ENDPROLOG 121 122 ; We need to avoid unwanted exceptions from a NaN argument. 123 ; It could be argued that a signaling NaN should raise an exception, 124 ; but the existing library doesn't. At any rate, the comparison operations 125 ; don't seem to like quiet NaN either, so... 126 movd rdx, xmm0 127 btr rdx, 63 128 cmp rdx, L__positive_infinity 129 jge Lexp_x_is_nan_or_inf 130 131 cmp DWORD PTR __use_fma3_lib, 0 132 jne Lexp_fma3 133 134 movapd xmm2, xmm0 135 movapd xmm3, xmm0 136 137 ; Some hardware has problems with too many branches in a single 138 ; 16- or 32-byte window, so let's peel off the common case into 139 ; a single branch. 140 cmplesd xmm2, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive 141 cmpnltsd xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative 142 andps xmm2, xmm3 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise 143 ucomisd xmm2, xmm2 ; note that FFF... is NaN, so this comparison should set PF for in-range x 144 jp Lexp_y_is_finite 145 146 ucomisd xmm0, L__max_exp_arg 147 ja Lexp_y_is_inf 148 ; Since we peeled off the cases with normal result, 149 ; there is only one possibility remaining: 150 jmp Lexp_y_is_denormal_or_zero 151 152ALIGN 16 153Lexp_y_is_finite: 154 ; x * (64/ln(2)) 155 movapd xmm1, xmm0 156 btr rdx, 63 ; rdx <-- |x| 157 cmp rdx, L__2_to_neg_26 158 jbe Lexp_return_1_plus_x 159 mulsd xmm1, L__real_64_by_log2 160 161 ; n = int( x * (64/ln(2)) ) 162 cvttpd2dq xmm2, xmm1 ; xmm2 = (int)n 163 cvtdq2pd xmm1, xmm2 ; xmm1 = (double)n 164 movd ecx, xmm2 165 movapd xmm2, xmm1 166 167 ; r1 = x - n * ln(2)/64 head 168 mulsd xmm1, L__log2_by_64_mhead 169 170 ; j = n & 0x3f 171 mov rax, 03fh 172 and eax, ecx ; eax = j 173 ; m = (n - j) / 64 174 sar ecx, 6 ; ecx = m 175 176 177 ; r2 = - n * ln(2)/64 tail 178 mulsd xmm2, L__log2_by_64_mtail 179 addsd xmm0, xmm1 ; xmm0 = r1 180 181 ; r1+r2 182 addsd xmm2, xmm0 ; xmm2 = r 183 184 ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720 185 ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720))))) 186 movapd xmm3, L__real_1_by_720 ; xmm3 = 1/720 187 mulsd xmm3, xmm2 ; xmm3 = r*1/720 188 movapd xmm0, L__real_1_by_6 ; xmm0 = 1/6 189 movapd xmm1, xmm2 ; xmm1 = r 190 mulsd xmm0, xmm2 ; xmm0 = r*1/6 191 addsd xmm3, L__real_1_by_120 ; xmm3 = 1/120 + (r*1/720) 192 mulsd xmm1, xmm2 ; xmm1 = r*r 193 addsd xmm0, L__real_1_by_2 ; xmm0 = 1/2 + (r*1/6) 194 movapd xmm4, xmm1 ; xmm4 = r*r 195 mulsd xmm4, xmm1 ; xmm4 = (r*r) * (r*r) 196 mulsd xmm3, xmm2 ; xmm3 = r * (1/120 + (r*1/720)) 197 mulsd xmm0, xmm1 ; xmm0 = (r*r)*(1/2 + (r*1/6)) 198 addsd xmm3, L__real_1_by_24 ; xmm3 = 1/24 + (r * (1/120 + (r*1/720))) 199 addsd xmm0, xmm2 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) 200 mulsd xmm3, xmm4 ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720)))) 201 addsd xmm0, xmm3 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720)))) 202 203 ;(f)*(q) + f2 + f1 204 cmp ecx, 0fffffc02h ; -1022 205 lea rdx, __two_to_jby64_table 206 lea r11, __two_to_jby64_tail_table 207 lea r10, __two_to_jby64_head_table 208 mulsd xmm0, QWORD PTR [rdx+rax * 8 ] 209 addsd xmm0, QWORD PTR [r11+rax * 8 ] 210 addsd xmm0, QWORD PTR [r10+rax * 8 ] 211 212 jle Lexp_process_denormal 213Lexp_process_normal: 214 shl rcx, 52 215 movd xmm2, rcx 216 paddq xmm0, xmm2 217 StackDeallocate stack_size 218 ret 219 220ALIGN 16 221Lexp_process_denormal: 222 jl Lexp_process_true_denormal 223 ucomisd xmm0, L__real_one 224 jae Lexp_process_normal 225Lexp_process_true_denormal: 226 ; here ( e^r < 1 and m = -1022 ) or m <= -1023 227 add ecx, 1074 228 mov rax, 1 229 shl rax, cl 230 movd xmm2, rax 231 mulsd xmm0, xmm2 232 jmp Lexp_finish 233 234Lexp_y_is_one: 235 movsd xmm0, L__real_one 236 jmp Lexp_finish 237 238ALIGN 16 239Lexp_x_is_nan_or_inf: 240 movd rax, xmm0 241 cmp rax, L__positive_infinity 242 je Lexp_finish 243 cmp rax, L__negative_infinity 244 je Lexp_return_zero_without_exception 245 or rax, L__real_qnanbit 246 movd xmm1, rax 247 mov r8d, __flag_x_nan 248 call fname_special 249 jmp Lexp_finish 250 251ALIGN 16 252Lexp_y_is_inf: 253 mov rax, 07ff0000000000000h 254 movd xmm1, rax 255 mov r8d, __flag_y_inf 256 call fname_special 257 jmp Lexp_finish 258 259ALIGN 16 260Lexp_y_is_denormal_or_zero: 261 ucomisd xmm0, L__min_exp_arg 262 jbe Lexp_y_is_zero 263 movapd xmm0, L__real_smallest_denormal 264 jmp Lexp_finish 265 266ALIGN 16 267Lexp_y_is_zero: 268 pxor xmm1, xmm1 269 mov r8d, __flag_y_zero 270 call fname_special 271 jmp Lexp_finish 272 273ALIGN 16 274Lexp_return_1_plus_x: 275 cmp rdx, L__min_normal 276 jbe Lexp_return_1_plus_eps 277 addsd xmm0, L__real_one 278 StackDeallocate stack_size 279 ret 0 280 281; Some hardware really does not like subnormals. Try to avoid them. 282ALIGN 16 283Lexp_return_1_plus_eps: 284 movsd xmm0, L__real_one 285 addsd xmm0, L__min_normal ; make sure inexact is set 286 StackDeallocate stack_size 287 ret 0 288 289ALIGN 16 290Lexp_return_zero_without_exception: 291 pxor xmm0,xmm0 292 StackDeallocate stack_size 293 ret 0 294 295 296ALIGN 16 297Lexp_finish: 298 StackDeallocate stack_size 299 ret 0 300 301ALIGN 16 302Lexp_fma3: 303 ; Some hardware has problems with too many branches in a single 304 ; 16- or 32-byte window, so let's peel off the common case into 305 ; a single branch. 306 vcmplesd xmm2, xmm0, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive 307 vcmpnltsd xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative 308 vandps xmm2, xmm3, xmm2 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise 309 vucomisd xmm2, xmm2 ; note that FFF... is NaN, so this comparison should set PF for in-range x 310 jp Lexp_fma3_y_is_finite 311 312 vucomisd xmm0,L__max_exp_arg 313 ja Lexp_fma3_y_is_inf 314 ; Since we peeled off the cases with normal result, 315 ; there is only one possibility remaining: 316 jmp Lexp_fma3_y_is_zero 317 318; vpsllq xmm1, xmm0, 1 319; vpsrlq xmm1, xmm1, 1 320; vucomisd xmm1, L__real_x_near0_threshold ; 2^-63 321; jb Lexp_fma3_y_is_one 322 323ALIGN 16 324Lexp_fma3_y_is_finite: 325 vmovq rdx, xmm0 326 btr rdx, 63 ; rdx <-- |x| 327 cmp rdx, L__2_to_neg_26 328 jbe Lexp_fma3_return_1_plus_x 329 330 ; x * (64/ln(2)) 331 vmulsd xmm1,xmm0,L__real_64_by_log2 332 333 ; n = int( x * (64/ln(2)) ) 334 vcvttpd2dq xmm2,xmm1 ;xmm2 = (int)n 335 vcvtdq2pd xmm1,xmm2 ;xmm1 = (double)n ;can use round 336 vmovd ecx,xmm2 337 338 ; r1 = x - n * ln(2)/64 head 339 ; r2 = - n * ln(2)/64 tail 340 ; r = r1+r2 341 vmovlhps xmm1,xmm1,xmm1 ;xmm1 = (double (double)n,)n 342 vmovq xmm0,xmm0 ;xmm0 = 0,x ;zero out the upper part 343 vfmadd132pd xmm1,xmm0,L__log2_by_64_mtail_mhead 344 vhaddpd xmm2,xmm1,xmm1 ;xmm2 = r,r 345 346 ;j = n & 03fh 347 mov rax,03fh 348 and eax,ecx ;eax = j 349 ; m = (n - j) / 64 350 sar ecx,6 ;ecx = m 351 352 ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720 353 ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720))))) 354 vmovapd xmm3,L__real_1_by_720 355 vfmadd213sd xmm3,xmm2,L__real_1_by_120 356 vfmadd213sd xmm3,xmm2,L__real_1_by_24 357 vfmadd213sd xmm3,xmm2,L__real_1_by_6 358 vfmadd213sd xmm3,xmm2,L__real_1_by_2 359 vmulsd xmm0,xmm2,xmm2 360 vfmadd213sd xmm0,xmm3,xmm2 361 362 ; (f)*(q) + f2 + f1 363 cmp ecx,0fffffc02h ; -1022 364 lea rdx,__two_to_jby64_table 365 lea r11,__two_to_jby64_tail_table 366 lea r10,__two_to_jby64_head_table 367 vmulsd xmm2,xmm0,QWORD PTR[rdx + rax * 8] 368 vaddsd xmm1,xmm2,QWORD PTR[r11 + rax * 8] 369 vaddsd xmm0,xmm1,QWORD PTR[r10 + rax * 8] 370 371 jle Lexp_fma3_process_denormal 372Lexp_fma3_process_normal: 373 shl rcx,52 374 vmovq xmm2,rcx 375 vpaddq xmm0,xmm0,xmm2 376 StackDeallocate stack_size 377 ret 378 379ALIGN 16 380Lexp_fma3_process_denormal: 381 jl Lexp_fma3_process_true_denormal 382 vucomisd xmm0,L__real_one 383 jae Lexp_fma3_process_normal 384Lexp_fma3_process_true_denormal: 385 ; here ( e^r < 1 and m = -1022 ) or m <= -1023 386 add ecx,1074 387 mov rax,1 388 shl rax,cl 389 vmovq xmm2,rax 390 vmulsd xmm0,xmm0,xmm2 391 jmp Lexp_fma3_finish 392 393Lexp_fma3_y_is_one: 394 vmovsd xmm0, L__real_one 395 jmp Lexp_fma3_finish 396 397 398ALIGN 16 399Lexp_fma3_y_is_inf: 400 mov rax,07ff0000000000000h 401 vmovq xmm1,rax 402 mov r8d,__flag_y_inf 403 call fname_special 404 jmp Lexp_fma3_finish 405 406ALIGN 16 407Lexp_fma3_return_1_plus_x: 408 cmp rdx, L__min_normal 409 jbe Lexp_fma3_return_1_plus_eps 410 vaddsd xmm0, xmm0, L__real_one 411 StackDeallocate stack_size 412 ret 0 413 414; Some hardware really does not like subnormals. Try to avoid them. 415ALIGN 16 416Lexp_fma3_return_1_plus_eps: 417 vmovsd xmm0, L__real_one 418 vaddsd xmm0, xmm0, L__min_normal ; make sure inexact is set 419 StackDeallocate stack_size 420 ret 0 421 422ALIGN 16 423Lexp_fma3_y_is_zero: 424 vpxor xmm1,xmm1,xmm1 425 mov r8d,__flag_y_zero 426 call fname_special 427 jmp Lexp_fma3_finish 428 429ALIGN 16 430Lexp_fma3_return_zero_without_exception: 431 vpxor xmm0,xmm0,xmm0 432 433ALIGN 16 434Lexp_fma3_finish: 435 StackDeallocate stack_size 436 ret 437 438fname endp 439END 440