1; 2; 3; MIT License 4; ----------- 5; 6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 7; 8; Permission is hereby granted, free of charge, to any person obtaining a copy 9; of this Software and associated documentaon files (the "Software"), to deal 10; in the Software without restriction, including without limitation the rights 11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12; copies of the Software, and to permit persons to whom the Software is 13; furnished to do so, subject to the following conditions: 14; 15; The above copyright notice and this permission notice shall be included in 16; all copies or substantial portions of the Software. 17; 18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24; THE SOFTWARE. 25; 26; An implementation of the tanf function using the fma3 instruction. 27; 28; Prototype: 29; 30; float tanf(float x); 31; 32; Computes tanf(x). 33; It will provide proper C99 return values, 34; but may not raise floating point status bits properly. 35; Based on the NAG C implementation. 36; 37.const 38ALIGN 16 39L_sign_mask DQ 07FFFFFFFFFFFFFFFh 40 DQ 07FFFFFFFFFFFFFFFh 41L_twobypi DQ 03FE45F306DC9C883h 42 DQ 03FE45F306DC9C883h 43L_int_three DQ 00000000000000003h 44 DQ 00000000000000003h 45L_int_one DQ 00000000000000001h 46 DQ 00000000000000001h 47L_signbit DQ 08000000000000000h 48 DQ 08000000000000000h 49 50L_tanf DQ 03FD8A8B0DA56CB17h ; c0 51 DQ 0BF919DBA6EFD6AADh ; c1 52 DQ 03FF27E84A3E73A2Eh ; d0 53 DQ 0BFE07266D7B3511Bh ; d1 54 DQ 03F92E29003C692D9h ; d2 55 56L_large_x_sse2 DQ 04160000000000000h ; 8388608. 57L_large_x_fma3 DQ 041E921FB40000000h ; 3.373259264e9 58L_point_333 DQ 03FD5555555555555h 59L_mask_3e4 DQ 03e40000000000000h 60L_mask_3f2 DQ 03f20000000000000h 61L_point_five DQ 03FE0000000000000h 62L_piby2_1 DQ 03FF921FB54400000h 63L_piby2_1tail DQ 03DD0B4611A626331h 64L_piby2_lead DQ 03ff921fb54442d18h 65L_n_one DQ 0BFF0000000000000h 66L_piby4 DQ 03fe921fb54442d18h 67L_min_norm DQ 00010000000000000h 68 69 70L_inf_mask_32 DD 07F800000h 71 DD 07F800000h 72 73EXTRN __use_fma3_lib:DWORD 74EXTRN __L_2_by_pi_bits:BYTE 75 76fname TEXTEQU <tanf> 77fname_special TEXTEQU <_tanf_special> 78 79; define local variable storage offsets 80; actually there aren't any, but we need to leave room for _tanf_special. 81dummy_space EQU 20h 82stack_size EQU 38h 83 84include fm.inc 85 86;Define name and any external functions being called 87EXTERN fname_special : PROC 88 89.code 90PUBLIC fname 91fname PROC FRAME 92 StackAllocate stack_size 93 .ENDPROLOG 94 cmp DWORD PTR __use_fma3_lib, 0 95 jne Ltanf_fma3 96 97Ltanf_sse2: 98 movd eax,xmm0 99 mov r8d,L_inf_mask_32 100 and eax,r8d 101 cmp eax, r8d 102 jz Ltanf_sse2_naninf 103 104 cvtss2sd xmm5,xmm0 105 movd r9,xmm5 106 btr r9,63 ; r9 <-- |x| 107 108 cmp r9,L_piby4 109 jg Ltanf_sse2_range_reduce 110 cmp r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125 111 jge Ltanf_sse2_compute_tanf_piby_4 112 cmp r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009 113 jge Ltanf_sse2_compute_x_xxx_0_333 114 ; At this point tan(x) ~= x; if it's not exact, set the inexact flag. 115 116 test r9, r9 117 je Ltanf_sse2_exact_return 118 movsd xmm1, L_n_one 119 addsd xmm1, L_min_norm ; set inexact 120 121Ltanf_sse2_exact_return: 122 StackDeallocate stack_size 123 ret 124 125ALIGN 16 126Ltanf_sse2_compute_x_xxx_0_333: 127 movapd xmm2,xmm5 128 mulsd xmm2,xmm2 ; xmm2 <-- x^2 129 movapd xmm0,xmm2 130 mulsd xmm0,xmm5 ; xmm0 <-- x^3 131 mulsd xmm0,L_point_333 132 addsd xmm0,xmm5 ; x + x*x*x*0.3333333333333333; 133 jmp Ltanf_sse2_return_s 134 135ALIGN 16 136Ltanf_sse2_compute_tanf_piby_4: 137 movapd xmm0,xmm5 ; xmm0 <-- x (as double) 138 139 movapd xmm1,xmm0 140 mulsd xmm1,xmm0 ; xmm1 <-- x*x 141 142 movsd xmm3,L_tanf+008h ; xmm3 <-- c1 143 mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 144 addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 145 146 movsd xmm2,L_tanf+020h ; xmm2 <-- d2 147 mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 148 addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 149 mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 150 addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 151 divsd xmm3,xmm2 ; xmm3 <-- c/d 152 mulsd xmm1,xmm0 ; xmm1 <-- x^3 153 mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d 154 addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d 155 jmp Ltanf_sse2_return_s 156 157Ltanf_sse2_range_reduce: 158 movd xmm0,r9 159 cmp r9,L_large_x_sse2 160 jge Ltanf_sse2_tanf_reduce_large 161 162Ltanf_sse2_tanf_reduce_moderate: 163 movapd xmm1,xmm0 164 andpd xmm1,L_sign_mask 165 movapd xmm2,L_twobypi 166 mulsd xmm2,xmm1 167 addsd xmm2,L_point_five 168 cvttpd2dq xmm4,xmm2 169 cvtdq2pd xmm1,xmm4 170 andpd xmm4,L_int_three ; xmm4 <-- region 171 movapd xmm2,xmm0 172 173 movapd xmm3,xmm1 174 mulsd xmm1,L_piby2_1 175 subsd xmm2,xmm1 176 mulsd xmm3,L_piby2_1tail ; xmm3 rtail 177 movapd xmm0,xmm2 178 subsd xmm0,xmm3 179 subsd xmm2,xmm0 180 movapd xmm1,xmm2 181 subsd xmm1,xmm3 182 jmp Ltanf_sse2_exit_s 183 184Ltanf_sse2_tanf_reduce_large: 185 lea r9,__L_2_by_pi_bits 186 ;xexp = (x >> 52) 1023 187 movd r11,xmm0 188 mov rcx,r11 189 shr r11,52 190 sub r11,1023 ; r11 <-- xexp = exponent of input x 191 ;calculate the last byte from which to start multiplication 192 ;last = 134 (xexp >> 3) 193 mov r10,r11 194 shr r10,3 195 sub r10,134 ; r10 <-- -last 196 neg r10 ; r10 <-- last 197 ;load 64 bits of 2_by_pi 198 mov rax,[r9+r10] 199 ;mantissa of x = ((x << 12) >> 12) | implied bit 200 shl rcx,12 201 shr rcx,12 ; rcx <-- mantissa part of input x 202 bts rcx,52 ; add the implied bit as well 203 ;load next 128 bits of 2_by_pi 204 add r10,8 ; increment to next 8 bytes of 2_by_pi 205 movdqu xmm0,[r9+r10] 206 ;do three 64bit multiplications with mant of x 207 mul rcx 208 mov r8,rax ; r8 = last 64 bits of mul = res1[2] 209 mov r10,rdx ; r10 = carry 210 vmovq rax,xmm0 211 mul rcx 212 ;resexp = xexp & 7 213 and r11,7 ; r11 <-- resexp = last 3 bits of xexp 214 psrldq xmm0,8 215 add rax,r10 ; add the previous carry 216 adc rdx,0 217 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] 218 mov r10,rdx ; r10 <-- carry 219 movd rax,xmm0 220 mul rcx 221 add r10,rax ;r10 = most sig 64 bits = res1[0] 222 ;find the region 223 ;last three bits ltb = most sig bits >> (54 resexp)) 224 ; decimal point in last 18 bits == 8 lsb's in first 64 bits 225 ; and 8 msb's in next 64 bits 226 ;point_five = ltb & 01h; 227 ;region = ((ltb >> 1) + point_five) & 3; 228 mov rcx,54 229 mov rax,r10 230 sub rcx,r11 231 xor rdx,rdx ;rdx = sign of x 232 shr rax,cl 233 jnc Ltanf_sse2_no_point_five_f 234 ;;if there is carry.. then negate the result of multiplication 235 not r10 236 not r9 237 not r8 238 mov rdx,08000000000000000h 239ALIGN 16 240Ltanf_sse2_no_point_five_f: 241 adc rax,0 242 and rax,3 243 movd xmm4,eax ; xmm4 <-- region 244 ;calculate the number of integer bits and zero them out 245 mov rcx,r11 246 add rcx,10 ; rcx = no. of integer bits 247 shl r10,cl 248 shr r10,cl ; r10 contains only mant bits 249 sub rcx,64 ; form the exponent 250 mov r11,rcx 251 ;find the highest set bit 252 bsr rcx,r10 253 jnz Ltanf_sse2_form_mantissa_f 254 mov r10,r9 255 mov r9,r8 256 mov r8,0 257 bsr rcx,r10 ;rcx = hsb 258 sub r11,64 259ALIGN 16 260Ltanf_sse2_form_mantissa_f: 261 add r11,rcx ; for exp of x 262 sub rcx,52 ; rcx = no. of bits to shift in r10 263 cmp rcx,0 264 jl Ltanf_sse2_hsb_below_52_f 265 je Ltanf_sse2_form_numbers_f 266 ;hsb above 52 267 mov r8,r10 268 shr r10,cl ; r10 = mantissa of x with hsb at 52 269 shr r9,cl ; make space for bits from r10 270 sub rcx,64 271 neg rcx ; rcx = no of bits to shift r10 272 shl r8,cl 273 or r9,r8 ; r9 = mantissa bits of xx 274 jmp Ltanf_sse2_form_numbers_f 275 276ALIGN 16 277Ltanf_sse2_hsb_below_52_f: 278 neg rcx 279 mov rax,r9 280 shl r10,cl 281 shl r9,cl 282 sub rcx,64 283 neg rcx 284 shr rax,cl 285 or r10,rax 286 shr r8,cl 287 or r9,r8 288ALIGN 16 289Ltanf_sse2_form_numbers_f: 290 add r11,1023 291 btr r10,52 ; remove the implied bit 292 mov rcx,r11 293 or r10,rdx ; put the sign 294 shl rcx,52 295 or r10,rcx ; x is in r10 296 movd xmm0,r10 ; xmm0 <-- x 297 mulsd xmm0,L_piby2_lead 298 299Ltanf_sse2_exit_s: 300 movd eax,xmm4 301 and eax,1 ; eax <-- region & 1 302 movapd xmm1,xmm0 303 mulsd xmm1,xmm0 ; xmm1 <-- x*x 304 305 movsd xmm3,L_tanf+008h ; xmm3 <-- c1 306 mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 307 addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 308 309 movsd xmm2,L_tanf+020h ; xmm2 <-- d2 310 mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 311 addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 312 mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 313 addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 314 divsd xmm3,xmm2 ; xmm3 <-- c/d 315 mulsd xmm1,xmm0 ; xmm1 <-- x^3 316 mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d 317 addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d 318 cmp eax,01h 319 jne Ltanf_sse2_exit_tanpiby4 320Ltanf_sse2_recip : 321 movd xmm3,L_n_one 322 divsd xmm3,xmm0 323 movsd xmm0,xmm3 324Ltanf_sse2_exit_tanpiby4 : 325 andpd xmm5,L_signbit 326 xorpd xmm0,xmm5 327 328Ltanf_sse2_return_s: 329 cvtsd2ss xmm0,xmm0 330Ltanf_sse2_return_c: 331 StackDeallocate stack_size 332 ret 333 334Ltanf_sse2_naninf: 335 call fname_special 336 StackDeallocate stack_size 337 ret 338 339ALIGN 16 340Ltanf_fma3: 341 vmovd eax,xmm0 342 mov r8d,L_inf_mask_32 343 and eax,r8d 344 cmp eax, r8d 345 jz Ltanf_fma3_naninf 346 347 vcvtss2sd xmm5,xmm0,xmm0 348 vmovq r9,xmm5 349 btr r9,63 ; r9 <-- |x| 350 351 cmp r9,L_piby4 352 jg Ltanf_fma3_range_reduce 353 cmp r9,L_mask_3f2 354 jge Ltanf_fma3_compute_tanf_piby_4 355 cmp r9,L_mask_3e4 356 jge Ltanf_fma3_compute_x_xxx_0_333 357 jmp Ltanf_fma3_return_c 358 359Ltanf_fma3_compute_x_xxx_0_333: 360 vmulsd xmm2,xmm5,xmm5 361 vmulsd xmm0,xmm2,xmm5 362 vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333; 363 jmp Ltanf_fma3_return_s 364 365Ltanf_fma3_compute_tanf_piby_4: 366 vmovsd xmm0,xmm5,xmm5 367 vmulsd xmm1,xmm0,xmm0 368 vmovsd xmm3,L_tanf+008h 369 vfmadd213sd xmm3,xmm1,L_tanf 370 vmovsd xmm2,L_tanf+020h 371 vfmadd213sd xmm2,xmm1,L_tanf+018h 372 vfmadd213sd xmm2,xmm1,L_tanf+010h 373 vdivsd xmm3,xmm3,xmm2 374 vmulsd xmm1,xmm1,xmm0 375 vfmadd231sd xmm0,xmm1,xmm3 376 jmp Ltanf_fma3_return_s 377 378Ltanf_fma3_range_reduce: 379 vmovq xmm0,r9 380 cmp r9,L_large_x_fma3 381 jge Ltanf_fma3_tanf_reduce_large 382 383Ltanf_fma3_tanf_reduce_moderate: 384 vandpd xmm1,xmm0,L_sign_mask 385 vmovapd xmm2,L_twobypi 386 vfmadd213sd xmm2,xmm1,L_point_five 387 vcvttpd2dq xmm2,xmm2 388 vpmovsxdq xmm1,xmm2 389 vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region 390 vshufps xmm1 ,xmm1,xmm1,8 391 vcvtdq2pd xmm1,xmm1 392 vmovdqa xmm2,xmm0 393 vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead 394 vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail 395 vsubsd xmm0,xmm2,xmm3 396 vsubsd xmm2,xmm2,xmm0 397 vsubsd xmm1,xmm2,xmm3 398 jmp Ltanf_fma3_exit_s 399 400Ltanf_fma3_tanf_reduce_large: 401 lea r9,__L_2_by_pi_bits 402 ;xexp = (x >> 52) 1023 403 vmovq r11,xmm0 404 mov rcx,r11 405 shr r11,52 406 sub r11,1023 ; r11 <-- xexp = exponent of input x 407 ;calculate the last byte from which to start multiplication 408 ;last = 134 (xexp >> 3) 409 mov r10,r11 410 shr r10,3 411 sub r10,134 ; r10 <-- -last 412 neg r10 ; r10 <-- last 413 ;load 64 bits of 2_by_pi 414 mov rax,[r9+r10] 415 ;mantissa of x = ((x << 12) >> 12) | implied bit 416 shl rcx,12 417 shr rcx,12 ; rcx <-- mantissa part of input x 418 bts rcx,52 ; add the implied bit as well 419 ;load next 128 bits of 2_by_pi 420 add r10,8 ; increment to next 8 bytes of 2_by_pi 421 vmovdqu xmm0,XMMWORD PTR[r9+r10] 422 ;do three 64bit multiplications with mant of x 423 mul rcx 424 mov r8,rax ; r8 = last 64 bits of mul = res1[2] 425 mov r10,rdx ; r10 = carry 426 vmovq rax,xmm0 427 mul rcx 428 ;resexp = xexp & 7 429 and r11,7 ; r11 <-- resexp = last 3 bits of xexp 430 vpsrldq xmm0,xmm0,8 431 add rax,r10 ; add the previous carry 432 adc rdx,0 433 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] 434 mov r10,rdx ; r10 <-- carry 435 vmovq rax,xmm0 436 mul rcx 437 add r10,rax ;r10 = most sig 64 bits = res1[0] 438 ;find the region 439 ;last three bits ltb = most sig bits >> (54 resexp)) 440 ; decimal point in last 18 bits == 8 lsb's in first 64 bits 441 ; and 8 msb's in next 64 bits 442 ;point_five = ltb & 01h; 443 ;region = ((ltb >> 1) + point_five) & 3; 444 mov rcx,54 445 mov rax,r10 446 sub rcx,r11 447 xor rdx,rdx ;rdx = sign of x 448 shr rax,cl 449 jnc Ltanf_fma3_no_point_five_f 450 ;;if there is carry.. then negate the result of multiplication 451 not r10 452 not r9 453 not r8 454 mov rdx,08000000000000000h 455ALIGN 16 456Ltanf_fma3_no_point_five_f: 457 adc rax,0 458 and rax,3 459 vmovd xmm4,eax ; xmm4 <-- region 460 ;calculate the number of integer bits and zero them out 461 mov rcx,r11 462 add rcx,10 ; rcx = no. of integer bits 463 shl r10,cl 464 shr r10,cl ; r10 contains only mant bits 465 sub rcx,64 ; form the exponent 466 mov r11,rcx 467 ;find the highest set bit 468 bsr rcx,r10 469 jnz Ltanf_fma3_form_mantissa_f 470 mov r10,r9 471 mov r9,r8 472 mov r8,0 473 bsr rcx,r10 ;rcx = hsb 474 sub r11,64 475ALIGN 16 476Ltanf_fma3_form_mantissa_f: 477 add r11,rcx ; for exp of x 478 sub rcx,52 ; rcx = no. of bits to shift in r10 479 cmp rcx,0 480 jl Ltanf_fma3_hsb_below_52_f 481 je Ltanf_fma3_form_numbers_f 482 ;hsb above 52 483 mov r8,r10 484 shr r10,cl ; r10 = mantissa of x with hsb at 52 485 shr r9,cl ; make space for bits from r10 486 sub rcx,64 487 neg rcx ; rcx = no of bits to shift r10 488 shl r8,cl 489 or r9,r8 ; r9 = mantissa bits of xx 490 jmp Ltanf_fma3_form_numbers_f 491 492ALIGN 16 493Ltanf_fma3_hsb_below_52_f: 494 neg rcx 495 mov rax,r9 496 shl r10,cl 497 shl r9,cl 498 sub rcx,64 499 neg rcx 500 shr rax,cl 501 or r10,rax 502 shr r8,cl 503 or r9,r8 504ALIGN 16 505Ltanf_fma3_form_numbers_f: 506 add r11,1023 507 btr r10,52 ; remove the implied bit 508 mov rcx,r11 509 or r10,rdx ; put the sign 510 shl rcx,52 511 or r10,rcx ; x is in r10 512 vmovq xmm0,r10 ; xmm0 <-- x 513 vmulsd xmm0,xmm0,L_piby2_lead 514 515Ltanf_fma3_exit_s: 516 vandpd xmm2,xmm4,XMMWORD PTR L_int_one 517 vmovd eax,xmm2 518 vmulsd xmm1,xmm0,xmm0 519 vmovsd xmm3,L_tanf+008h 520 vfmadd213sd xmm3,xmm1,L_tanf 521 vmovsd xmm2,L_tanf+020h 522 vfmadd213sd xmm2,xmm1,L_tanf+018h 523 vfmadd213sd xmm2,xmm1,L_tanf+010h 524 vdivsd xmm3,xmm3,xmm2 525 vmulsd xmm1,xmm1,xmm0 526 vfmadd231sd xmm0,xmm1,xmm3 527 cmp eax,01h 528 je Ltanf_fma3_recip 529 jmp Ltanf_fma3_exit_tanpiby4 530 531Ltanf_fma3_recip : 532 vmovq xmm3,L_n_one 533 vdivsd xmm0,xmm3,xmm0 534 535Ltanf_fma3_exit_tanpiby4 : 536 vandpd xmm5,xmm5,L_signbit 537 vxorpd xmm0,xmm0,xmm5 538 539Ltanf_fma3_return_s: 540 vcvtsd2ss xmm0,xmm0,xmm0 541Ltanf_fma3_return_c: 542 StackDeallocate stack_size 543 ret 544 545Ltanf_fma3_naninf: 546 call fname_special 547 StackDeallocate stack_size 548 ret 549 550fname endp 551END 552