1; 2; 3; MIT License 4; ----------- 5; 6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 7; 8; Permission is hereby granted, free of charge, to any person obtaining a copy 9; of this Software and associated documentaon files (the "Software"), to deal 10; in the Software without restriction, including without limitation the rights 11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12; copies of the Software, and to permit persons to whom the Software is 13; furnished to do so, subject to the following conditions: 14; 15; The above copyright notice and this permission notice shall be included in 16; all copies or substantial portions of the Software. 17; 18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24; THE SOFTWARE. 25; 26; An implementation of the tan function. 27; 28; Prototype: 29; 30; double tan(double x); 31; 32; Computes tan(x). 33; It will provide proper C99 return values, 34; but may not raise floating point status bits properly. 35; Based on the NAG C implementation. 36; 37; If FMA3 hardware is present, it will be used for the calculation. 38; 39 40.const 41ALIGN 16 42L_signbit DQ 08000000000000000h 43 DQ 08000000000000000h ; duplicate for pd 44 45L_sign_mask DQ 07FFFFFFFFFFFFFFFh 46 DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd 47 48L_int_one DQ 00000000000000001h 49 DQ 00000000000000001h ; duplicate for pd 50 51L_twobypi DQ 03FE45F306DC9C883h 52 DQ 03FE45F306DC9C883h ; duplicate for pd 53 54L_point_333 DQ 03FD5555555555555h; 1/3 55 DQ 03FD5555555555555h ; duplicate for pd 56 57L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0 58 DQ 03FD7D50F6638564Ah ; duplicate for pd 59 60L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1 61 DQ 0BF977C24C7569ABBh ; duplicate for pd 62 63L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3 64 DQ 03F2D5DAF289C385Ah ; duplicate for pd 65 66L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1 67 DQ 03FF1DFCB8CAA40B8h ; duplicate for pd 68 69L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0 70 DQ 0BFE08046499EB90Fh ; duplicate for pd 71 72L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1 73 DQ 03F9AB0F4F80A0ACFh ; duplicate for pd 74 75L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3 76 DQ 0BF2E7517EF6D98F8h ; duplicate for pd 77 78L_half_mask DQ 0ffffffff00000000h 79 DQ 0ffffffff00000000h ; duplicate for pd 80 81L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part 82 DQ 03FE921FB54442D18h ; duplicate for pd 83 84L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low parft 85 DQ 03C81A62633145C06h ; duplicate for pd 86 87; Different parts of argument reduction need different versions of pi/2 88 89L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits 90L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps... 91L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits 92L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps... 93L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits 94L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits 95 96; end of pi/2 versions 97 98L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 99L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 100 101L_inf_mask_64 DQ 07FF0000000000000h 102L_point_five DQ 03FE0000000000000h 103L_point_68 DQ 03FE5C28F5C28F5C3h ; .68 104L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68 105 106L_zero DQ -0000000000000000h ; 0.0 107L_one DQ 03FF0000000000000h ; 1.0 108L_n_one DQ 0BFF0000000000000h ; -1.0 109L_two DQ 04000000000000000h ; 2.0 110 111L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5 112L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL 113 114fname TEXTEQU <tan> 115fname_special TEXTEQU <_tan_special> 116 117; local storage offsets 118save_xmm6 EQU 020h 119save_xmm7 EQU 030h 120store_input EQU 040h 121save_r10 EQU 050h 122dummy_space EQU 060h 123stack_size EQU 088h 124 125include fm.inc 126 127EXTERN __use_fma3_lib:DWORD 128EXTERN fname_special : PROC 129EXTERN __remainder_piby2_fma3 : PROC 130EXTERN __remainder_piby2_fma3_bdl : PROC 131EXTERN __remainder_piby2_forAsm : PROC 132EXTERN _set_statfp : PROC 133 134.code 135ALIGN 16 136PUBLIC fname 137fname PROC FRAME 138 StackAllocate stack_size 139 SaveXmm xmm6, save_xmm6 140 SaveXmm xmm7, save_xmm7 141 .ENDPROLOG 142 cmp DWORD PTR __use_fma3_lib, 0 143 jne Ltan_fma3 144 145Ltan_sse2: 146 movd rdx, xmm0 ; really movq 147 movaps xmm6, xmm0 148 mov rcx, rdx 149 btr rcx, 63 ; rcx <-- |x| 150 151 cmp rcx, L_piby4_lead 152 ja Ltan_abs_x_nle_pio4 ; branch if > pi/4 or NaN 153 154 155 cmp rcx, L_two_to_neg_13 156 jae Ltan_abs_x_ge_two_to_neg_13 157 158 cmp rcx, L_two_to_neg_27 159 jae Labs_x_ge_two_to_neg_27 160 161 ; At this point tan(x) ~= x; if it's not exact, set the inexact flag 162 163 test rcx, rcx 164 je Ltan_return 165 166 mov ecx, 20h ; ecx <-- AMD_F_INEXACT 167 call _set_statfp 168 movaps xmm0, xmm6 ; may be redundant, but xmm0 <-- x 169 170 RestoreXmm xmm7, save_xmm7 171 RestoreXmm xmm6, save_xmm6 172 StackDeallocate stack_size 173 ret 0 174 175Labs_x_ge_two_to_neg_27: 176 177 mulsd xmm0, xmm0 178 mulsd xmm0, xmm6 179 mulsd xmm0, QWORD PTR L_point_333 180 181 addsd xmm0, xmm6 182 183 RestoreXmm xmm7, save_xmm7 184 RestoreXmm xmm6, save_xmm6 185 StackDeallocate stack_size 186 ret 0 187 188Ltan_abs_x_ge_two_to_neg_13: 189 xorps xmm1, xmm1 ; xmm1 <-- xx = 0 190 xor r8d, r8d ; r8 <-- recip flag = 0 191 call _tan_piby4 192 193Ltan_return: 194 RestoreXmm xmm7, save_xmm7 195 RestoreXmm xmm6, save_xmm6 196 StackDeallocate stack_size 197 ret 0 198 199Ltan_abs_x_nle_pio4: 200 201 cmp rcx, L_inf_mask_64 ; |x| uint >= +inf as uint ? 202 jnae Ltan_x_is_finite 203 204 call fname_special 205 RestoreXmm xmm7, save_xmm7 206 RestoreXmm xmm6, save_xmm6 207 StackDeallocate stack_size 208 ret 209 210ALIGN 16 211Ltan_x_is_finite: 212 xor r8d, r8d 213 xor r10, r10 214 cmp rcx, rdx 215 setne r10b ; r10 <-- x was negative flag 216 andpd xmm6, L_sign_mask 217 218 movsd xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5 219 comisd xmm0, xmm6 220 jbe Ltan_x_is_very_large 221 222Ltan_x_is_moderate: ; unused label 223 224 ; For these arguments we do a Cody-Waite reduction, subtracting the 225 ; appropriate multiple of pi/2, using extra precision where x is close 226 ; to an exact multiple of pi/2 227 ; We special-case region setting for |x| <= 9pi/4 228 ; It seems strange that this speeds things up, but it does 229 230 mov rdx, rcx 231 232 mov rax, 4616025215990052958 ; 400f6a7a2955385eH (5pi/4) 233 shr rdx, 52 ; rdx <-- xexp 234 cmp rcx, rax 235 ja Labs_x_gt_5pio4 236 237 mov rax, 4612488097114038738 ; 4002d97c7f3321d2H (3pi/4) 238 cmp rcx, rax 239 seta r8b 240 inc r8d ; r8d <-- region (1 or 2) 241 jmp Lhave_region 242 243Labs_x_gt_5pio4: 244 mov rax, 4619644535898419899 ; 401c463abeccb2bbH (9pi/4) 245 cmp rcx, rax 246 ja Lneed_region_computation 247 mov rax, 4617875976460412789 ; 4015fdbbe9bba775H (7pi/4) 248 cmp rcx, rax 249 seta r8b 250 add r8d, 3 ; r8d <-- region (3 or 4) 251 jmp Lhave_region 252 253ALIGN 16 254Lneed_region_computation: 255 movaps xmm0, xmm6 256 mulsd xmm0, QWORD PTR L_twobypi 257 addsd xmm0, QWORD PTR L_point_five 258 cvttsd2si r8d, xmm0 ; r8d <-- region 259 260Lhave_region: 261 movd xmm3, r8d 262 cvtdq2pd xmm3, xmm3 263 264 movaps xmm2, xmm3 265 movaps xmm0, xmm3 266 mulsd xmm0, QWORD PTR L_piby2_1 267 mulsd xmm2, QWORD PTR L_piby2_1tail ; xmm2 < rtail = npi2 * piby2_1tail 268 subsd xmm6, xmm0 ; xmm6 <-- rhead = x - npi2*piby2_1 269 270 ; If x is not too close to multiple of pi/2, 271 ; we're essentially done with reduction 272 ; If the exponent of rhead is not close to that of x, 273 ; then most of x has been subtracted away in computing rhead; 274 ; i.e., x is close to a multiple of pi/2. 275 276 movd rax, xmm6 277 shr rax, 52 278 and eax, 2047 279 sub rdx, rax ; rdx <-- exp diff of x vs rhead 280 281 cmp rdx, 15 282 jbe Ltan_have_rhead_rtail 283 284 ; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x 285 286 ; t = rhead; 287 ; rtail = npi2 * piby2_2; 288 ; rhead = t - rtail; 289 ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 290 291 movaps xmm1, xmm6 292 movaps xmm0, xmm3 293 294 movaps xmm2, xmm3 295 mulsd xmm0, QWORD PTR L_piby2_2 296 mulsd xmm2, QWORD PTR L_piby2_2tail 297 subsd xmm6, xmm0 298 subsd xmm1, xmm6 299 subsd xmm1, xmm0 300 subsd xmm2, xmm1 301 302 cmp rdx, 48 303 jbe Ltan_have_rhead_rtail ; We've done enough 304 305 ; Wow, x is REALLY close to a multiple of pi/2. Compute more bits. 306 307 ; t = rhead; 308 ; rtail = npi2 * piby2_3; 309 ; rhead = t - rtail; 310 ; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail); 311 312 movaps xmm1, xmm6 313 movaps xmm0, xmm3 314 movaps xmm2, xmm3 315 mulsd xmm0, QWORD PTR L_piby2_3 316 mulsd xmm2, QWORD PTR L_piby2_3tail 317 subsd xmm6, xmm0 ; xmm6 <-- rhead = t - rtail 318 subsd xmm1, xmm6 ; xmm1 <-- t - rhead 319 subsd xmm1, xmm0 ; xmm1 <-- ((t - rhead) - rtail) 320 subsd xmm2, xmm1 ; xmm2 <-- final rtail 321 322Ltan_have_rhead_rtail: 323 324 ; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail 325 movaps xmm0, xmm6 ; xmm0 <-- copy of rhead 326 327 ; r = rhead - rtail 328 ; rr = (rhead - r) - rtail; 329 ; region = npi2 & 3; 330 331 and r8d, 3 ; r8d <-- region 332 subsd xmm0, xmm2 ; xmm0 <-- r = rhead - rtail 333 subsd xmm6, xmm0 ; xmm6 <-- rhead - r 334 subsd xmm6, xmm2 ; xmm6 <-- rr = (rhead - r) - rtail 335 336Ltan_do_tan_computation: 337 and r8d, 1 ; r8d <-- region & 1 338 movaps xmm1, xmm6 339 call _tan_piby4 340 test r10d, r10d 341 je Ltan_pos_return 342 xorpd xmm0, QWORD PTR L_signbit 343Ltan_pos_return: 344 RestoreXmm xmm7, save_xmm7 345 RestoreXmm xmm6, save_xmm6 346 StackDeallocate stack_size 347 ret 0 348 349ALIGN 16 350Ltan_x_is_very_large: 351 ; Reduce x into range [-pi/4,pi/4] (general case) 352 movaps xmm0, xmm6 353 mov QWORD PTR [rsp+save_r10], r10 354 call __remainder_piby2_forAsm ; this call clobbers r10 355 mov r10, QWORD PTR [rsp+save_r10] 356 movapd xmm6,xmm1 ; xmm6 <-- rr 357 mov r8d,eax ; r8d <-- region 358 jmp Ltan_do_tan_computation 359 360;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 361; 362; From here on, it is assumed that the hardware supports FMA3 (and AVX). 363 364ALIGN 16 365Ltan_fma3: 366 vmovq r9,xmm0 367 mov rdx,r9 ; rdx <-- x 368 btr r9,63 ; r9 <-- |x| 369 cmp r9,L_piby4_lead 370 jae Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch 371 372Ltan_fma3_absx_le_pio4: 373 ; no argument reduction is needed, so recip is 0, xx is 0. 374 ; Note that this routine is not special-casing very small |x| 375 vmovsd xmm5,L_piby4_lead 376 vmovsd xmm6,L_piby4_tail 377 vxorpd xmm1,xmm1,xmm1 ; xx <-- 0. 378 vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 379 comisd xmm0,L_point_68 380 jbe Ltan_fma3_small_x_le_point_68 381Ltan_fma3_x_small_gt_point_68: 382 vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 383 vsubsd xmm0,xmm5,xmm0 ; x = piby4_lead - x 384 vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail 385 jmp Ltan_fma3_compute_Remez_for_small_x 386ALIGN 16 387Ltan_fma3_small_x_le_point_68: 388 comisd xmm0,L_n_point_68 389 jae Ltan_fma3_compute_Remez_for_small_x 390Ltan_fma3_small_x_lt_neg_point_68: 391 vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 392 vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x 393 vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail 394Ltan_fma3_compute_Remez_for_small_x: 395 ; At this point xmm0 holds x, possibly transformed 396 397 ; now do core Remez rational approximation for x in [0,0.68] 398 vmovsd xmm4,L_tan_q6 399 vmovsd xmm3,L_tan_p4 400 vmulsd xmm2,xmm0,xmm0 ; xx is 0, so xmm2 <-- r = x*x 401 vfmadd213sd xmm4,xmm2,L_tan_q4 402 vfmadd213sd xmm3,xmm2,L_tan_p2 403 vfmadd213sd xmm4,xmm2,L_tan_q2 404 vfmadd213sd xmm3,xmm2,L_tan_p0 ; xmm3 <-- p2 (polynomial) 405 vfmadd213sd xmm4,xmm2,L_tan_q0 ; xmm4 <-- q3 (polynomial) 406 vdivsd xmm3,xmm3,xmm4 ; xmm3 <-- r3 = p2/q3 407 vmulsd xmm3,xmm3,xmm2 ; xmm3 <-- r * r3 408 vfmadd132sd xmm0,xmm0,xmm3 ; xx = 0, so xmm0 <-- t = x + x*(r*r3) 409 comisd xmm7,L_zero ; did we transform x? 410 ; if x was transformed, we need to transform t to get answer; 411 ; if not, the answer is just t. 412 je Ltan_fma3_ext_piby4_zero 413 414 ; x was transformed, so answer is +- (1. - 2.*t/(1.+t)) 415 ; (remember recip is 0 here) 416 vmovsd xmm3,L_one 417 vaddsd xmm4,xmm0,L_one ; xmm4 <-- 1. + t 418 vdivsd xmm6,xmm0,xmm4 ; xmm6 <-- t / (1.+t) 419 vfnmadd231sd xmm3,xmm6,L_two ; xmm3 <-- 1. - 2.*t/(1.+t) 420 vmulsd xmm0,xmm3,xmm7 ; multiply by +- 1. 421 422Ltan_fma3_ext_piby4_zero: 423 ; restore volatile registers 424 AVXRestoreXmm xmm7, save_xmm7 425 AVXRestoreXmm xmm6, save_xmm6 426 StackDeallocate stack_size 427 ret 0 428 429ALIGN 16 430Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4 431 cmp r9, L_inf_mask_64 432 jae Ltan_fma3_naninf 433 434;Ltan_fma3_range_reduce: 435 vmovapd [store_input + rsp],xmm0 ; save copy of x 436 vmovq xmm0,r9 ; xmm0l <-- |x| 437 cmp r9,L_moderate_arg_bdl 438 jge Ltan_fma3_remainder_piby2 ; go elsewhere if |x| > 500000. 439 440 ; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl 441 ; have calling conventions that differ from the C routine 442 ; on input 443 ; |x| is in xmm0 444 ; on output 445 ; z is in xmm0 446 ; zz is in xmm1 447 ; where z + zz = arg reduced |x| and zz is small compared to z 448 ; region of |x| is in rax 449 450 Ltan_fma3_remainder_piby2_small: 451 ; Boldo-Daumas-Li reduction for reasonably small |x| 452 call __remainder_piby2_fma3_bdl 453 454 455Ltan_fma3_full_computation: 456 ; we have done argument reduction; recip and xx may be nonzero 457 ; x is in xmm0, xx is in xmm1 458 ; recip is region & 1, and region is in rax. 459 460 vmovsd xmm5,L_piby4_lead 461 vmovsd xmm6,L_piby4_tail 462 463 vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 464 vcomisd xmm0,L_point_68 465 jbe Ltan_fma3_full_x_le_point_68 466Ltan_fma3_full_x_gt_point_68: 467 vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 468 vsubsd xmm0,xmm5,xmm0 ; xmm0 <-- x = piby4_lead - x 469 vsubsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = pibi4_tail - xx 470 vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl 471 vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 472 jmp Ltan_fma3_compute_Remez 473ALIGN 16 474Ltan_fma3_full_x_le_point_68: 475 vcomisd xmm0,L_n_point_68 476 jae Ltan_fma3_compute_Remez 477Ltan_fma3_full_x_lt_neg_point_68: 478 vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 479 vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x 480 vaddsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail + xx 481 vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl 482 vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 483 484Ltan_fma3_compute_Remez: 485 vmulsd xmm2,xmm0,xmm0 ; xmm2 <-- x*x 486 vmulsd xmm5,xmm1,xmm0 ; xmm5 <-- x*xx 487 vfmadd132sd xmm5,xmm2,L_two ; xmm5 <-- r = x*x + 2.*x*xx 488 vmovsd xmm2,L_tan_p4 489 vfmadd213sd xmm2,xmm5,L_tan_p2 ; xmm2 <-- p4*r+p2 490 vfmadd213sd xmm2,xmm5,L_tan_p0 ; xmm2 <-- p = (p4*r+p2)*r+p0 491 vmovsd xmm4,L_tan_q6 492 vfmadd213sd xmm4,xmm5,L_tan_q4 ; xmm4 <-- q6*r+q4 493 vfmadd213sd xmm4,xmm5,L_tan_q2 ; xmm4 <-- (q6*r+q4)*r+q2 494 vfmadd213sd xmm4,xmm5,L_tan_q0 ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0 495 vdivsd xmm2,xmm2,xmm4 ; xmm2 <-- p/q 496 vmulsd xmm2,xmm2,xmm5 ; xmm2 <-- r*p/q 497 vfmadd213sd xmm2,xmm0,xmm1 ; xmm2 <-- t2 = xx + x*r*(p/q) 498 vaddsd xmm1,xmm0,xmm2 ; xmm1 <-- t = (t1=x) + t2 499 500 ; If |x| > .68 we transformed, and t is an approximation of 501 ; tan(pi/4 +- (x+xx)) 502 ; otherwise, t is just tan(x+xx) 503 vxorpd xmm6,xmm6,xmm6 504 vcomisd xmm7,xmm6 ; did we transform? (|x| > .68) ? 505 jz Ltan_fma3_if_recip_set ; if not, go check recip 506 507Ltan_fma3_if_transfor_set: 508 ; Because we transformed x+xx, we have to transform t before returning 509 ; let transform be 1 for x > .68, -1 for x < -.68, then we return 510 ; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t))) 511 vaddsd xmm6,xmm1,xmm1 ; xmm6 <-- 2.*t 512 vmovsd xmm4,L_one 513 vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 514 vsubsd xmm5,xmm1,xmm4 ; xmm5 <-- t-1 515 bt rax,0 516 jc Ltan_fma3_transform_and_recip_set 517 ; here recip is not set 518 vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 519 vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t+1) 520 vfnmadd132sd xmm2,xmm4,L_two ; xmm2 <-- 1 - 2*t/(t+1) 521 vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(1 - 2*t/(t+1)) 522 jmp Ltan_fma3_exit_piby4 523ALIGN 16 524Ltan_fma3_transform_and_recip_set: 525 ; here recip is set 526 vsubsd xmm2,xmm1,xmm4 ; xmm2 <-- t-1 527 vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t-1) 528 vfmsub132sd xmm2,xmm4,L_two ; xmm2 <-- 2*t/(t-1) - 1 529 vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(2*t/(t-1) - 1) 530 jmp Ltan_fma3_exit_piby4 531 532ALIGN 16 533Ltan_fma3_if_recip_set: 534 ; Here we did not transform x and xx, but if we are in an odd quadrant 535 ; we will need to return -1./(t1+t2), computed accurately 536 ; (t=t1 is in xmm1, t2 is in xmm2) 537 bt rax,0 538 jnc Ltan_fma3_exit_piby4 539 540 vandpd xmm7,xmm1,L_half_mask ; xmm7 <-- z1 = high bits of t 541 vsubsd xmm4,xmm7,xmm0 ; xmm4 <-- z1 - t1 542 vsubsd xmm4,xmm2,xmm4 ; xmm4 <-- z2 = t2 - (z1-t1) 543 vmovsd xmm2,L_n_one 544 vdivsd xmm2,xmm2,xmm1 ; xmm2 <-- trec = -1./t 545 vandpd xmm5,xmm2,L_half_mask ; xmm5 <-- trec_top=high bits of trec 546 vfmadd213sd xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1. 547 vfmadd231sd xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.) 548 vfmadd213sd xmm7,xmm2,xmm5 ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.)) 549 vmovapd xmm1,xmm7 ; xmm1 <-- u 550 551Ltan_fma3_exit_piby4: 552 vmovapd xmm0,xmm1 ; xmm0 <-- t, u, or v, as needed 553 554 vmovapd xmm1,[store_input + rsp] 555 vandpd xmm1,xmm1,L_signbit 556 vxorpd xmm0,xmm0,xmm1 ; tan(-x) = -tan(x) 557 558 ; restore volatile registers 559 AVXRestoreXmm xmm7, save_xmm7 560 AVXRestoreXmm xmm6, save_xmm6 561 StackDeallocate stack_size 562 ret 563 564ALIGN 16 565Ltan_fma3_remainder_piby2: 566 ; argument reduction for general x 567 568 call __remainder_piby2_fma3 569 jmp Ltan_fma3_full_computation 570 571 572Ltan_fma3_naninf: ; here argument is +-Inf or NaN. Special case. 573 call fname_special 574 AVXRestoreXmm xmm7, save_xmm7 575 AVXRestoreXmm xmm6, save_xmm6 576 StackDeallocate stack_size 577 ret 578 579fname endp 580 581;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 582 583.const 584tan_piby4_save_xmm6 EQU 030h 585tan_piby4_stack_size EQU 048h 586.code 587ALIGN 16 588_tan_piby4 PROC PRIVATE FRAME 589 StackAllocate tan_piby4_stack_size 590 SaveXmm xmm6, tan_piby4_save_xmm6 591 .ENDPROLOG 592 593 ; Compute tangent for x+xx in [-pi/4,pi/4]. 594 ; xmm0 has x 595 ; xmm1 has xx 596 ; r8d has recip. If recip is true, return -1/tan(x+xx) else tan(x+xx) 597 598 xor eax, eax 599 600 comisd xmm0, QWORD PTR L_point_68 601 movaps xmm3, xmm1 602 movaps xmm6, xmm0 603 jbe Ltan_piby4_x_le_point_68 604 605 ; Here x > .68, so we transform x using the identity 606 ; tan(pi/4-x) = (1-tan(x))/(1+tan(x)) 607 608 movsd xmm2, QWORD PTR L_piby4_lead 609 mov eax, 1 ; eax <-- transform = 1 610 subsd xmm2, xmm0 ; xmm2 <-- x = piby4_lead - x 611 movsd xmm0, QWORD PTR L_piby4_tail 612 subsd xmm0, xmm1 ; xmm0 <-- xl = piby4_tail - xx 613 movaps xmm6, xmm2 614 addsd xmm6, xmm0 ; xmm6 <-- x = x + xl 615 xorps xmm3,xmm3 ; xmm3 <-- xx = 0. 616 jmp Ltan_piby4_do_remez 617 618Ltan_piby4_x_le_point_68: 619; 43 : else if (x < -0.68) 620 621 movsd xmm0, QWORD PTR L_n_point_68 622 comisd xmm0, xmm6 623 jbe Ltan_piby4_do_remez ; jump if x >= -.68 624 625 ; Here x < -.68, so we transform x using the identity 626 ; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) 627 628 addsd xmm6, QWORD PTR L_piby4_lead ; xmm6 <-- x = piby4_lead + x 629 addsd xmm3, QWORD PTR L_piby4_tail ; xmm3 <-- xl = piby4_tail + xx 630 or eax, -1 ; eax <-- transform = -1 631 addsd xmm6, xmm3 ; xmm6 <-- x = x + xl 632 xorps xmm3, xmm3 ; xmm3 <-- xx = 0 633 634Ltan_piby4_do_remez: 635 636 ; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. 637 movaps xmm0, xmm6 638 movaps xmm2, xmm6; 639; An implementation of the tan function. 640; 641; Prototype: 642; 643; double tan(double x); 644; 645; Computes tan(x). 646; It will provide proper C99 return values, 647; but may not raise floating point status bits properly. 648; Based on the NAG C implementation. 649; 650; 651 652 mulsd xmm0, xmm6 ; xmm0 <-- x*x 653 addsd xmm2, xmm2 ; xmm2 <-- 2*x 654 mulsd xmm2, xmm3 ; xmm2 <-- 2*x*xx 655 addsd xmm2, xmm0 ; xmm2 <-- r = x*x + 2*x*xx 656 657 ; Magic Remez approximation 658 movaps xmm0, xmm2 659 movaps xmm5, xmm2 660 movaps xmm1, xmm2 661 mulsd xmm5, QWORD PTR L_tan_p4 662 mulsd xmm1, QWORD PTR L_tan_q6 663 mulsd xmm0, xmm6 664 addsd xmm5, QWORD PTR L_tan_p2 665 mulsd xmm5, xmm2 666 addsd xmm5, QWORD PTR L_tan_p0 667 mulsd xmm5, xmm0 668 movsd xmm0, QWORD PTR L_tan_q4 669 addsd xmm0, xmm1 670 mulsd xmm0, xmm2 671 addsd xmm0, QWORD PTR L_tan_q2 672 mulsd xmm0, xmm2 673 addsd xmm0, QWORD PTR L_tan_q0 674 divsd xmm5, xmm0 675 addsd xmm5, xmm3 ; xmm5 <-- t2 676 677 test eax, eax 678 je Ltan_piby4_transform_false 679 680 addsd xmm5, xmm6 ; xmm5 <-- t = t1 + t2 = x + t2 681 682 test r8d, r8d 683 je Ltan_piby4_transform_true_recip_false 684 685 ; Here transform and recip are both true. 686 ; return transform*(2*t/(t-1) - 1.0); 687 688 movaps xmm0, xmm5 689 subsd xmm5, QWORD PTR L_one 690 movd xmm1, eax 691 addsd xmm0, xmm0 692 divsd xmm0, xmm5 693 cvtdq2pd xmm1, xmm1 694 subsd xmm0, QWORD PTR L_one 695 mulsd xmm0, xmm1 696 RestoreXmm xmm6, tan_piby4_save_xmm6 697 StackDeallocate tan_piby4_stack_size 698 ret 0 699 700Ltan_piby4_transform_true_recip_false: 701 ; Here return transform*(1.0 - 2*t/(1+t)); 702 movsd xmm0, QWORD PTR L_one 703 movaps xmm1, xmm5 704 addsd xmm5, xmm0 705 addsd xmm1, xmm1 706 divsd xmm1, xmm5 707 subsd xmm0, xmm1 708 movd xmm1, eax 709 cvtdq2pd xmm1, xmm1 710 mulsd xmm0, xmm1 711 RestoreXmm xmm6, tan_piby4_save_xmm6 712 StackDeallocate tan_piby4_stack_size 713 ret 0 714 715Ltan_piby4_transform_false: 716 test r8d, r8d 717 je Ltan_piby4_atransform_false_recip_false 718 719 ; Here transform is false but recip is true 720 ; We return an accurate computation of -1.0/(t1 + t2). 721 722 movsd xmm4, QWORD PTR L_n_one 723 movaps xmm0, xmm5 724 mov rcx, -4294967296 ; ffffffff00000000H 725 addsd xmm0, xmm6 726 movd rax, xmm0 ; really movq 727 divsd xmm4, xmm0 728 and rax, rcx 729 movd xmm3, rax ; really movq 730 movaps xmm1, xmm3 731 subsd xmm1, xmm6 732 733 movd rax, xmm4 ; really movq 734 subsd xmm5, xmm1 735 736 and rax, rcx 737 movd xmm2, rax ; really movq 738 739 ; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2); 740 741 movaps xmm0, xmm2 742 mulsd xmm5, xmm2 743 mulsd xmm0, xmm3 744 addsd xmm0, QWORD PTR L_one 745 addsd xmm0, xmm5 746 mulsd xmm0, xmm4 747 addsd xmm0, xmm2 748 749 RestoreXmm xmm6, tan_piby4_save_xmm6 750 StackDeallocate tan_piby4_stack_size 751 ret 0 752 753Ltan_piby4_atransform_false_recip_false: 754 ; Here both transform and recip are false; we just return t1 + t2 755 addsd xmm5, xmm6 756 movaps xmm0, xmm5 757 RestoreXmm xmm6, tan_piby4_save_xmm6 758 StackDeallocate tan_piby4_stack_size 759 ret 0 760 761_tan_piby4 endp 762END 763