;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sinf function.
;
; Prototype
;
;     float sinf(float x);
;
; Computes sinf(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
36; 37 38.const 39ALIGN 16 40L_signbit DQ 08000000000000000h 41 DQ 08000000000000000h 42L_sign_mask DQ 07FFFFFFFFFFFFFFFh 43 DQ 07FFFFFFFFFFFFFFFh 44L_one DQ 03FF0000000000000h 45 DQ 03FF0000000000000h 46L_int_three DQ 00000000000000003h 47 DQ 00000000000000003h 48L_one_half DQ 03FE0000000000000h 49 DQ 03FE0000000000000h 50L_twobypi DQ 03FE45F306DC9C883h 51 DQ 03FE45F306DC9C883h 52L_piby2_1 DQ 03FF921FB54400000h 53 DQ 03FF921FB54400000h 54L_one_sixth DQ 03FC5555555555555h 55 DQ 03FC5555555555555h 56L_piby2_1tail DQ 03DD0B4611A626331h 57 DQ 03DD0B4611A626331h 58L_piby2_2 DQ 03dd0b4611a600000h 59 DQ 03dd0b4611a600000h 60L_piby2_2tail DQ 03ba3198a2e037073h 61 DQ 03ba3198a2e037073h 62L_inf_mask_32 DD 07F800000h 63 DD 07F800000h 64 DQ 07F8000007F800000h 65L_int_two DQ 00000000000000002h 66 DQ 00000000000000002h 67L_piby2_lead DQ 03ff921fb54442d18h 68 DQ 03ff921fb54442d18h 69L_piby4 DQ 03fe921fb54442d18h 70 DQ 03fe921fb54442d18h 71L_mask_3f2 DQ 03f20000000000000h 72 DQ 03f20000000000000h 73L_mask_3f8 DQ 03f80000000000000h 74 DQ 03f80000000000000h 75 76; Do these really need to be different? 
77L_large_x_fma3 DQ 04170008AC0000000h ; 16779436 78L_large_x_sse2 DQ 0416E848000000000h ; 16000000 79 80EXTRN __Lcosfarray:QWORD 81EXTRN __Lsinfarray:QWORD 82EXTRN __use_fma3_lib:DWORD 83EXTRN __L_2_by_pi_bits:BYTE 84 85; define local variable storage offsets 86p_temp EQU 010h ; temporary for get/put bits operation 87p_temp1 EQU 018h ; temporary for get/put bits operation 88region EQU 020h ; pointer to region for remainder_piby2 89r EQU 028h ; pointer to r for remainder_piby2 90dummy_space EQU 040h 91 92stack_size EQU 058h 93 94include fm.inc 95 96fname TEXTEQU <sinf> 97fname_special TEXTEQU <_sinf_special> 98 99;Define name and any external functions being called 100EXTRN __remainder_piby2d2f_forC : PROC ; NEAR 101EXTERN fname_special : PROC 102 103.code 104ALIGN 16 105PUBLIC fname 106fname PROC FRAME 107 StackAllocate stack_size 108 .ENDPROLOG 109 cmp DWORD PTR __use_fma3_lib, 0 110 jne Lsinf_fma3 111 112Lsinf_sse2: 113 114 xorpd xmm2, xmm2 ; zeroed out for later use 115 116;; if NaN or inf 117 movd edx, xmm0 118 mov eax, 07f800000h 119 mov r10d, eax 120 and r10d, edx 121 cmp r10d, eax 122 jz Lsinf_sse2_naninf 123 124; GET_BITS_DP64(x, ux); 125; get the input value to an integer register. 126 cvtss2sd xmm0, xmm0 ; convert input to double. 
127 movd rdx, xmm0 ; rdx is ux 128 129; ax = (ux & ~SIGNBIT_DP64); 130 mov r10, rdx 131 btr r10, 63 ; r10 is ax 132 mov r8d, 1 ; for determining region later on 133 134;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4 135 mov rax, 03fe921fb54442d18h 136 cmp r10, rax 137 jg Lsinf_absx_gt_piby4 138 139;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7) 140 mov rax, 3f80000000000000h 141 cmp r10, rax 142 jge Lsinf_sse2_small 143 144;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13) 145 mov rax, 3f20000000000000h 146 cmp r10, rax 147 jge Lsinf_sse2_smaller 148 149; sinf = x; 150 jmp Lsinf_sse2_cleanup 151 152ALIGN 16 153Lsinf_sse2_smaller: 154; sinf = x - x^3 * 0.1666666666666666666; 155 movsd xmm2, xmm0 156 movsd xmm4, QWORD PTR L_one_sixth ; 0.1666666666666666666 157 mulsd xmm2, xmm2 ; x^2 158 mulsd xmm2, xmm0 ; x^3 159 mulsd xmm2, xmm4 ; x^3 * 0.1666666666666666666 160 subsd xmm0, xmm2 ; x - x^3 * 0.1666666666666666666 161 jmp Lsinf_sse2_cleanup 162 163ALIGN 16 164Lsinf_sse2_small: 165 movsd xmm2, xmm0 ; x2 = r * r; 166 mulsd xmm2, xmm0 ; x2 167 168;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 169; region 0 or 2 - do a sinf calculation 170; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); 171 movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 172 mulsd xmm1, xmm2 ; s4x2 173 movsd xmm4, xmm2 ; move for x4 174 movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 175 mulsd xmm4, xmm2 ; x4 176 movsd xmm3, xmm0 ; move for x3 177 mulsd xmm5, xmm2 ; s2x2 178 mulsd xmm3, xmm2 ; x3 179 addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 180 mulsd xmm1, xmm4 ; s3x4+s4x6 181 addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 182 addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 183 mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) 184 addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) 185 jmp Lsinf_sse2_cleanup 186 187;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 188ALIGN 16 189Lsinf_absx_gt_piby4: 190; xneg = (ax != ux); 191 cmp rdx, r10 192 mov r11d, 0 193;; if (xneg) 
x = -x; 194 jz Lsinf_sse2_reduce_moderate 195 196 mov r11d, 1 197 subsd xmm2, xmm0 198 movsd xmm0, xmm2 199 200Lsinf_sse2_reduce_moderate: 201;; if (x < 5.0e6) 202 cmp r10, QWORD PTR L_large_x_sse2 203 jae Lsinf_sse2_reduce_large 204 205; reduce the argument to be in a range from -pi/4 to +pi/4 206; by subtracting multiples of pi/2 207 movsd xmm2, xmm0 208 movsd xmm3, QWORD PTR L_twobypi 209 movsd xmm4, xmm0 210 movsd xmm5, QWORD PTR L_one_half ; .5 211 mulsd xmm2, xmm3 212 213;/* How many pi/2 is x a multiple of? */ 214; xexp = ax >> EXPSHIFTBITS_DP64; 215 mov r9, r10 216 shr r9, 52 ; >>EXPSHIFTBITS_DP64 217 218; npi2 = (int)(x * twobypi + 0.5); 219 addsd xmm2, xmm5 ; npi2 220 221 movsd xmm3, QWORD PTR L_piby2_1 222 cvttpd2dq xmm0, xmm2 ; convert to integer 223 movsd xmm1, QWORD PTR L_piby2_1tail 224 cvtdq2pd xmm2, xmm0 ; and back to double. 225 226; /* Subtract the multiple from x to get an extra-precision remainder */ 227; rhead = x - npi2 * piby2_1; 228 mulsd xmm3, xmm2 229 subsd xmm4, xmm3 ; rhead 230 231; rtail = npi2 * piby2_1tail; 232 mulsd xmm1, xmm2 233 movd eax, xmm0 234 235; GET_BITS_DP64(rhead-rtail, uy); 236; originally only rhead 237 movsd xmm0, xmm4 238 subsd xmm0, xmm1 239 240 movsd xmm3, QWORD PTR L_piby2_2 241 movd rcx, xmm0 242 movsd xmm5, QWORD PTR L_piby2_2tail 243 244; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc 245; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); 246 shl rcx, 1 ; strip any sign bit 247 shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 248 sub r9, rcx ; expdiff 249 250;; if (expdiff > 15) 251 cmp r9, 15 252 jle Lsinf_sse2_expdiff_le_15 253 254; The remainder is pretty small compared with x, which 255; implies that x is a near multiple of pi/2 256; (x matches the multiple to at least 15 bits) 257; t = rhead; 258 movsd xmm1, xmm4 259 260; rtail = npi2 * piby2_2; 261 mulsd xmm3, xmm2 262 263; rhead = t - rtail; 264 mulsd xmm5, xmm2 ; npi2 * piby2_2tail 265 subsd xmm4, xmm3 ; rhead 266 267; 
rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 268 subsd xmm1, xmm4 ; t - rhead 269 subsd xmm1, xmm3 ; -rtail 270 subsd xmm5, xmm1 ; rtail 271 272; r = rhead - rtail; 273 movsd xmm0, xmm4 274 275;HARSHA 276;xmm1=rtail 277 movsd xmm1, xmm5 278 subsd xmm0, xmm5 279 280; xmm0=r, xmm4=rhead, xmm1=rtail 281 282;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 283Lsinf_sse2_expdiff_le_15: 284 cmp rcx, 03f2h ; is r < 2^-13 ? 285 jge Lsinf_sse2_calc_sincosf_piby4 ; use taylor series if not 286 cmp rcx, 03deh ; if r really small. 287 jle Lsinf_sse2_r_very_small ; then sinf(r) ~ r or 1 288 289 movsd xmm2, xmm0 290 mulsd xmm2, xmm0 ; xmm2 <-- r^2 291 292;; if region is 0 or 2 do a sinf calc. 293 and r8d, eax 294 jnz Lsinf_sse2_small_calc_sin 295 296; region 0 or 2 do a sinf calculation 297; use simply polynomial 298; x - x*x*x*0.166666666666666666; 299 movsd xmm3, QWORD PTR L_one_sixth 300 mulsd xmm3, xmm0 ; * x 301 mulsd xmm3, xmm2 ; * x^2 302 subsd xmm0, xmm3 ; xs 303 jmp Lsinf_sse2_adjust_region 304 305ALIGN 16 306Lsinf_sse2_small_calc_sin: 307; region 1 or 3 do a cosf calculation 308; use simply polynomial 309; 1.0 - x*x*0.5; 310 movsd xmm0, QWORD PTR L_one ; 1.0 311 mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 312 subsd xmm0, xmm2 ; xc 313 jmp Lsinf_sse2_adjust_region 314 315ALIGN 16 316Lsinf_sse2_r_very_small: 317;; if region is 0 or 2 do a sinf calc. 
(sinf ~ x) 318 and r8d, eax 319 jz Lsinf_sse2_adjust_region 320 321 movsd xmm0, QWORD PTR L_one ; cosf(r) is a 1 322 jmp Lsinf_sse2_adjust_region 323 324;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 325ALIGN 16 326Lsinf_sse2_reduce_large: 327; Reduce x into range [-pi/4, pi/4] 328; __remainder_piby2d2f_forC(x, &r, ®ion); 329 330 mov QWORD PTR p_temp[rsp], r11 331 lea rdx, QWORD PTR r[rsp] 332 lea r8, QWORD PTR region[rsp] 333 movd rcx, xmm0 334 call __remainder_piby2d2f_forC 335 mov r11, QWORD PTR p_temp[rsp] 336 mov r8d, 1 ; for determining region later on 337 movsd xmm1, QWORD PTR r[rsp] ; x 338 mov eax, DWORD PTR region[rsp] ; region 339 340; xmm0 = x, xmm4 = xx, r8d = 1, eax= region 341;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 342 343; perform taylor series to calc sinfx, cosfx 344Lsinf_sse2_calc_sincosf_piby4: 345; x2 = r * r; 346 movsd xmm2, xmm0 347 mulsd xmm2, xmm0 ; x2 348 349;; if region is 1 or 3, do a cosf calc. 350 and r8d, eax 351 jnz Lsinf_sse2_do_cosf_calc 352 353; region is 0 or 2: do a sinf calc. 
354; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); 355Lsinf_sse2_do_sinf_calc: 356 movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 357 mulsd xmm1, xmm2 ; s4x2 358 movsd xmm4, xmm2 ; move for x4 359 mulsd xmm4, xmm2 ; x4 360 movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 361 mulsd xmm5, xmm2 ; s2x2 362 movsd xmm3, xmm0 ; move for x3 363 mulsd xmm3, xmm2 ; x3 364 addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 365 mulsd xmm1, xmm4 ; s3x4+s4x6 366 addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 367 addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 368 mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) 369 addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) 370 jmp Lsinf_sse2_adjust_region 371 372ALIGN 16 373Lsinf_sse2_do_cosf_calc: 374 375; region 1 or 3 - do a cosf calculation 376; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; 377; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision 378 movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 379 movsd xmm4, xmm2 ; move for x4 380 mulsd xmm1, xmm2 ; c4x2 381 movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 382 mulsd xmm4, xmm2 ; x4 383 movsd xmm0, QWORD PTR __Lcosfarray ; c0 384 mulsd xmm3, xmm2 ; c2x2 385 mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) 386 addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 387 mulsd xmm1, xmm4 ; c3x4 + c4x6 388 addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 389 addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 390 mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 391 addsd xmm0, QWORD PTR L_one ; 1 - 0.5x2 392 addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 393 394;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 395 396 397Lsinf_sse2_adjust_region: 398; positive or negative 399; switch (region) 400 shr eax, 1 401 mov ecx, eax 402 and eax, r11d 403 404 not ecx 405 not r11d 406 and ecx, r11d 407 408 or eax, ecx 409 and eax, 1 410 jnz Lsinf_sse2_cleanup 411 412;; if the original region 0, 1 and arg is negative, then we negate the result. 413;; if the original region 2, 3 and arg is positive, then we negate the result. 
414 movsd xmm2, xmm0 415 xorpd xmm0, xmm0 416 subsd xmm0, xmm2 417 418 419Lsinf_sse2_cleanup: 420 cvtsd2ss xmm0, xmm0 421 StackDeallocate stack_size 422 ret 423 424ALIGN 16 425Lsinf_sse2_naninf: 426 call fname_special 427 StackDeallocate stack_size 428 ret 429 430ALIGN 16 431Lsinf_fma3: 432 vmovd eax,xmm0 433 mov r8d,L_inf_mask_32 434 and eax,r8d 435 cmp eax, r8d 436 jz Lsinf_fma3_naninf 437 438 vcvtss2sd xmm5,xmm0,xmm0 439 vmovq r9,xmm5 440 btr r9,63 ; r9 <-- |x| 441 cmp r9,L_piby4 442 jg Lsinf_fma3_range_reduce 443 444 cmp r9,L_mask_3f8 445 jge Lsinf_fma3_compute_sinf_piby_4 446 447 cmp r9,L_mask_3f2 448 jge Lsinf_fma3_compute_x_xxx_0_1666 449 450 ; Here |x| < 2^-13; just return sin x ~ x 451 StackDeallocate stack_size 452 ret 453 454ALIGN 16 455Lsinf_fma3_compute_x_xxx_0_1666: 456 ; Here |x| < 2^-7; return sin x ~ x + 1/6 x^3 457 vmulsd xmm1,xmm5,xmm5 458 vmulsd xmm0,xmm1,xmm5 ; xmm1 <-- x^3 459 vfnmadd132sd xmm0,xmm5,L_one_sixth ; x - x*x*x*0.166666666666666666 460 jmp Lsinf_fma3_return_sinf_s 461 462ALIGN 16 463Lsinf_fma3_compute_sinf_piby_4: 464 vmovapd xmm0,xmm5 465 vmovsd xmm1,__Lsinfarray+010h 466 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2 467 vfmadd231sd xmm1,xmm3,__Lsinfarray+018h 468 vfmadd213sd xmm1,xmm3,__Lsinfarray+08h 469 vfmadd213sd xmm1,xmm3,__Lsinfarray 470 vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 471 vfmadd231sd xmm0,xmm1,xmm3 472 jmp Lsinf_fma3_return_sinf_s 473 474ALIGN 16 475Lsinf_fma3_range_reduce: 476 vmovq xmm0,r9 ; xmm0 <-- |x| 477 cmp r9,L_large_x_fma3 478 jge Lsinf_fma3_reduce_large 479 480Lsinf_fma3_sinf_reduce_moderate: 481 vandpd xmm1,xmm0,L_sign_mask ; xmm1 <-- |x| mov should suffice WAT 482 vmovapd xmm2,L_twobypi 483 vfmadd213sd xmm2,xmm1,L_one_half 484 vcvttpd2dq xmm2,xmm2 485 vpmovsxdq xmm1,xmm2 486 vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region 487 vshufps xmm1 ,xmm1,xmm1,8 488 vcvtdq2pd xmm1,xmm1 489 vmovdqa xmm2,xmm0 490 vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 <-- rhead 491 vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 <-- rtail 492 vsubsd 
xmm0,xmm2,xmm3 ; xmm0 <-- r_1 493 vsubsd xmm2,xmm2,xmm0 494 vsubsd xmm1,xmm2,xmm3 ; xmm4 <-- rr_1 495 jmp Lsinf_fma3_exit_s 496 497ALIGN 16 498Lsinf_fma3_reduce_large: 499 lea r9,__L_2_by_pi_bits 500 ;xexp = (x >> 52) 1023 501 vmovq r11,xmm0 502 mov rcx,r11 503 shr r11,52 504 sub r11,1023 ; r11 <-- xexp = exponent of input x 505 ;calculate the last byte from which to start multiplication 506 ;last = 134 (xexp >> 3) 507 mov r10,r11 508 shr r10,3 509 sub r10,134 ;r10 = last 510 neg r10 ;r10 = last 511 ;load 64 bits of 2_by_pi 512 mov rax,[r9+r10] 513 ;mantissa of x = ((x << 12) >> 12) | implied bit 514 shl rcx,12 515 shr rcx,12 ;rcx = mantissa part of input x 516 bts rcx,52 ;add the implied bit as well 517 ;load next 128 bits of 2_by_pi 518 add r10,8 ;increment to next 8 bytes of 2_by_pi 519 vmovdqu xmm0,XMMWORD PTR[r9+r10] 520 ;do three 64bit multiplications with mant of x 521 mul rcx 522 mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] 523 mov r10,rdx ; r10 <-- carry 524 vmovq rax,xmm0 525 mul rcx 526 ;resexp = xexp & 7 527 and r11,7 ; r11 <-- resexp = last 3 bits 528 psrldq xmm0,8 529 add rax,r10 ; add the previous carry 530 adc rdx,0 531 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] 532 mov r10,rdx ; r10 <-- carry 533 vmovq rax,xmm0 534 mul rcx 535 add r10,rax ; r10 = most sig 64 bits = res1[0] 536 ;find the region 537 ;last three bits ltb = most sig bits >> (54 resexp)) 538 ; decimal point in last 18 bits == 8 lsb's in first 64 bits 539 ; and 8 msb's in next 64 bits 540 ;point_five = ltb & 01h; 541 ;region = ((ltb >> 1) + point_five) & 3; 542 mov rcx,54 543 mov rax,r10 544 sub rcx,r11 545 xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi) 546 shr rax,cl 547 jnc Lsinf_fma3_no_point_five_f 548 ;;if there is carry.. 
then negate the result of multiplication 549 not r10 550 not r9 551 not r8 552 mov rdx,08000000000000000h 553 554Lsinf_fma3_no_point_five_f: 555 adc rax,0 556 and rax,3 557 vmovd xmm4,eax ;store region to xmm4 558 ;calculate the number of integer bits and zero them out 559 mov rcx,r11 560 add rcx,10 ; rcx <-- no. of integer bits 561 shl r10,cl 562 shr r10,cl ; r10 contains only mant bits 563 sub rcx,64 ; form the exponent 564 mov r11,rcx 565 ;find the highest set bit 566 bsr rcx,r10 567 jnz Lsinf_fma3_form_mantissa_f 568 mov r10,r9 569 mov r9,r8 570 mov r8,0 571 bsr rcx,r10 ; rcx <-- hsb 572 sub r11,64 573 574Lsinf_fma3_form_mantissa_f: 575 add r11,rcx ;for exp of x 576 sub rcx,52 ;rcx = no. of bits to shift in r10 577 cmp rcx,0 578 jl Lsinf_fma3_hsb_below_52_f 579 je Lsinf_fma3_form_numbers_f 580 ;hsb above 52 581 mov r8,r10 ; previous contents of r8 not required 582 shr r10,cl ; r10 = mantissa of x with hsb at 52 583 shr r9,cl ; make space for bits from r10 584 sub rcx,64 585 neg rcx 586 shl r8,cl 587 or r9,r8 ; r9 = mantissa bits of xx 588 jmp Lsinf_fma3_form_numbers_f 589 590ALIGN 16 591Lsinf_fma3_hsb_below_52_f: 592 neg rcx 593 mov rax,r9 594 shl r10,cl 595 shl r9,cl 596 sub rcx,64 597 neg rcx 598 shr rax,cl 599 or r10,rax 600 shr r8,cl 601 or r9,r8 602 603ALIGN 16 604Lsinf_fma3_form_numbers_f: 605 add r11,1023 606 btr r10,52 ; remove the implied bit 607 mov rcx,r11 608 or r10,rdx ; put the sign 609 shl rcx,52 610 or r10,rcx ; r10 <-- x 611 vmovq xmm0,r10 ; xmm0 <-- x 612 vmulsd xmm0,xmm0,L_piby2_lead 613Lsinf_fma3_exit_s: 614 vmovq rax,xmm4 615 and rax,01h 616 cmp rax,01h 617 jz Lsinf_fma3_cos_piby4_compute 618 619Lsinf_fma3_sin_piby4_compute: 620;; vmovapd xmm1,__Lsinfarray+010h 621 vmovsd xmm1,__Lsinfarray+010h 622 vmulsd xmm3,xmm0,xmm0 623 vfmadd231sd xmm1,xmm3,__Lsinfarray+018h 624 vfmadd213sd xmm1,xmm3,__Lsinfarray+008h 625 vfmadd213sd xmm1,xmm3,__Lsinfarray 626 vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 627 vfmadd231sd xmm0,xmm1,xmm3 628 jmp 
Lsinf_fma3_exit_s_1 629 630ALIGN 16 631Lsinf_fma3_cos_piby4_compute: 632 vmovapd xmm2,L_one 633 vmulsd xmm3,xmm0,xmm0 634 vfmadd231sd xmm2,xmm3,__Lcosfarray ; xmm2 <-- 1 + c0 x^2 635 ; would simple Horner's be slower? 636 vmovsd xmm1,__Lcosfarray+018h ; xmm1 <-- c3 637 vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3 638 vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2 639 vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1 640 vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 641 vmovdqa xmm0,xmm2 642 vfmadd231sd xmm0,xmm1,xmm3 643Lsinf_fma3_exit_s_1: 644 ; assuming FMA3 ==> AVX ==> SSE4.1 645 vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two 646 vpcmpeqq xmm3,xmm4,XMMWORD PTR L_int_three 647 vorpd xmm3,xmm2,xmm3 648 vandnpd xmm3,xmm3,L_signbit 649 vxorpd xmm0,xmm0,xmm3 650 651 vandnpd xmm1,xmm5,L_signbit 652 vxorpd xmm0,xmm1,xmm0 653Lsinf_fma3_return_sinf_s: 654 vcvtsd2ss xmm0,xmm0,xmm0 655 StackDeallocate stack_size 656 ret 657 658Lsinf_fma3_naninf: 659 call fname_special 660 StackDeallocate stack_size 661 ret 662 663fname endp 664END 665