1*4afb647cSTimo Kreuzer; 2*4afb647cSTimo Kreuzer; MIT License 3*4afb647cSTimo Kreuzer; ----------- 4*4afb647cSTimo Kreuzer; 5*4afb647cSTimo Kreuzer; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6*4afb647cSTimo Kreuzer; 7*4afb647cSTimo Kreuzer; Permission is hereby granted, free of charge, to any person obtaining a copy 8*4afb647cSTimo Kreuzer; of this Software and associated documentaon files (the "Software"), to deal 9*4afb647cSTimo Kreuzer; in the Software without restriction, including without limitation the rights 10*4afb647cSTimo Kreuzer; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11*4afb647cSTimo Kreuzer; copies of the Software, and to permit persons to whom the Software is 12*4afb647cSTimo Kreuzer; furnished to do so, subject to the following conditions: 13*4afb647cSTimo Kreuzer; 14*4afb647cSTimo Kreuzer; The above copyright notice and this permission notice shall be included in 15*4afb647cSTimo Kreuzer; all copies or substantial portions of the Software. 16*4afb647cSTimo Kreuzer; 17*4afb647cSTimo Kreuzer; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18*4afb647cSTimo Kreuzer; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19*4afb647cSTimo Kreuzer; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20*4afb647cSTimo Kreuzer; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21*4afb647cSTimo Kreuzer; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22*4afb647cSTimo Kreuzer; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23*4afb647cSTimo Kreuzer; THE SOFTWARE. 24*4afb647cSTimo Kreuzer; 25*4afb647cSTimo Kreuzer; 26*4afb647cSTimo Kreuzer; An implementation of the sinf function. 27*4afb647cSTimo Kreuzer; 28*4afb647cSTimo Kreuzer; Prototype 29*4afb647cSTimo Kreuzer; 30*4afb647cSTimo Kreuzer; float sinf(float x); 31*4afb647cSTimo Kreuzer; 32*4afb647cSTimo Kreuzer; Computes sinf(x). 33*4afb647cSTimo Kreuzer; It will provide proper C99 return values, 34*4afb647cSTimo Kreuzer; but may not raise floating point status bits properly. 35*4afb647cSTimo Kreuzer; Based on the NAG C implementation. 36*4afb647cSTimo Kreuzer; 37*4afb647cSTimo Kreuzer 38*4afb647cSTimo Kreuzer.const 39*4afb647cSTimo KreuzerALIGN 16 40*4afb647cSTimo KreuzerL_signbit DQ 08000000000000000h 41*4afb647cSTimo Kreuzer DQ 08000000000000000h 42*4afb647cSTimo KreuzerL_sign_mask DQ 07FFFFFFFFFFFFFFFh 43*4afb647cSTimo Kreuzer DQ 07FFFFFFFFFFFFFFFh 44*4afb647cSTimo KreuzerL_one DQ 03FF0000000000000h 45*4afb647cSTimo Kreuzer DQ 03FF0000000000000h 46*4afb647cSTimo KreuzerL_int_three DQ 00000000000000003h 47*4afb647cSTimo Kreuzer DQ 00000000000000003h 48*4afb647cSTimo KreuzerL_one_half DQ 03FE0000000000000h 49*4afb647cSTimo Kreuzer DQ 03FE0000000000000h 50*4afb647cSTimo KreuzerL_twobypi DQ 03FE45F306DC9C883h 51*4afb647cSTimo Kreuzer DQ 03FE45F306DC9C883h 52*4afb647cSTimo KreuzerL_piby2_1 DQ 03FF921FB54400000h 53*4afb647cSTimo Kreuzer DQ 03FF921FB54400000h 54*4afb647cSTimo KreuzerL_one_sixth DQ 03FC5555555555555h 55*4afb647cSTimo Kreuzer DQ 03FC5555555555555h 56*4afb647cSTimo KreuzerL_piby2_1tail DQ 03DD0B4611A626331h 57*4afb647cSTimo Kreuzer DQ 03DD0B4611A626331h 58*4afb647cSTimo KreuzerL_piby2_2 DQ 03dd0b4611a600000h 59*4afb647cSTimo Kreuzer DQ 03dd0b4611a600000h 60*4afb647cSTimo KreuzerL_piby2_2tail DQ 03ba3198a2e037073h 61*4afb647cSTimo Kreuzer DQ 03ba3198a2e037073h 62*4afb647cSTimo KreuzerL_inf_mask_32 DD 07F800000h 63*4afb647cSTimo Kreuzer DD 07F800000h 64*4afb647cSTimo Kreuzer DQ 07F8000007F800000h 65*4afb647cSTimo KreuzerL_int_two DQ 00000000000000002h 66*4afb647cSTimo Kreuzer DQ 00000000000000002h 67*4afb647cSTimo KreuzerL_piby2_lead DQ 03ff921fb54442d18h 68*4afb647cSTimo Kreuzer DQ 03ff921fb54442d18h 69*4afb647cSTimo KreuzerL_piby4 DQ 03fe921fb54442d18h 70*4afb647cSTimo Kreuzer DQ 03fe921fb54442d18h 71*4afb647cSTimo KreuzerL_mask_3f2 DQ 03f20000000000000h 72*4afb647cSTimo Kreuzer DQ 03f20000000000000h 73*4afb647cSTimo KreuzerL_mask_3f8 DQ 03f80000000000000h 74*4afb647cSTimo Kreuzer DQ 03f80000000000000h 75*4afb647cSTimo Kreuzer 76*4afb647cSTimo Kreuzer; Do these really need to be different? 77*4afb647cSTimo KreuzerL_large_x_fma3 DQ 04170008AC0000000h ; 16779436 78*4afb647cSTimo KreuzerL_large_x_sse2 DQ 0416E848000000000h ; 16000000 79*4afb647cSTimo Kreuzer 80*4afb647cSTimo KreuzerEXTRN __Lcosfarray:QWORD 81*4afb647cSTimo KreuzerEXTRN __Lsinfarray:QWORD 82*4afb647cSTimo KreuzerEXTRN __use_fma3_lib:DWORD 83*4afb647cSTimo KreuzerEXTRN __L_2_by_pi_bits:BYTE 84*4afb647cSTimo Kreuzer 85*4afb647cSTimo Kreuzer; define local variable storage offsets 86*4afb647cSTimo Kreuzerp_temp EQU 010h ; temporary for get/put bits operation 87*4afb647cSTimo Kreuzerp_temp1 EQU 018h ; temporary for get/put bits operation 88*4afb647cSTimo Kreuzerregion EQU 020h ; pointer to region for remainder_piby2 89*4afb647cSTimo Kreuzerr EQU 028h ; pointer to r for remainder_piby2 90*4afb647cSTimo Kreuzerdummy_space EQU 040h 91*4afb647cSTimo Kreuzer 92*4afb647cSTimo Kreuzerstack_size EQU 058h 93*4afb647cSTimo Kreuzer 94*4afb647cSTimo Kreuzerinclude fm.inc 95*4afb647cSTimo Kreuzer 96*4afb647cSTimo Kreuzerfname TEXTEQU <sinf> 97*4afb647cSTimo Kreuzerfname_special TEXTEQU <_sinf_special> 98*4afb647cSTimo Kreuzer 99*4afb647cSTimo Kreuzer;Define name and any external functions being called 100*4afb647cSTimo KreuzerEXTRN __remainder_piby2d2f_forC : PROC ; NEAR 101*4afb647cSTimo KreuzerEXTERN fname_special : PROC 102*4afb647cSTimo Kreuzer 103*4afb647cSTimo Kreuzer.code 104*4afb647cSTimo KreuzerALIGN 16 105*4afb647cSTimo KreuzerPUBLIC fname 106*4afb647cSTimo Kreuzerfname PROC FRAME 107*4afb647cSTimo Kreuzer StackAllocate stack_size 108*4afb647cSTimo Kreuzer .ENDPROLOG 109*4afb647cSTimo Kreuzer cmp DWORD PTR __use_fma3_lib, 0 110*4afb647cSTimo Kreuzer jne Lsinf_fma3 111*4afb647cSTimo Kreuzer 112*4afb647cSTimo KreuzerLsinf_sse2: 113*4afb647cSTimo Kreuzer 114*4afb647cSTimo Kreuzer xorpd xmm2, xmm2 ; zeroed out for later use 115*4afb647cSTimo Kreuzer 116*4afb647cSTimo Kreuzer;; if NaN or inf 117*4afb647cSTimo Kreuzer movd edx, xmm0 118*4afb647cSTimo Kreuzer mov eax, 07f800000h 119*4afb647cSTimo Kreuzer mov r10d, eax 120*4afb647cSTimo Kreuzer and r10d, edx 121*4afb647cSTimo Kreuzer cmp r10d, eax 122*4afb647cSTimo Kreuzer jz Lsinf_sse2_naninf 123*4afb647cSTimo Kreuzer 124*4afb647cSTimo Kreuzer; GET_BITS_DP64(x, ux); 125*4afb647cSTimo Kreuzer; get the input value to an integer register. 126*4afb647cSTimo Kreuzer cvtss2sd xmm0, xmm0 ; convert input to double. 127*4afb647cSTimo Kreuzer movd rdx, xmm0 ; rdx is ux 128*4afb647cSTimo Kreuzer 129*4afb647cSTimo Kreuzer; ax = (ux & ~SIGNBIT_DP64); 130*4afb647cSTimo Kreuzer mov r10, rdx 131*4afb647cSTimo Kreuzer btr r10, 63 ; r10 is ax 132*4afb647cSTimo Kreuzer mov r8d, 1 ; for determining region later on 133*4afb647cSTimo Kreuzer 134*4afb647cSTimo Kreuzer;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4 135*4afb647cSTimo Kreuzer mov rax, 03fe921fb54442d18h 136*4afb647cSTimo Kreuzer cmp r10, rax 137*4afb647cSTimo Kreuzer jg Lsinf_absx_gt_piby4 138*4afb647cSTimo Kreuzer 139*4afb647cSTimo Kreuzer;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7) 140*4afb647cSTimo Kreuzer mov rax, 3f80000000000000h 141*4afb647cSTimo Kreuzer cmp r10, rax 142*4afb647cSTimo Kreuzer jge Lsinf_sse2_small 143*4afb647cSTimo Kreuzer 144*4afb647cSTimo Kreuzer;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13) 145*4afb647cSTimo Kreuzer mov rax, 3f20000000000000h 146*4afb647cSTimo Kreuzer cmp r10, rax 147*4afb647cSTimo Kreuzer jge Lsinf_sse2_smaller 148*4afb647cSTimo Kreuzer 149*4afb647cSTimo Kreuzer; sinf = x; 150*4afb647cSTimo Kreuzer jmp Lsinf_sse2_cleanup 151*4afb647cSTimo Kreuzer 152*4afb647cSTimo KreuzerALIGN 16 153*4afb647cSTimo KreuzerLsinf_sse2_smaller: 154*4afb647cSTimo Kreuzer; sinf = x - x^3 * 0.1666666666666666666; 155*4afb647cSTimo Kreuzer movsd xmm2, xmm0 156*4afb647cSTimo Kreuzer movsd xmm4, QWORD PTR L_one_sixth ; 0.1666666666666666666 157*4afb647cSTimo Kreuzer mulsd xmm2, xmm2 ; x^2 158*4afb647cSTimo Kreuzer mulsd xmm2, xmm0 ; x^3 159*4afb647cSTimo Kreuzer mulsd xmm2, xmm4 ; x^3 * 0.1666666666666666666 160*4afb647cSTimo Kreuzer subsd xmm0, xmm2 ; x - x^3 * 0.1666666666666666666 161*4afb647cSTimo Kreuzer jmp Lsinf_sse2_cleanup 162*4afb647cSTimo Kreuzer 163*4afb647cSTimo KreuzerALIGN 16 164*4afb647cSTimo KreuzerLsinf_sse2_small: 165*4afb647cSTimo Kreuzer movsd xmm2, xmm0 ; x2 = r * r; 166*4afb647cSTimo Kreuzer mulsd xmm2, xmm0 ; x2 167*4afb647cSTimo Kreuzer 168*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 169*4afb647cSTimo Kreuzer; region 0 or 2 - do a sinf calculation 170*4afb647cSTimo Kreuzer; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); 171*4afb647cSTimo Kreuzer movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 172*4afb647cSTimo Kreuzer mulsd xmm1, xmm2 ; s4x2 173*4afb647cSTimo Kreuzer movsd xmm4, xmm2 ; move for x4 174*4afb647cSTimo Kreuzer movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 175*4afb647cSTimo Kreuzer mulsd xmm4, xmm2 ; x4 176*4afb647cSTimo Kreuzer movsd xmm3, xmm0 ; move for x3 177*4afb647cSTimo Kreuzer mulsd xmm5, xmm2 ; s2x2 178*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 ; x3 179*4afb647cSTimo Kreuzer addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 180*4afb647cSTimo Kreuzer mulsd xmm1, xmm4 ; s3x4+s4x6 181*4afb647cSTimo Kreuzer addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 182*4afb647cSTimo Kreuzer addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 183*4afb647cSTimo Kreuzer mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) 184*4afb647cSTimo Kreuzer addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) 185*4afb647cSTimo Kreuzer jmp Lsinf_sse2_cleanup 186*4afb647cSTimo Kreuzer 187*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 188*4afb647cSTimo KreuzerALIGN 16 189*4afb647cSTimo KreuzerLsinf_absx_gt_piby4: 190*4afb647cSTimo Kreuzer; xneg = (ax != ux); 191*4afb647cSTimo Kreuzer cmp rdx, r10 192*4afb647cSTimo Kreuzer mov r11d, 0 193*4afb647cSTimo Kreuzer;; if (xneg) x = -x; 194*4afb647cSTimo Kreuzer jz Lsinf_sse2_reduce_moderate 195*4afb647cSTimo Kreuzer 196*4afb647cSTimo Kreuzer mov r11d, 1 197*4afb647cSTimo Kreuzer subsd xmm2, xmm0 198*4afb647cSTimo Kreuzer movsd xmm0, xmm2 199*4afb647cSTimo Kreuzer 200*4afb647cSTimo KreuzerLsinf_sse2_reduce_moderate: 201*4afb647cSTimo Kreuzer;; if (x < 5.0e6) 202*4afb647cSTimo Kreuzer cmp r10, QWORD PTR L_large_x_sse2 203*4afb647cSTimo Kreuzer jae Lsinf_sse2_reduce_large 204*4afb647cSTimo Kreuzer 205*4afb647cSTimo Kreuzer; reduce the argument to be in a range from -pi/4 to +pi/4 206*4afb647cSTimo Kreuzer; by subtracting multiples of pi/2 207*4afb647cSTimo Kreuzer movsd xmm2, xmm0 208*4afb647cSTimo Kreuzer movsd xmm3, QWORD PTR L_twobypi 209*4afb647cSTimo Kreuzer movsd xmm4, xmm0 210*4afb647cSTimo Kreuzer movsd xmm5, QWORD PTR L_one_half ; .5 211*4afb647cSTimo Kreuzer mulsd xmm2, xmm3 212*4afb647cSTimo Kreuzer 213*4afb647cSTimo Kreuzer;/* How many pi/2 is x a multiple of? */ 214*4afb647cSTimo Kreuzer; xexp = ax >> EXPSHIFTBITS_DP64; 215*4afb647cSTimo Kreuzer mov r9, r10 216*4afb647cSTimo Kreuzer shr r9, 52 ; >>EXPSHIFTBITS_DP64 217*4afb647cSTimo Kreuzer 218*4afb647cSTimo Kreuzer; npi2 = (int)(x * twobypi + 0.5); 219*4afb647cSTimo Kreuzer addsd xmm2, xmm5 ; npi2 220*4afb647cSTimo Kreuzer 221*4afb647cSTimo Kreuzer movsd xmm3, QWORD PTR L_piby2_1 222*4afb647cSTimo Kreuzer cvttpd2dq xmm0, xmm2 ; convert to integer 223*4afb647cSTimo Kreuzer movsd xmm1, QWORD PTR L_piby2_1tail 224*4afb647cSTimo Kreuzer cvtdq2pd xmm2, xmm0 ; and back to double. 225*4afb647cSTimo Kreuzer 226*4afb647cSTimo Kreuzer; /* Subtract the multiple from x to get an extra-precision remainder */ 227*4afb647cSTimo Kreuzer; rhead = x - npi2 * piby2_1; 228*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 229*4afb647cSTimo Kreuzer subsd xmm4, xmm3 ; rhead 230*4afb647cSTimo Kreuzer 231*4afb647cSTimo Kreuzer; rtail = npi2 * piby2_1tail; 232*4afb647cSTimo Kreuzer mulsd xmm1, xmm2 233*4afb647cSTimo Kreuzer movd eax, xmm0 234*4afb647cSTimo Kreuzer 235*4afb647cSTimo Kreuzer; GET_BITS_DP64(rhead-rtail, uy); 236*4afb647cSTimo Kreuzer; originally only rhead 237*4afb647cSTimo Kreuzer movsd xmm0, xmm4 238*4afb647cSTimo Kreuzer subsd xmm0, xmm1 239*4afb647cSTimo Kreuzer 240*4afb647cSTimo Kreuzer movsd xmm3, QWORD PTR L_piby2_2 241*4afb647cSTimo Kreuzer movd rcx, xmm0 242*4afb647cSTimo Kreuzer movsd xmm5, QWORD PTR L_piby2_2tail 243*4afb647cSTimo Kreuzer 244*4afb647cSTimo Kreuzer; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc 245*4afb647cSTimo Kreuzer; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); 246*4afb647cSTimo Kreuzer shl rcx, 1 ; strip any sign bit 247*4afb647cSTimo Kreuzer shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 248*4afb647cSTimo Kreuzer sub r9, rcx ; expdiff 249*4afb647cSTimo Kreuzer 250*4afb647cSTimo Kreuzer;; if (expdiff > 15) 251*4afb647cSTimo Kreuzer cmp r9, 15 252*4afb647cSTimo Kreuzer jle Lsinf_sse2_expdiff_le_15 253*4afb647cSTimo Kreuzer 254*4afb647cSTimo Kreuzer; The remainder is pretty small compared with x, which 255*4afb647cSTimo Kreuzer; implies that x is a near multiple of pi/2 256*4afb647cSTimo Kreuzer; (x matches the multiple to at least 15 bits) 257*4afb647cSTimo Kreuzer; t = rhead; 258*4afb647cSTimo Kreuzer movsd xmm1, xmm4 259*4afb647cSTimo Kreuzer 260*4afb647cSTimo Kreuzer; rtail = npi2 * piby2_2; 261*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 262*4afb647cSTimo Kreuzer 263*4afb647cSTimo Kreuzer; rhead = t - rtail; 264*4afb647cSTimo Kreuzer mulsd xmm5, xmm2 ; npi2 * piby2_2tail 265*4afb647cSTimo Kreuzer subsd xmm4, xmm3 ; rhead 266*4afb647cSTimo Kreuzer 267*4afb647cSTimo Kreuzer; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 268*4afb647cSTimo Kreuzer subsd xmm1, xmm4 ; t - rhead 269*4afb647cSTimo Kreuzer subsd xmm1, xmm3 ; -rtail 270*4afb647cSTimo Kreuzer subsd xmm5, xmm1 ; rtail 271*4afb647cSTimo Kreuzer 272*4afb647cSTimo Kreuzer; r = rhead - rtail; 273*4afb647cSTimo Kreuzer movsd xmm0, xmm4 274*4afb647cSTimo Kreuzer 275*4afb647cSTimo Kreuzer;HARSHA 276*4afb647cSTimo Kreuzer;xmm1=rtail 277*4afb647cSTimo Kreuzer movsd xmm1, xmm5 278*4afb647cSTimo Kreuzer subsd xmm0, xmm5 279*4afb647cSTimo Kreuzer 280*4afb647cSTimo Kreuzer; xmm0=r, xmm4=rhead, xmm1=rtail 281*4afb647cSTimo Kreuzer 282*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 283*4afb647cSTimo KreuzerLsinf_sse2_expdiff_le_15: 284*4afb647cSTimo Kreuzer cmp rcx, 03f2h ; is r < 2^-13 ? 285*4afb647cSTimo Kreuzer jge Lsinf_sse2_calc_sincosf_piby4 ; use taylor series if not 286*4afb647cSTimo Kreuzer cmp rcx, 03deh ; if r really small. 287*4afb647cSTimo Kreuzer jle Lsinf_sse2_r_very_small ; then sinf(r) ~ r or 1 288*4afb647cSTimo Kreuzer 289*4afb647cSTimo Kreuzer movsd xmm2, xmm0 290*4afb647cSTimo Kreuzer mulsd xmm2, xmm0 ; xmm2 <-- r^2 291*4afb647cSTimo Kreuzer 292*4afb647cSTimo Kreuzer;; if region is 0 or 2 do a sinf calc. 293*4afb647cSTimo Kreuzer and r8d, eax 294*4afb647cSTimo Kreuzer jnz Lsinf_sse2_small_calc_sin 295*4afb647cSTimo Kreuzer 296*4afb647cSTimo Kreuzer; region 0 or 2 do a sinf calculation 297*4afb647cSTimo Kreuzer; use simply polynomial 298*4afb647cSTimo Kreuzer; x - x*x*x*0.166666666666666666; 299*4afb647cSTimo Kreuzer movsd xmm3, QWORD PTR L_one_sixth 300*4afb647cSTimo Kreuzer mulsd xmm3, xmm0 ; * x 301*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 ; * x^2 302*4afb647cSTimo Kreuzer subsd xmm0, xmm3 ; xs 303*4afb647cSTimo Kreuzer jmp Lsinf_sse2_adjust_region 304*4afb647cSTimo Kreuzer 305*4afb647cSTimo KreuzerALIGN 16 306*4afb647cSTimo KreuzerLsinf_sse2_small_calc_sin: 307*4afb647cSTimo Kreuzer; region 1 or 3 do a cosf calculation 308*4afb647cSTimo Kreuzer; use simply polynomial 309*4afb647cSTimo Kreuzer; 1.0 - x*x*0.5; 310*4afb647cSTimo Kreuzer movsd xmm0, QWORD PTR L_one ; 1.0 311*4afb647cSTimo Kreuzer mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 312*4afb647cSTimo Kreuzer subsd xmm0, xmm2 ; xc 313*4afb647cSTimo Kreuzer jmp Lsinf_sse2_adjust_region 314*4afb647cSTimo Kreuzer 315*4afb647cSTimo KreuzerALIGN 16 316*4afb647cSTimo KreuzerLsinf_sse2_r_very_small: 317*4afb647cSTimo Kreuzer;; if region is 0 or 2 do a sinf calc. (sinf ~ x) 318*4afb647cSTimo Kreuzer and r8d, eax 319*4afb647cSTimo Kreuzer jz Lsinf_sse2_adjust_region 320*4afb647cSTimo Kreuzer 321*4afb647cSTimo Kreuzer movsd xmm0, QWORD PTR L_one ; cosf(r) is a 1 322*4afb647cSTimo Kreuzer jmp Lsinf_sse2_adjust_region 323*4afb647cSTimo Kreuzer 324*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 325*4afb647cSTimo KreuzerALIGN 16 326*4afb647cSTimo KreuzerLsinf_sse2_reduce_large: 327*4afb647cSTimo Kreuzer; Reduce x into range [-pi/4, pi/4] 328*4afb647cSTimo Kreuzer; __remainder_piby2d2f_forC(x, &r, ®ion); 329*4afb647cSTimo Kreuzer 330*4afb647cSTimo Kreuzer mov QWORD PTR p_temp[rsp], r11 331*4afb647cSTimo Kreuzer lea rdx, QWORD PTR r[rsp] 332*4afb647cSTimo Kreuzer lea r8, QWORD PTR region[rsp] 333*4afb647cSTimo Kreuzer movd rcx, xmm0 334*4afb647cSTimo Kreuzer call __remainder_piby2d2f_forC 335*4afb647cSTimo Kreuzer mov r11, QWORD PTR p_temp[rsp] 336*4afb647cSTimo Kreuzer mov r8d, 1 ; for determining region later on 337*4afb647cSTimo Kreuzer movsd xmm1, QWORD PTR r[rsp] ; x 338*4afb647cSTimo Kreuzer mov eax, DWORD PTR region[rsp] ; region 339*4afb647cSTimo Kreuzer 340*4afb647cSTimo Kreuzer; xmm0 = x, xmm4 = xx, r8d = 1, eax= region 341*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 342*4afb647cSTimo Kreuzer 343*4afb647cSTimo Kreuzer; perform taylor series to calc sinfx, cosfx 344*4afb647cSTimo KreuzerLsinf_sse2_calc_sincosf_piby4: 345*4afb647cSTimo Kreuzer; x2 = r * r; 346*4afb647cSTimo Kreuzer movsd xmm2, xmm0 347*4afb647cSTimo Kreuzer mulsd xmm2, xmm0 ; x2 348*4afb647cSTimo Kreuzer 349*4afb647cSTimo Kreuzer;; if region is 1 or 3, do a cosf calc. 350*4afb647cSTimo Kreuzer and r8d, eax 351*4afb647cSTimo Kreuzer jnz Lsinf_sse2_do_cosf_calc 352*4afb647cSTimo Kreuzer 353*4afb647cSTimo Kreuzer; region is 0 or 2: do a sinf calc. 354*4afb647cSTimo Kreuzer; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); 355*4afb647cSTimo KreuzerLsinf_sse2_do_sinf_calc: 356*4afb647cSTimo Kreuzer movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 357*4afb647cSTimo Kreuzer mulsd xmm1, xmm2 ; s4x2 358*4afb647cSTimo Kreuzer movsd xmm4, xmm2 ; move for x4 359*4afb647cSTimo Kreuzer mulsd xmm4, xmm2 ; x4 360*4afb647cSTimo Kreuzer movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 361*4afb647cSTimo Kreuzer mulsd xmm5, xmm2 ; s2x2 362*4afb647cSTimo Kreuzer movsd xmm3, xmm0 ; move for x3 363*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 ; x3 364*4afb647cSTimo Kreuzer addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 365*4afb647cSTimo Kreuzer mulsd xmm1, xmm4 ; s3x4+s4x6 366*4afb647cSTimo Kreuzer addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 367*4afb647cSTimo Kreuzer addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 368*4afb647cSTimo Kreuzer mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) 369*4afb647cSTimo Kreuzer addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) 370*4afb647cSTimo Kreuzer jmp Lsinf_sse2_adjust_region 371*4afb647cSTimo Kreuzer 372*4afb647cSTimo KreuzerALIGN 16 373*4afb647cSTimo KreuzerLsinf_sse2_do_cosf_calc: 374*4afb647cSTimo Kreuzer 375*4afb647cSTimo Kreuzer; region 1 or 3 - do a cosf calculation 376*4afb647cSTimo Kreuzer; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; 377*4afb647cSTimo Kreuzer; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision 378*4afb647cSTimo Kreuzer movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 379*4afb647cSTimo Kreuzer movsd xmm4, xmm2 ; move for x4 380*4afb647cSTimo Kreuzer mulsd xmm1, xmm2 ; c4x2 381*4afb647cSTimo Kreuzer movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 382*4afb647cSTimo Kreuzer mulsd xmm4, xmm2 ; x4 383*4afb647cSTimo Kreuzer movsd xmm0, QWORD PTR __Lcosfarray ; c0 384*4afb647cSTimo Kreuzer mulsd xmm3, xmm2 ; c2x2 385*4afb647cSTimo Kreuzer mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) 386*4afb647cSTimo Kreuzer addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 387*4afb647cSTimo Kreuzer mulsd xmm1, xmm4 ; c3x4 + c4x6 388*4afb647cSTimo Kreuzer addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 389*4afb647cSTimo Kreuzer addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 390*4afb647cSTimo Kreuzer mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 391*4afb647cSTimo Kreuzer addsd xmm0, QWORD PTR L_one ; 1 - 0.5x2 392*4afb647cSTimo Kreuzer addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 393*4afb647cSTimo Kreuzer 394*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 395*4afb647cSTimo Kreuzer 396*4afb647cSTimo Kreuzer 397*4afb647cSTimo KreuzerLsinf_sse2_adjust_region: 398*4afb647cSTimo Kreuzer; positive or negative 399*4afb647cSTimo Kreuzer; switch (region) 400*4afb647cSTimo Kreuzer shr eax, 1 401*4afb647cSTimo Kreuzer mov ecx, eax 402*4afb647cSTimo Kreuzer and eax, r11d 403*4afb647cSTimo Kreuzer 404*4afb647cSTimo Kreuzer not ecx 405*4afb647cSTimo Kreuzer not r11d 406*4afb647cSTimo Kreuzer and ecx, r11d 407*4afb647cSTimo Kreuzer 408*4afb647cSTimo Kreuzer or eax, ecx 409*4afb647cSTimo Kreuzer and eax, 1 410*4afb647cSTimo Kreuzer jnz Lsinf_sse2_cleanup 411*4afb647cSTimo Kreuzer 412*4afb647cSTimo Kreuzer;; if the original region 0, 1 and arg is negative, then we negate the result. 413*4afb647cSTimo Kreuzer;; if the original region 2, 3 and arg is positive, then we negate the result. 414*4afb647cSTimo Kreuzer movsd xmm2, xmm0 415*4afb647cSTimo Kreuzer xorpd xmm0, xmm0 416*4afb647cSTimo Kreuzer subsd xmm0, xmm2 417*4afb647cSTimo Kreuzer 418*4afb647cSTimo Kreuzer 419*4afb647cSTimo KreuzerLsinf_sse2_cleanup: 420*4afb647cSTimo Kreuzer cvtsd2ss xmm0, xmm0 421*4afb647cSTimo Kreuzer StackDeallocate stack_size 422*4afb647cSTimo Kreuzer ret 423*4afb647cSTimo Kreuzer 424*4afb647cSTimo KreuzerALIGN 16 425*4afb647cSTimo KreuzerLsinf_sse2_naninf: 426*4afb647cSTimo Kreuzer call fname_special 427*4afb647cSTimo Kreuzer StackDeallocate stack_size 428*4afb647cSTimo Kreuzer ret 429*4afb647cSTimo Kreuzer 430*4afb647cSTimo KreuzerALIGN 16 431*4afb647cSTimo KreuzerLsinf_fma3: 432*4afb647cSTimo Kreuzer vmovd eax,xmm0 433*4afb647cSTimo Kreuzer mov r8d,L_inf_mask_32 434*4afb647cSTimo Kreuzer and eax,r8d 435*4afb647cSTimo Kreuzer cmp eax, r8d 436*4afb647cSTimo Kreuzer jz Lsinf_fma3_naninf 437*4afb647cSTimo Kreuzer 438*4afb647cSTimo Kreuzer vcvtss2sd xmm5,xmm0,xmm0 439*4afb647cSTimo Kreuzer vmovq r9,xmm5 440*4afb647cSTimo Kreuzer btr r9,63 ; r9 <-- |x| 441*4afb647cSTimo Kreuzer cmp r9,L_piby4 442*4afb647cSTimo Kreuzer jg Lsinf_fma3_range_reduce 443*4afb647cSTimo Kreuzer 444*4afb647cSTimo Kreuzer cmp r9,L_mask_3f8 445*4afb647cSTimo Kreuzer jge Lsinf_fma3_compute_sinf_piby_4 446*4afb647cSTimo Kreuzer 447*4afb647cSTimo Kreuzer cmp r9,L_mask_3f2 448*4afb647cSTimo Kreuzer jge Lsinf_fma3_compute_x_xxx_0_1666 449*4afb647cSTimo Kreuzer 450*4afb647cSTimo Kreuzer ; Here |x| < 2^-13; just return sin x ~ x 451*4afb647cSTimo Kreuzer StackDeallocate stack_size 452*4afb647cSTimo Kreuzer ret 453*4afb647cSTimo Kreuzer 454*4afb647cSTimo KreuzerALIGN 16 455*4afb647cSTimo KreuzerLsinf_fma3_compute_x_xxx_0_1666: 456*4afb647cSTimo Kreuzer ; Here |x| < 2^-7; return sin x ~ x + 1/6 x^3 457*4afb647cSTimo Kreuzer vmulsd xmm1,xmm5,xmm5 458*4afb647cSTimo Kreuzer vmulsd xmm0,xmm1,xmm5 ; xmm1 <-- x^3 459*4afb647cSTimo Kreuzer vfnmadd132sd xmm0,xmm5,L_one_sixth ; x - x*x*x*0.166666666666666666 460*4afb647cSTimo Kreuzer jmp Lsinf_fma3_return_sinf_s 461*4afb647cSTimo Kreuzer 462*4afb647cSTimo KreuzerALIGN 16 463*4afb647cSTimo KreuzerLsinf_fma3_compute_sinf_piby_4: 464*4afb647cSTimo Kreuzer vmovapd xmm0,xmm5 465*4afb647cSTimo Kreuzer vmovsd xmm1,__Lsinfarray+010h 466*4afb647cSTimo Kreuzer vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2 467*4afb647cSTimo Kreuzer vfmadd231sd xmm1,xmm3,__Lsinfarray+018h 468*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lsinfarray+08h 469*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lsinfarray 470*4afb647cSTimo Kreuzer vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 471*4afb647cSTimo Kreuzer vfmadd231sd xmm0,xmm1,xmm3 472*4afb647cSTimo Kreuzer jmp Lsinf_fma3_return_sinf_s 473*4afb647cSTimo Kreuzer 474*4afb647cSTimo KreuzerALIGN 16 475*4afb647cSTimo KreuzerLsinf_fma3_range_reduce: 476*4afb647cSTimo Kreuzer vmovq xmm0,r9 ; xmm0 <-- |x| 477*4afb647cSTimo Kreuzer cmp r9,L_large_x_fma3 478*4afb647cSTimo Kreuzer jge Lsinf_fma3_reduce_large 479*4afb647cSTimo Kreuzer 480*4afb647cSTimo KreuzerLsinf_fma3_sinf_reduce_moderate: 481*4afb647cSTimo Kreuzer vandpd xmm1,xmm0,L_sign_mask ; xmm1 <-- |x| mov should suffice WAT 482*4afb647cSTimo Kreuzer vmovapd xmm2,L_twobypi 483*4afb647cSTimo Kreuzer vfmadd213sd xmm2,xmm1,L_one_half 484*4afb647cSTimo Kreuzer vcvttpd2dq xmm2,xmm2 485*4afb647cSTimo Kreuzer vpmovsxdq xmm1,xmm2 486*4afb647cSTimo Kreuzer vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region 487*4afb647cSTimo Kreuzer vshufps xmm1 ,xmm1,xmm1,8 488*4afb647cSTimo Kreuzer vcvtdq2pd xmm1,xmm1 489*4afb647cSTimo Kreuzer vmovdqa xmm2,xmm0 490*4afb647cSTimo Kreuzer vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 <-- rhead 491*4afb647cSTimo Kreuzer vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 <-- rtail 492*4afb647cSTimo Kreuzer vsubsd xmm0,xmm2,xmm3 ; xmm0 <-- r_1 493*4afb647cSTimo Kreuzer vsubsd xmm2,xmm2,xmm0 494*4afb647cSTimo Kreuzer vsubsd xmm1,xmm2,xmm3 ; xmm4 <-- rr_1 495*4afb647cSTimo Kreuzer jmp Lsinf_fma3_exit_s 496*4afb647cSTimo Kreuzer 497*4afb647cSTimo KreuzerALIGN 16 498*4afb647cSTimo KreuzerLsinf_fma3_reduce_large: 499*4afb647cSTimo Kreuzer lea r9,__L_2_by_pi_bits 500*4afb647cSTimo Kreuzer ;xexp = (x >> 52) 1023 501*4afb647cSTimo Kreuzer vmovq r11,xmm0 502*4afb647cSTimo Kreuzer mov rcx,r11 503*4afb647cSTimo Kreuzer shr r11,52 504*4afb647cSTimo Kreuzer sub r11,1023 ; r11 <-- xexp = exponent of input x 505*4afb647cSTimo Kreuzer ;calculate the last byte from which to start multiplication 506*4afb647cSTimo Kreuzer ;last = 134 (xexp >> 3) 507*4afb647cSTimo Kreuzer mov r10,r11 508*4afb647cSTimo Kreuzer shr r10,3 509*4afb647cSTimo Kreuzer sub r10,134 ;r10 = last 510*4afb647cSTimo Kreuzer neg r10 ;r10 = last 511*4afb647cSTimo Kreuzer ;load 64 bits of 2_by_pi 512*4afb647cSTimo Kreuzer mov rax,[r9+r10] 513*4afb647cSTimo Kreuzer ;mantissa of x = ((x << 12) >> 12) | implied bit 514*4afb647cSTimo Kreuzer shl rcx,12 515*4afb647cSTimo Kreuzer shr rcx,12 ;rcx = mantissa part of input x 516*4afb647cSTimo Kreuzer bts rcx,52 ;add the implied bit as well 517*4afb647cSTimo Kreuzer ;load next 128 bits of 2_by_pi 518*4afb647cSTimo Kreuzer add r10,8 ;increment to next 8 bytes of 2_by_pi 519*4afb647cSTimo Kreuzer vmovdqu xmm0,XMMWORD PTR[r9+r10] 520*4afb647cSTimo Kreuzer ;do three 64bit multiplications with mant of x 521*4afb647cSTimo Kreuzer mul rcx 522*4afb647cSTimo Kreuzer mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] 523*4afb647cSTimo Kreuzer mov r10,rdx ; r10 <-- carry 524*4afb647cSTimo Kreuzer vmovq rax,xmm0 525*4afb647cSTimo Kreuzer mul rcx 526*4afb647cSTimo Kreuzer ;resexp = xexp & 7 527*4afb647cSTimo Kreuzer and r11,7 ; r11 <-- resexp = last 3 bits 528*4afb647cSTimo Kreuzer psrldq xmm0,8 529*4afb647cSTimo Kreuzer add rax,r10 ; add the previous carry 530*4afb647cSTimo Kreuzer adc rdx,0 531*4afb647cSTimo Kreuzer mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] 532*4afb647cSTimo Kreuzer mov r10,rdx ; r10 <-- carry 533*4afb647cSTimo Kreuzer vmovq rax,xmm0 534*4afb647cSTimo Kreuzer mul rcx 535*4afb647cSTimo Kreuzer add r10,rax ; r10 = most sig 64 bits = res1[0] 536*4afb647cSTimo Kreuzer ;find the region 537*4afb647cSTimo Kreuzer ;last three bits ltb = most sig bits >> (54 resexp)) 538*4afb647cSTimo Kreuzer ; decimal point in last 18 bits == 8 lsb's in first 64 bits 539*4afb647cSTimo Kreuzer ; and 8 msb's in next 64 bits 540*4afb647cSTimo Kreuzer ;point_five = ltb & 01h; 541*4afb647cSTimo Kreuzer ;region = ((ltb >> 1) + point_five) & 3; 542*4afb647cSTimo Kreuzer mov rcx,54 543*4afb647cSTimo Kreuzer mov rax,r10 544*4afb647cSTimo Kreuzer sub rcx,r11 545*4afb647cSTimo Kreuzer xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi) 546*4afb647cSTimo Kreuzer shr rax,cl 547*4afb647cSTimo Kreuzer jnc Lsinf_fma3_no_point_five_f 548*4afb647cSTimo Kreuzer ;;if there is carry.. then negate the result of multiplication 549*4afb647cSTimo Kreuzer not r10 550*4afb647cSTimo Kreuzer not r9 551*4afb647cSTimo Kreuzer not r8 552*4afb647cSTimo Kreuzer mov rdx,08000000000000000h 553*4afb647cSTimo Kreuzer 554*4afb647cSTimo KreuzerLsinf_fma3_no_point_five_f: 555*4afb647cSTimo Kreuzer adc rax,0 556*4afb647cSTimo Kreuzer and rax,3 557*4afb647cSTimo Kreuzer vmovd xmm4,eax ;store region to xmm4 558*4afb647cSTimo Kreuzer ;calculate the number of integer bits and zero them out 559*4afb647cSTimo Kreuzer mov rcx,r11 560*4afb647cSTimo Kreuzer add rcx,10 ; rcx <-- no. of integer bits 561*4afb647cSTimo Kreuzer shl r10,cl 562*4afb647cSTimo Kreuzer shr r10,cl ; r10 contains only mant bits 563*4afb647cSTimo Kreuzer sub rcx,64 ; form the exponent 564*4afb647cSTimo Kreuzer mov r11,rcx 565*4afb647cSTimo Kreuzer ;find the highest set bit 566*4afb647cSTimo Kreuzer bsr rcx,r10 567*4afb647cSTimo Kreuzer jnz Lsinf_fma3_form_mantissa_f 568*4afb647cSTimo Kreuzer mov r10,r9 569*4afb647cSTimo Kreuzer mov r9,r8 570*4afb647cSTimo Kreuzer mov r8,0 571*4afb647cSTimo Kreuzer bsr rcx,r10 ; rcx <-- hsb 572*4afb647cSTimo Kreuzer sub r11,64 573*4afb647cSTimo Kreuzer 574*4afb647cSTimo KreuzerLsinf_fma3_form_mantissa_f: 575*4afb647cSTimo Kreuzer add r11,rcx ;for exp of x 576*4afb647cSTimo Kreuzer sub rcx,52 ;rcx = no. of bits to shift in r10 577*4afb647cSTimo Kreuzer cmp rcx,0 578*4afb647cSTimo Kreuzer jl Lsinf_fma3_hsb_below_52_f 579*4afb647cSTimo Kreuzer je Lsinf_fma3_form_numbers_f 580*4afb647cSTimo Kreuzer ;hsb above 52 581*4afb647cSTimo Kreuzer mov r8,r10 ; previous contents of r8 not required 582*4afb647cSTimo Kreuzer shr r10,cl ; r10 = mantissa of x with hsb at 52 583*4afb647cSTimo Kreuzer shr r9,cl ; make space for bits from r10 584*4afb647cSTimo Kreuzer sub rcx,64 585*4afb647cSTimo Kreuzer neg rcx 586*4afb647cSTimo Kreuzer shl r8,cl 587*4afb647cSTimo Kreuzer or r9,r8 ; r9 = mantissa bits of xx 588*4afb647cSTimo Kreuzer jmp Lsinf_fma3_form_numbers_f 589*4afb647cSTimo Kreuzer 590*4afb647cSTimo KreuzerALIGN 16 591*4afb647cSTimo KreuzerLsinf_fma3_hsb_below_52_f: 592*4afb647cSTimo Kreuzer neg rcx 593*4afb647cSTimo Kreuzer mov rax,r9 594*4afb647cSTimo Kreuzer shl r10,cl 595*4afb647cSTimo Kreuzer shl r9,cl 596*4afb647cSTimo Kreuzer sub rcx,64 597*4afb647cSTimo Kreuzer neg rcx 598*4afb647cSTimo Kreuzer shr rax,cl 599*4afb647cSTimo Kreuzer or r10,rax 600*4afb647cSTimo Kreuzer shr r8,cl 601*4afb647cSTimo Kreuzer or r9,r8 602*4afb647cSTimo Kreuzer 603*4afb647cSTimo KreuzerALIGN 16 604*4afb647cSTimo KreuzerLsinf_fma3_form_numbers_f: 605*4afb647cSTimo Kreuzer add r11,1023 606*4afb647cSTimo Kreuzer btr r10,52 ; remove the implied bit 607*4afb647cSTimo Kreuzer mov rcx,r11 608*4afb647cSTimo Kreuzer or r10,rdx ; put the sign 609*4afb647cSTimo Kreuzer shl rcx,52 610*4afb647cSTimo Kreuzer or r10,rcx ; r10 <-- x 611*4afb647cSTimo Kreuzer vmovq xmm0,r10 ; xmm0 <-- x 612*4afb647cSTimo Kreuzer vmulsd xmm0,xmm0,L_piby2_lead 613*4afb647cSTimo KreuzerLsinf_fma3_exit_s: 614*4afb647cSTimo Kreuzer vmovq rax,xmm4 615*4afb647cSTimo Kreuzer and rax,01h 616*4afb647cSTimo Kreuzer cmp rax,01h 617*4afb647cSTimo Kreuzer jz Lsinf_fma3_cos_piby4_compute 618*4afb647cSTimo Kreuzer 619*4afb647cSTimo KreuzerLsinf_fma3_sin_piby4_compute: 620*4afb647cSTimo Kreuzer;; vmovapd xmm1,__Lsinfarray+010h 621*4afb647cSTimo Kreuzer vmovsd xmm1,__Lsinfarray+010h 622*4afb647cSTimo Kreuzer vmulsd xmm3,xmm0,xmm0 623*4afb647cSTimo Kreuzer vfmadd231sd xmm1,xmm3,__Lsinfarray+018h 624*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lsinfarray+008h 625*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lsinfarray 626*4afb647cSTimo Kreuzer vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 627*4afb647cSTimo Kreuzer vfmadd231sd xmm0,xmm1,xmm3 628*4afb647cSTimo Kreuzer jmp Lsinf_fma3_exit_s_1 629*4afb647cSTimo Kreuzer 630*4afb647cSTimo KreuzerALIGN 16 631*4afb647cSTimo KreuzerLsinf_fma3_cos_piby4_compute: 632*4afb647cSTimo Kreuzer vmovapd xmm2,L_one 633*4afb647cSTimo Kreuzer vmulsd xmm3,xmm0,xmm0 634*4afb647cSTimo Kreuzer vfmadd231sd xmm2,xmm3,__Lcosfarray ; xmm2 <-- 1 + c0 x^2 635*4afb647cSTimo Kreuzer ; would simple Horner's be slower? 636*4afb647cSTimo Kreuzer vmovsd xmm1,__Lcosfarray+018h ; xmm1 <-- c3 637*4afb647cSTimo Kreuzer vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3 638*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2 639*4afb647cSTimo Kreuzer vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1 640*4afb647cSTimo Kreuzer vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 641*4afb647cSTimo Kreuzer vmovdqa xmm0,xmm2 642*4afb647cSTimo Kreuzer vfmadd231sd xmm0,xmm1,xmm3 643*4afb647cSTimo KreuzerLsinf_fma3_exit_s_1: 644*4afb647cSTimo Kreuzer ; assuming FMA3 ==> AVX ==> SSE4.1 645*4afb647cSTimo Kreuzer vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two 646*4afb647cSTimo Kreuzer vpcmpeqq xmm3,xmm4,XMMWORD PTR L_int_three 647*4afb647cSTimo Kreuzer vorpd xmm3,xmm2,xmm3 648*4afb647cSTimo Kreuzer vandnpd xmm3,xmm3,L_signbit 649*4afb647cSTimo Kreuzer vxorpd xmm0,xmm0,xmm3 650*4afb647cSTimo Kreuzer 651*4afb647cSTimo Kreuzer vandnpd xmm1,xmm5,L_signbit 652*4afb647cSTimo Kreuzer vxorpd xmm0,xmm1,xmm0 653*4afb647cSTimo KreuzerLsinf_fma3_return_sinf_s: 654*4afb647cSTimo Kreuzer vcvtsd2ss xmm0,xmm0,xmm0 655*4afb647cSTimo Kreuzer StackDeallocate stack_size 656*4afb647cSTimo Kreuzer ret 657*4afb647cSTimo Kreuzer 658*4afb647cSTimo KreuzerLsinf_fma3_naninf: 659*4afb647cSTimo Kreuzer call fname_special 660*4afb647cSTimo Kreuzer StackDeallocate stack_size 661*4afb647cSTimo Kreuzer ret 662*4afb647cSTimo Kreuzer 663*4afb647cSTimo Kreuzerfname endp 664*4afb647cSTimo KreuzerEND 665