xref: /reactos/sdk/lib/crt/math/libm_sse2/sinf.asm (revision 4afb647c)
1*4afb647cSTimo Kreuzer;
2*4afb647cSTimo Kreuzer; MIT License
3*4afb647cSTimo Kreuzer; -----------
4*4afb647cSTimo Kreuzer;
5*4afb647cSTimo Kreuzer; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6*4afb647cSTimo Kreuzer;
7*4afb647cSTimo Kreuzer; Permission is hereby granted, free of charge, to any person obtaining a copy
8*4afb647cSTimo Kreuzer; of this Software and associated documentaon files (the "Software"), to deal
9*4afb647cSTimo Kreuzer; in the Software without restriction, including without limitation the rights
10*4afb647cSTimo Kreuzer; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11*4afb647cSTimo Kreuzer; copies of the Software, and to permit persons to whom the Software is
12*4afb647cSTimo Kreuzer; furnished to do so, subject to the following conditions:
13*4afb647cSTimo Kreuzer;
14*4afb647cSTimo Kreuzer; The above copyright notice and this permission notice shall be included in
15*4afb647cSTimo Kreuzer; all copies or substantial portions of the Software.
16*4afb647cSTimo Kreuzer;
17*4afb647cSTimo Kreuzer; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18*4afb647cSTimo Kreuzer; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19*4afb647cSTimo Kreuzer; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20*4afb647cSTimo Kreuzer; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21*4afb647cSTimo Kreuzer; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22*4afb647cSTimo Kreuzer; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23*4afb647cSTimo Kreuzer; THE SOFTWARE.
24*4afb647cSTimo Kreuzer;
25*4afb647cSTimo Kreuzer;
26*4afb647cSTimo Kreuzer; An implementation of the sinf function.
27*4afb647cSTimo Kreuzer;
28*4afb647cSTimo Kreuzer; Prototype
29*4afb647cSTimo Kreuzer;
30*4afb647cSTimo Kreuzer;     float sinf(float x);
31*4afb647cSTimo Kreuzer;
32*4afb647cSTimo Kreuzer;   Computes sinf(x).
33*4afb647cSTimo Kreuzer;   It will provide proper C99 return values,
34*4afb647cSTimo Kreuzer;   but may not raise floating point status bits properly.
35*4afb647cSTimo Kreuzer;   Based on the NAG C implementation.
36*4afb647cSTimo Kreuzer;
37*4afb647cSTimo Kreuzer
38*4afb647cSTimo Kreuzer.const
39*4afb647cSTimo KreuzerALIGN 16
40*4afb647cSTimo KreuzerL_signbit        DQ 08000000000000000h
41*4afb647cSTimo Kreuzer                 DQ 08000000000000000h
42*4afb647cSTimo KreuzerL_sign_mask      DQ 07FFFFFFFFFFFFFFFh
43*4afb647cSTimo Kreuzer                 DQ 07FFFFFFFFFFFFFFFh
44*4afb647cSTimo KreuzerL_one            DQ 03FF0000000000000h
45*4afb647cSTimo Kreuzer                 DQ 03FF0000000000000h
46*4afb647cSTimo KreuzerL_int_three      DQ 00000000000000003h
47*4afb647cSTimo Kreuzer                 DQ 00000000000000003h
48*4afb647cSTimo KreuzerL_one_half       DQ 03FE0000000000000h
49*4afb647cSTimo Kreuzer                 DQ 03FE0000000000000h
50*4afb647cSTimo KreuzerL_twobypi        DQ 03FE45F306DC9C883h
51*4afb647cSTimo Kreuzer                 DQ 03FE45F306DC9C883h
52*4afb647cSTimo KreuzerL_piby2_1        DQ 03FF921FB54400000h
53*4afb647cSTimo Kreuzer                 DQ 03FF921FB54400000h
54*4afb647cSTimo KreuzerL_one_sixth      DQ 03FC5555555555555h
55*4afb647cSTimo Kreuzer                 DQ 03FC5555555555555h
56*4afb647cSTimo KreuzerL_piby2_1tail    DQ 03DD0B4611A626331h
57*4afb647cSTimo Kreuzer                 DQ 03DD0B4611A626331h
58*4afb647cSTimo KreuzerL_piby2_2        DQ 03dd0b4611a600000h
59*4afb647cSTimo Kreuzer                 DQ 03dd0b4611a600000h
60*4afb647cSTimo KreuzerL_piby2_2tail    DQ 03ba3198a2e037073h
61*4afb647cSTimo Kreuzer                 DQ 03ba3198a2e037073h
62*4afb647cSTimo KreuzerL_inf_mask_32    DD 07F800000h
63*4afb647cSTimo Kreuzer                 DD 07F800000h
64*4afb647cSTimo Kreuzer                 DQ 07F8000007F800000h
65*4afb647cSTimo KreuzerL_int_two        DQ 00000000000000002h
66*4afb647cSTimo Kreuzer                 DQ 00000000000000002h
67*4afb647cSTimo KreuzerL_piby2_lead     DQ 03ff921fb54442d18h
68*4afb647cSTimo Kreuzer                 DQ 03ff921fb54442d18h
69*4afb647cSTimo KreuzerL_piby4          DQ 03fe921fb54442d18h
70*4afb647cSTimo Kreuzer                 DQ 03fe921fb54442d18h
71*4afb647cSTimo KreuzerL_mask_3f2       DQ 03f20000000000000h
72*4afb647cSTimo Kreuzer                 DQ 03f20000000000000h
73*4afb647cSTimo KreuzerL_mask_3f8       DQ 03f80000000000000h
74*4afb647cSTimo Kreuzer                 DQ 03f80000000000000h
75*4afb647cSTimo Kreuzer
76*4afb647cSTimo Kreuzer; Do these really need to be different?
77*4afb647cSTimo KreuzerL_large_x_fma3   DQ 04170008AC0000000h     ; 16779436
78*4afb647cSTimo KreuzerL_large_x_sse2   DQ 0416E848000000000h     ; 16000000
79*4afb647cSTimo Kreuzer
80*4afb647cSTimo KreuzerEXTRN __Lcosfarray:QWORD
81*4afb647cSTimo KreuzerEXTRN __Lsinfarray:QWORD
82*4afb647cSTimo KreuzerEXTRN __use_fma3_lib:DWORD
83*4afb647cSTimo KreuzerEXTRN __L_2_by_pi_bits:BYTE
84*4afb647cSTimo Kreuzer
85*4afb647cSTimo Kreuzer; define local variable storage offsets
86*4afb647cSTimo Kreuzerp_temp              EQU 010h     ; temporary for get/put bits operation
87*4afb647cSTimo Kreuzerp_temp1             EQU 018h     ; temporary for get/put bits operation
88*4afb647cSTimo Kreuzerregion              EQU 020h     ; pointer to region for remainder_piby2
89*4afb647cSTimo Kreuzerr                   EQU 028h     ; pointer to r for remainder_piby2
90*4afb647cSTimo Kreuzerdummy_space         EQU 040h
91*4afb647cSTimo Kreuzer
92*4afb647cSTimo Kreuzerstack_size          EQU 058h
93*4afb647cSTimo Kreuzer
94*4afb647cSTimo Kreuzerinclude fm.inc
95*4afb647cSTimo Kreuzer
96*4afb647cSTimo Kreuzerfname           TEXTEQU <sinf>
97*4afb647cSTimo Kreuzerfname_special   TEXTEQU <_sinf_special>
98*4afb647cSTimo Kreuzer
99*4afb647cSTimo Kreuzer;Define name and any external functions being called
100*4afb647cSTimo KreuzerEXTRN           __remainder_piby2d2f_forC : PROC    ; NEAR
101*4afb647cSTimo KreuzerEXTERN          fname_special        : PROC
102*4afb647cSTimo Kreuzer
103*4afb647cSTimo Kreuzer.code
104*4afb647cSTimo KreuzerALIGN 16
105*4afb647cSTimo KreuzerPUBLIC fname
106*4afb647cSTimo Kreuzerfname PROC FRAME
107*4afb647cSTimo Kreuzer    StackAllocate stack_size
108*4afb647cSTimo Kreuzer    .ENDPROLOG
109*4afb647cSTimo Kreuzer    cmp          DWORD PTR __use_fma3_lib, 0
110*4afb647cSTimo Kreuzer    jne          Lsinf_fma3
111*4afb647cSTimo Kreuzer
112*4afb647cSTimo KreuzerLsinf_sse2:
113*4afb647cSTimo Kreuzer
114*4afb647cSTimo Kreuzer    xorpd   xmm2, xmm2                                ; zeroed out for later use
115*4afb647cSTimo Kreuzer
116*4afb647cSTimo Kreuzer;;  if NaN or inf
117*4afb647cSTimo Kreuzer    movd    edx, xmm0
118*4afb647cSTimo Kreuzer    mov     eax, 07f800000h
119*4afb647cSTimo Kreuzer    mov     r10d, eax
120*4afb647cSTimo Kreuzer    and     r10d, edx
121*4afb647cSTimo Kreuzer    cmp     r10d, eax
122*4afb647cSTimo Kreuzer    jz      Lsinf_sse2_naninf
123*4afb647cSTimo Kreuzer
124*4afb647cSTimo Kreuzer; GET_BITS_DP64(x, ux);
125*4afb647cSTimo Kreuzer; get the input value to an integer register.
126*4afb647cSTimo Kreuzer    cvtss2sd     xmm0, xmm0               ; convert input to double.
127*4afb647cSTimo Kreuzer    movd         rdx, xmm0                ; rdx is ux
128*4afb647cSTimo Kreuzer
129*4afb647cSTimo Kreuzer;  ax = (ux & ~SIGNBIT_DP64);
130*4afb647cSTimo Kreuzer    mov     r10, rdx
131*4afb647cSTimo Kreuzer    btr     r10, 63                                   ; r10 is ax
132*4afb647cSTimo Kreuzer    mov     r8d, 1                                    ; for determining region later on
133*4afb647cSTimo Kreuzer
134*4afb647cSTimo Kreuzer;;  if (ax <= 0x3fe921fb54442d18)  abs(x) <= pi/4
135*4afb647cSTimo Kreuzer    mov     rax, 03fe921fb54442d18h
136*4afb647cSTimo Kreuzer    cmp     r10, rax
137*4afb647cSTimo Kreuzer    jg      Lsinf_absx_gt_piby4
138*4afb647cSTimo Kreuzer
139*4afb647cSTimo Kreuzer;;      if (ax < 0x3f80000000000000)  abs(x) < 2.0^(-7)
140*4afb647cSTimo Kreuzer    mov     rax, 3f80000000000000h
141*4afb647cSTimo Kreuzer    cmp     r10, rax
142*4afb647cSTimo Kreuzer    jge     Lsinf_sse2_small
143*4afb647cSTimo Kreuzer
144*4afb647cSTimo Kreuzer;;          if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
145*4afb647cSTimo Kreuzer    mov     rax, 3f20000000000000h
146*4afb647cSTimo Kreuzer    cmp     r10, rax
147*4afb647cSTimo Kreuzer    jge     Lsinf_sse2_smaller
148*4afb647cSTimo Kreuzer
149*4afb647cSTimo Kreuzer;                  sinf = x;
150*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_cleanup
151*4afb647cSTimo Kreuzer
152*4afb647cSTimo KreuzerALIGN 16
153*4afb647cSTimo KreuzerLsinf_sse2_smaller:
154*4afb647cSTimo Kreuzer;   sinf = x - x^3 * 0.1666666666666666666;
155*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0
156*4afb647cSTimo Kreuzer    movsd   xmm4, QWORD PTR L_one_sixth   ; 0.1666666666666666666
157*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm2                    ; x^2
158*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm0                    ; x^3
159*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm4                    ; x^3 * 0.1666666666666666666
160*4afb647cSTimo Kreuzer    subsd   xmm0, xmm2                    ; x - x^3 * 0.1666666666666666666
161*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_cleanup
162*4afb647cSTimo Kreuzer
163*4afb647cSTimo KreuzerALIGN 16
164*4afb647cSTimo KreuzerLsinf_sse2_small:
165*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0                    ; x2 = r * r;
166*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm0                    ; x2
167*4afb647cSTimo Kreuzer
168*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
169*4afb647cSTimo Kreuzer; region 0 or 2     - do a sinf calculation
170*4afb647cSTimo Kreuzer;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
171*4afb647cSTimo Kreuzer    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
172*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm2                                ; s4x2
173*4afb647cSTimo Kreuzer    movsd   xmm4, xmm2                                ; move for x4
174*4afb647cSTimo Kreuzer    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
175*4afb647cSTimo Kreuzer    mulsd   xmm4, xmm2                                ; x4
176*4afb647cSTimo Kreuzer    movsd   xmm3, xmm0                                ; move for x3
177*4afb647cSTimo Kreuzer    mulsd   xmm5, xmm2                                ; s2x2
178*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2                                ; x3
179*4afb647cSTimo Kreuzer    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
180*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm4                                ; s3x4+s4x6
181*4afb647cSTimo Kreuzer    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
182*4afb647cSTimo Kreuzer    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
183*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
184*4afb647cSTimo Kreuzer    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
185*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_cleanup
186*4afb647cSTimo Kreuzer
187*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188*4afb647cSTimo KreuzerALIGN 16
189*4afb647cSTimo KreuzerLsinf_absx_gt_piby4:
190*4afb647cSTimo Kreuzer;  xneg = (ax != ux);
191*4afb647cSTimo Kreuzer    cmp     rdx, r10
192*4afb647cSTimo Kreuzer    mov     r11d, 0
193*4afb647cSTimo Kreuzer;;  if (xneg) x = -x;
194*4afb647cSTimo Kreuzer    jz      Lsinf_sse2_reduce_moderate
195*4afb647cSTimo Kreuzer
196*4afb647cSTimo Kreuzer    mov     r11d, 1
197*4afb647cSTimo Kreuzer    subsd   xmm2, xmm0
198*4afb647cSTimo Kreuzer    movsd   xmm0, xmm2
199*4afb647cSTimo Kreuzer
200*4afb647cSTimo KreuzerLsinf_sse2_reduce_moderate:
201*4afb647cSTimo Kreuzer;;  if (x < 5.0e6)
202*4afb647cSTimo Kreuzer    cmp     r10, QWORD PTR L_large_x_sse2
203*4afb647cSTimo Kreuzer    jae     Lsinf_sse2_reduce_large
204*4afb647cSTimo Kreuzer
205*4afb647cSTimo Kreuzer; reduce  the argument to be in a range from -pi/4 to +pi/4
206*4afb647cSTimo Kreuzer; by subtracting multiples of pi/2
207*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0
208*4afb647cSTimo Kreuzer    movsd   xmm3, QWORD PTR L_twobypi
209*4afb647cSTimo Kreuzer    movsd   xmm4, xmm0
210*4afb647cSTimo Kreuzer    movsd   xmm5, QWORD PTR L_one_half              ; .5
211*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm3
212*4afb647cSTimo Kreuzer
213*4afb647cSTimo Kreuzer;/* How many pi/2 is x a multiple of? */
214*4afb647cSTimo Kreuzer;      xexp  = ax >> EXPSHIFTBITS_DP64;
215*4afb647cSTimo Kreuzer    mov     r9, r10
216*4afb647cSTimo Kreuzer    shr     r9, 52                                    ; >>EXPSHIFTBITS_DP64
217*4afb647cSTimo Kreuzer
218*4afb647cSTimo Kreuzer;        npi2  = (int)(x * twobypi + 0.5);
219*4afb647cSTimo Kreuzer    addsd   xmm2, xmm5                        ; npi2
220*4afb647cSTimo Kreuzer
221*4afb647cSTimo Kreuzer    movsd   xmm3, QWORD PTR L_piby2_1
222*4afb647cSTimo Kreuzer    cvttpd2dq  xmm0, xmm2                             ; convert to integer
223*4afb647cSTimo Kreuzer    movsd   xmm1, QWORD PTR L_piby2_1tail
224*4afb647cSTimo Kreuzer    cvtdq2pd   xmm2, xmm0                             ; and back to double.
225*4afb647cSTimo Kreuzer
226*4afb647cSTimo Kreuzer;      /* Subtract the multiple from x to get an extra-precision remainder */
227*4afb647cSTimo Kreuzer;      rhead  = x - npi2 * piby2_1;
228*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2
229*4afb647cSTimo Kreuzer    subsd   xmm4, xmm3                                ; rhead
230*4afb647cSTimo Kreuzer
231*4afb647cSTimo Kreuzer;      rtail  = npi2 * piby2_1tail;
232*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm2
233*4afb647cSTimo Kreuzer    movd    eax, xmm0
234*4afb647cSTimo Kreuzer
235*4afb647cSTimo Kreuzer;      GET_BITS_DP64(rhead-rtail, uy);
236*4afb647cSTimo Kreuzer; originally only rhead
237*4afb647cSTimo Kreuzer    movsd   xmm0, xmm4
238*4afb647cSTimo Kreuzer    subsd   xmm0, xmm1
239*4afb647cSTimo Kreuzer
240*4afb647cSTimo Kreuzer    movsd   xmm3, QWORD PTR L_piby2_2
241*4afb647cSTimo Kreuzer    movd    rcx, xmm0
242*4afb647cSTimo Kreuzer    movsd   xmm5, QWORD PTR L_piby2_2tail
243*4afb647cSTimo Kreuzer
244*4afb647cSTimo Kreuzer;    xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
245*4afb647cSTimo Kreuzer;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
246*4afb647cSTimo Kreuzer    shl     rcx, 1                                    ; strip any sign bit
247*4afb647cSTimo Kreuzer    shr     rcx, 53                                   ; >> EXPSHIFTBITS_DP64 +1
248*4afb647cSTimo Kreuzer    sub     r9, rcx                                   ; expdiff
249*4afb647cSTimo Kreuzer
250*4afb647cSTimo Kreuzer;;      if (expdiff > 15)
251*4afb647cSTimo Kreuzer    cmp     r9, 15
252*4afb647cSTimo Kreuzer    jle     Lsinf_sse2_expdiff_le_15
253*4afb647cSTimo Kreuzer
254*4afb647cSTimo Kreuzer;       The remainder is pretty small compared with x, which
255*4afb647cSTimo Kreuzer;       implies that x is a near multiple of pi/2
256*4afb647cSTimo Kreuzer;       (x matches the multiple to at least 15 bits)
257*4afb647cSTimo Kreuzer;          t  = rhead;
258*4afb647cSTimo Kreuzer    movsd   xmm1, xmm4
259*4afb647cSTimo Kreuzer
260*4afb647cSTimo Kreuzer;          rtail  = npi2 * piby2_2;
261*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2
262*4afb647cSTimo Kreuzer
263*4afb647cSTimo Kreuzer;          rhead  = t - rtail;
264*4afb647cSTimo Kreuzer    mulsd   xmm5, xmm2                                ; npi2 * piby2_2tail
265*4afb647cSTimo Kreuzer    subsd   xmm4, xmm3                                ; rhead
266*4afb647cSTimo Kreuzer
267*4afb647cSTimo Kreuzer;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
268*4afb647cSTimo Kreuzer    subsd   xmm1, xmm4                                ; t - rhead
269*4afb647cSTimo Kreuzer    subsd   xmm1, xmm3                                ; -rtail
270*4afb647cSTimo Kreuzer    subsd   xmm5, xmm1                                ; rtail
271*4afb647cSTimo Kreuzer
272*4afb647cSTimo Kreuzer;      r = rhead - rtail;
273*4afb647cSTimo Kreuzer    movsd   xmm0, xmm4
274*4afb647cSTimo Kreuzer
275*4afb647cSTimo Kreuzer;HARSHA
276*4afb647cSTimo Kreuzer;xmm1=rtail
277*4afb647cSTimo Kreuzer    movsd   xmm1, xmm5
278*4afb647cSTimo Kreuzer    subsd   xmm0, xmm5
279*4afb647cSTimo Kreuzer
280*4afb647cSTimo Kreuzer;    xmm0=r, xmm4=rhead, xmm1=rtail
281*4afb647cSTimo Kreuzer
282*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283*4afb647cSTimo KreuzerLsinf_sse2_expdiff_le_15:
284*4afb647cSTimo Kreuzer    cmp     rcx, 03f2h                                ; is r < 2^-13 ?
285*4afb647cSTimo Kreuzer    jge     Lsinf_sse2_calc_sincosf_piby4             ; use taylor series if not
286*4afb647cSTimo Kreuzer    cmp     rcx, 03deh                                ; if r really small.
287*4afb647cSTimo Kreuzer    jle     Lsinf_sse2_r_very_small                   ; then sinf(r) ~ r or 1
288*4afb647cSTimo Kreuzer
289*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0
290*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm0                                ; xmm2 <-- r^2
291*4afb647cSTimo Kreuzer
292*4afb647cSTimo Kreuzer;;      if region is 0 or 2 do a sinf calc.
293*4afb647cSTimo Kreuzer    and     r8d, eax
294*4afb647cSTimo Kreuzer    jnz     Lsinf_sse2_small_calc_sin
295*4afb647cSTimo Kreuzer
296*4afb647cSTimo Kreuzer; region 0 or 2 do a sinf calculation
297*4afb647cSTimo Kreuzer; use simply polynomial
298*4afb647cSTimo Kreuzer;              x - x*x*x*0.166666666666666666;
299*4afb647cSTimo Kreuzer    movsd   xmm3, QWORD PTR L_one_sixth
300*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm0                                ; * x
301*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2                                ; * x^2
302*4afb647cSTimo Kreuzer    subsd   xmm0, xmm3                                ; xs
303*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_adjust_region
304*4afb647cSTimo Kreuzer
305*4afb647cSTimo KreuzerALIGN 16
306*4afb647cSTimo KreuzerLsinf_sse2_small_calc_sin:
307*4afb647cSTimo Kreuzer; region 1 or 3 do a cosf calculation
308*4afb647cSTimo Kreuzer; use simply polynomial
309*4afb647cSTimo Kreuzer;              1.0 - x*x*0.5;
310*4afb647cSTimo Kreuzer    movsd   xmm0, QWORD PTR L_one  ; 1.0
311*4afb647cSTimo Kreuzer    mulsd   xmm2, QWORD PTR L_one_half              ; 0.5 *x^2
312*4afb647cSTimo Kreuzer    subsd   xmm0, xmm2                                ; xc
313*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_adjust_region
314*4afb647cSTimo Kreuzer
315*4afb647cSTimo KreuzerALIGN 16
316*4afb647cSTimo KreuzerLsinf_sse2_r_very_small:
317*4afb647cSTimo Kreuzer;;      if region is 0 or 2    do a sinf calc. (sinf ~ x)
318*4afb647cSTimo Kreuzer    and     r8d, eax
319*4afb647cSTimo Kreuzer    jz      Lsinf_sse2_adjust_region
320*4afb647cSTimo Kreuzer
321*4afb647cSTimo Kreuzer    movsd   xmm0, QWORD PTR L_one  ; cosf(r) is a 1
322*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_adjust_region
323*4afb647cSTimo Kreuzer
324*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
325*4afb647cSTimo KreuzerALIGN 16
326*4afb647cSTimo KreuzerLsinf_sse2_reduce_large:
327*4afb647cSTimo Kreuzer;      Reduce x into range [-pi/4, pi/4]
328*4afb647cSTimo Kreuzer;      __remainder_piby2d2f_forC(x, &r, &region);
329*4afb647cSTimo Kreuzer
330*4afb647cSTimo Kreuzer    mov          QWORD PTR p_temp[rsp], r11
331*4afb647cSTimo Kreuzer    lea          rdx, QWORD PTR r[rsp]
332*4afb647cSTimo Kreuzer    lea          r8,  QWORD PTR region[rsp]
333*4afb647cSTimo Kreuzer    movd         rcx, xmm0
334*4afb647cSTimo Kreuzer    call         __remainder_piby2d2f_forC
335*4afb647cSTimo Kreuzer    mov          r11, QWORD PTR p_temp[rsp]
336*4afb647cSTimo Kreuzer    mov          r8d, 1                   ; for determining region later on
337*4afb647cSTimo Kreuzer    movsd        xmm1, QWORD PTR r[rsp]   ; x
338*4afb647cSTimo Kreuzer    mov          eax,  DWORD PTR region[rsp] ; region
339*4afb647cSTimo Kreuzer
340*4afb647cSTimo Kreuzer; xmm0 = x, xmm4 = xx, r8d = 1, eax= region
341*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342*4afb647cSTimo Kreuzer
343*4afb647cSTimo Kreuzer; perform taylor series to calc sinfx, cosfx
344*4afb647cSTimo KreuzerLsinf_sse2_calc_sincosf_piby4:
345*4afb647cSTimo Kreuzer;  x2 = r * r;
346*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0
347*4afb647cSTimo Kreuzer    mulsd   xmm2, xmm0                                ; x2
348*4afb647cSTimo Kreuzer
349*4afb647cSTimo Kreuzer;;      if region is 1 or 3, do a cosf calc.
350*4afb647cSTimo Kreuzer    and     r8d, eax
351*4afb647cSTimo Kreuzer    jnz     Lsinf_sse2_do_cosf_calc
352*4afb647cSTimo Kreuzer
353*4afb647cSTimo Kreuzer; region is 0 or 2: do a sinf calc.
354*4afb647cSTimo Kreuzer;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
355*4afb647cSTimo KreuzerLsinf_sse2_do_sinf_calc:
356*4afb647cSTimo Kreuzer    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
357*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm2                                ; s4x2
358*4afb647cSTimo Kreuzer    movsd   xmm4, xmm2                                ; move for x4
359*4afb647cSTimo Kreuzer    mulsd   xmm4, xmm2                                ; x4
360*4afb647cSTimo Kreuzer    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
361*4afb647cSTimo Kreuzer    mulsd   xmm5, xmm2                                ; s2x2
362*4afb647cSTimo Kreuzer    movsd   xmm3, xmm0                                ; move for x3
363*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2                                ; x3
364*4afb647cSTimo Kreuzer    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
365*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm4                                ; s3x4+s4x6
366*4afb647cSTimo Kreuzer    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
367*4afb647cSTimo Kreuzer    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
368*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
369*4afb647cSTimo Kreuzer    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
370*4afb647cSTimo Kreuzer    jmp     Lsinf_sse2_adjust_region
371*4afb647cSTimo Kreuzer
372*4afb647cSTimo KreuzerALIGN 16
373*4afb647cSTimo KreuzerLsinf_sse2_do_cosf_calc:
374*4afb647cSTimo Kreuzer
375*4afb647cSTimo Kreuzer; region 1 or 3     - do a cosf calculation
376*4afb647cSTimo Kreuzer;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
377*4afb647cSTimo Kreuzer;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
378*4afb647cSTimo Kreuzer    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
379*4afb647cSTimo Kreuzer    movsd   xmm4, xmm2                                ; move for x4
380*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm2                                ; c4x2
381*4afb647cSTimo Kreuzer    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
382*4afb647cSTimo Kreuzer    mulsd   xmm4, xmm2                                ; x4
383*4afb647cSTimo Kreuzer    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
384*4afb647cSTimo Kreuzer    mulsd   xmm3, xmm2                                ; c2x2
385*4afb647cSTimo Kreuzer    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
386*4afb647cSTimo Kreuzer    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
387*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
388*4afb647cSTimo Kreuzer    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
389*4afb647cSTimo Kreuzer    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
390*4afb647cSTimo Kreuzer    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
391*4afb647cSTimo Kreuzer    addsd   xmm0, QWORD PTR L_one                     ; 1 - 0.5x2
392*4afb647cSTimo Kreuzer    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
393*4afb647cSTimo Kreuzer
394*4afb647cSTimo Kreuzer;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
395*4afb647cSTimo Kreuzer
396*4afb647cSTimo Kreuzer
397*4afb647cSTimo KreuzerLsinf_sse2_adjust_region:
398*4afb647cSTimo Kreuzer; positive or negative
399*4afb647cSTimo Kreuzer;      switch (region)
400*4afb647cSTimo Kreuzer    shr     eax, 1
401*4afb647cSTimo Kreuzer    mov     ecx, eax
402*4afb647cSTimo Kreuzer    and     eax, r11d
403*4afb647cSTimo Kreuzer
404*4afb647cSTimo Kreuzer    not     ecx
405*4afb647cSTimo Kreuzer    not     r11d
406*4afb647cSTimo Kreuzer    and     ecx, r11d
407*4afb647cSTimo Kreuzer
408*4afb647cSTimo Kreuzer    or      eax, ecx
409*4afb647cSTimo Kreuzer    and     eax, 1
410*4afb647cSTimo Kreuzer    jnz     Lsinf_sse2_cleanup
411*4afb647cSTimo Kreuzer
412*4afb647cSTimo Kreuzer;; if the original region 0, 1 and arg is negative, then we negate the result.
413*4afb647cSTimo Kreuzer;; if the original region 2, 3 and arg is positive, then we negate the result.
414*4afb647cSTimo Kreuzer    movsd   xmm2, xmm0
415*4afb647cSTimo Kreuzer    xorpd   xmm0, xmm0
416*4afb647cSTimo Kreuzer    subsd   xmm0, xmm2
417*4afb647cSTimo Kreuzer
418*4afb647cSTimo Kreuzer
419*4afb647cSTimo KreuzerLsinf_sse2_cleanup:
420*4afb647cSTimo Kreuzer    cvtsd2ss xmm0, xmm0
421*4afb647cSTimo Kreuzer    StackDeallocate stack_size
422*4afb647cSTimo Kreuzer    ret
423*4afb647cSTimo Kreuzer
424*4afb647cSTimo KreuzerALIGN 16
425*4afb647cSTimo KreuzerLsinf_sse2_naninf:
426*4afb647cSTimo Kreuzer    call    fname_special
427*4afb647cSTimo Kreuzer    StackDeallocate stack_size
428*4afb647cSTimo Kreuzer    ret
429*4afb647cSTimo Kreuzer
430*4afb647cSTimo KreuzerALIGN 16
431*4afb647cSTimo KreuzerLsinf_fma3:
432*4afb647cSTimo Kreuzer    vmovd        eax,xmm0
433*4afb647cSTimo Kreuzer    mov          r8d,L_inf_mask_32
434*4afb647cSTimo Kreuzer    and          eax,r8d
435*4afb647cSTimo Kreuzer    cmp          eax, r8d
436*4afb647cSTimo Kreuzer    jz           Lsinf_fma3_naninf
437*4afb647cSTimo Kreuzer
438*4afb647cSTimo Kreuzer    vcvtss2sd    xmm5,xmm0,xmm0
439*4afb647cSTimo Kreuzer    vmovq        r9,xmm5
440*4afb647cSTimo Kreuzer    btr          r9,63                    ; r9 <-- |x|
441*4afb647cSTimo Kreuzer    cmp          r9,L_piby4
442*4afb647cSTimo Kreuzer    jg           Lsinf_fma3_range_reduce
443*4afb647cSTimo Kreuzer
444*4afb647cSTimo Kreuzer    cmp          r9,L_mask_3f8
445*4afb647cSTimo Kreuzer    jge          Lsinf_fma3_compute_sinf_piby_4
446*4afb647cSTimo Kreuzer
447*4afb647cSTimo Kreuzer    cmp          r9,L_mask_3f2
448*4afb647cSTimo Kreuzer    jge          Lsinf_fma3_compute_x_xxx_0_1666
449*4afb647cSTimo Kreuzer
450*4afb647cSTimo Kreuzer    ; Here |x| < 2^-13; just return sin x ~ x
451*4afb647cSTimo Kreuzer    StackDeallocate stack_size
452*4afb647cSTimo Kreuzer    ret
453*4afb647cSTimo Kreuzer
454*4afb647cSTimo KreuzerALIGN 16
455*4afb647cSTimo KreuzerLsinf_fma3_compute_x_xxx_0_1666:
456*4afb647cSTimo Kreuzer    ; Here |x| < 2^-7; return sin x ~ x + 1/6 x^3
457*4afb647cSTimo Kreuzer    vmulsd       xmm1,xmm5,xmm5
458*4afb647cSTimo Kreuzer    vmulsd       xmm0,xmm1,xmm5           ; xmm1 <-- x^3
459*4afb647cSTimo Kreuzer    vfnmadd132sd xmm0,xmm5,L_one_sixth    ; x - x*x*x*0.166666666666666666
460*4afb647cSTimo Kreuzer    jmp          Lsinf_fma3_return_sinf_s
461*4afb647cSTimo Kreuzer
462*4afb647cSTimo KreuzerALIGN 16
463*4afb647cSTimo KreuzerLsinf_fma3_compute_sinf_piby_4:
464*4afb647cSTimo Kreuzer    vmovapd      xmm0,xmm5
465*4afb647cSTimo Kreuzer    vmovsd       xmm1,__Lsinfarray+010h
466*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm0,xmm0           ; xmm3 <-- x^2
467*4afb647cSTimo Kreuzer    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
468*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lsinfarray+08h
469*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lsinfarray
470*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
471*4afb647cSTimo Kreuzer    vfmadd231sd  xmm0,xmm1,xmm3
472*4afb647cSTimo Kreuzer    jmp          Lsinf_fma3_return_sinf_s
473*4afb647cSTimo Kreuzer
474*4afb647cSTimo KreuzerALIGN 16
475*4afb647cSTimo KreuzerLsinf_fma3_range_reduce:
476*4afb647cSTimo Kreuzer    vmovq        xmm0,r9                  ; xmm0 <-- |x|
477*4afb647cSTimo Kreuzer    cmp          r9,L_large_x_fma3
478*4afb647cSTimo Kreuzer    jge          Lsinf_fma3_reduce_large
479*4afb647cSTimo Kreuzer
480*4afb647cSTimo KreuzerLsinf_fma3_sinf_reduce_moderate:
481*4afb647cSTimo Kreuzer    vandpd       xmm1,xmm0,L_sign_mask    ; xmm1 <-- |x|  mov should suffice WAT
482*4afb647cSTimo Kreuzer    vmovapd      xmm2,L_twobypi
483*4afb647cSTimo Kreuzer    vfmadd213sd  xmm2,xmm1,L_one_half
484*4afb647cSTimo Kreuzer    vcvttpd2dq   xmm2,xmm2
485*4afb647cSTimo Kreuzer    vpmovsxdq    xmm1,xmm2
486*4afb647cSTimo Kreuzer    vandpd       xmm4,xmm1,L_int_three    ; xmm4 <-- region
487*4afb647cSTimo Kreuzer    vshufps      xmm1 ,xmm1,xmm1,8
488*4afb647cSTimo Kreuzer    vcvtdq2pd    xmm1,xmm1
489*4afb647cSTimo Kreuzer    vmovdqa      xmm2,xmm0
490*4afb647cSTimo Kreuzer    vfnmadd231sd xmm2,xmm1,L_piby2_1      ; xmm2 <-- rhead
491*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm1,L_piby2_1tail  ; xmm3 <-- rtail
492*4afb647cSTimo Kreuzer    vsubsd       xmm0,xmm2,xmm3           ; xmm0 <-- r_1
493*4afb647cSTimo Kreuzer    vsubsd       xmm2,xmm2,xmm0
494*4afb647cSTimo Kreuzer    vsubsd       xmm1,xmm2,xmm3           ; xmm4 <-- rr_1
495*4afb647cSTimo Kreuzer    jmp          Lsinf_fma3_exit_s
496*4afb647cSTimo Kreuzer
497*4afb647cSTimo KreuzerALIGN 16
498*4afb647cSTimo KreuzerLsinf_fma3_reduce_large:
499*4afb647cSTimo Kreuzer    lea          r9,__L_2_by_pi_bits
500*4afb647cSTimo Kreuzer    ;xexp = (x >> 52) 1023
501*4afb647cSTimo Kreuzer    vmovq        r11,xmm0
502*4afb647cSTimo Kreuzer    mov          rcx,r11
503*4afb647cSTimo Kreuzer    shr          r11,52
504*4afb647cSTimo Kreuzer    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
505*4afb647cSTimo Kreuzer    ;calculate the last byte from which to start multiplication
506*4afb647cSTimo Kreuzer    ;last = 134 (xexp >> 3)
507*4afb647cSTimo Kreuzer    mov          r10,r11
508*4afb647cSTimo Kreuzer    shr          r10,3
509*4afb647cSTimo Kreuzer    sub          r10,134 ;r10 = last
510*4afb647cSTimo Kreuzer    neg          r10 ;r10 = last
511*4afb647cSTimo Kreuzer    ;load 64 bits of 2_by_pi
512*4afb647cSTimo Kreuzer    mov          rax,[r9+r10]
513*4afb647cSTimo Kreuzer    ;mantissa of x = ((x << 12) >> 12) | implied bit
514*4afb647cSTimo Kreuzer    shl          rcx,12
515*4afb647cSTimo Kreuzer    shr          rcx,12 ;rcx = mantissa part of input x
516*4afb647cSTimo Kreuzer    bts          rcx,52 ;add the implied bit as well
517*4afb647cSTimo Kreuzer    ;load next 128 bits of 2_by_pi
518*4afb647cSTimo Kreuzer    add          r10,8 ;increment to next 8 bytes of 2_by_pi
519*4afb647cSTimo Kreuzer    vmovdqu      xmm0,XMMWORD PTR[r9+r10]
520*4afb647cSTimo Kreuzer    ;do three 64bit multiplications with mant of x
521*4afb647cSTimo Kreuzer    mul          rcx
522*4afb647cSTimo Kreuzer    mov          r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
523*4afb647cSTimo Kreuzer    mov          r10,rdx                  ; r10 <-- carry
524*4afb647cSTimo Kreuzer    vmovq        rax,xmm0
525*4afb647cSTimo Kreuzer    mul          rcx
526*4afb647cSTimo Kreuzer    ;resexp = xexp & 7
527*4afb647cSTimo Kreuzer    and          r11,7                    ; r11 <-- resexp = last 3 bits
528*4afb647cSTimo Kreuzer    psrldq       xmm0,8
529*4afb647cSTimo Kreuzer    add          rax,r10                  ; add the previous carry
530*4afb647cSTimo Kreuzer    adc          rdx,0
531*4afb647cSTimo Kreuzer    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
532*4afb647cSTimo Kreuzer    mov          r10,rdx                  ; r10 <-- carry
533*4afb647cSTimo Kreuzer    vmovq        rax,xmm0
534*4afb647cSTimo Kreuzer    mul          rcx
535*4afb647cSTimo Kreuzer    add          r10,rax                  ; r10 = most sig 64 bits = res1[0]
536*4afb647cSTimo Kreuzer    ;find the region
537*4afb647cSTimo Kreuzer    ;last three bits ltb = most sig bits >> (54 resexp))
538*4afb647cSTimo Kreuzer    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
539*4afb647cSTimo Kreuzer    ; and 8 msb's in next 64 bits
540*4afb647cSTimo Kreuzer    ;point_five = ltb & 01h;
541*4afb647cSTimo Kreuzer    ;region = ((ltb >> 1) + point_five) & 3;
542*4afb647cSTimo Kreuzer    mov          rcx,54
543*4afb647cSTimo Kreuzer    mov          rax,r10
544*4afb647cSTimo Kreuzer    sub          rcx,r11
545*4afb647cSTimo Kreuzer    xor          rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi)
546*4afb647cSTimo Kreuzer    shr          rax,cl
547*4afb647cSTimo Kreuzer    jnc          Lsinf_fma3_no_point_five_f
548*4afb647cSTimo Kreuzer    ;;if there is carry.. then negate the result of multiplication
549*4afb647cSTimo Kreuzer    not          r10
550*4afb647cSTimo Kreuzer    not          r9
551*4afb647cSTimo Kreuzer    not          r8
552*4afb647cSTimo Kreuzer    mov          rdx,08000000000000000h
553*4afb647cSTimo Kreuzer
554*4afb647cSTimo KreuzerLsinf_fma3_no_point_five_f:
555*4afb647cSTimo Kreuzer    adc          rax,0
556*4afb647cSTimo Kreuzer    and          rax,3
557*4afb647cSTimo Kreuzer    vmovd        xmm4,eax ;store region to xmm4
558*4afb647cSTimo Kreuzer    ;calculate the number of integer bits and zero them out
559*4afb647cSTimo Kreuzer    mov          rcx,r11
560*4afb647cSTimo Kreuzer    add          rcx,10                   ; rcx <-- no. of integer bits
561*4afb647cSTimo Kreuzer    shl          r10,cl
562*4afb647cSTimo Kreuzer    shr          r10,cl                   ; r10 contains only mant bits
563*4afb647cSTimo Kreuzer    sub          rcx,64                   ; form the exponent
564*4afb647cSTimo Kreuzer    mov          r11,rcx
565*4afb647cSTimo Kreuzer    ;find the highest set bit
566*4afb647cSTimo Kreuzer    bsr          rcx,r10
567*4afb647cSTimo Kreuzer    jnz          Lsinf_fma3_form_mantissa_f
568*4afb647cSTimo Kreuzer    mov          r10,r9
569*4afb647cSTimo Kreuzer    mov          r9,r8
570*4afb647cSTimo Kreuzer    mov          r8,0
571*4afb647cSTimo Kreuzer    bsr          rcx,r10                  ; rcx <-- hsb
572*4afb647cSTimo Kreuzer    sub          r11,64
573*4afb647cSTimo Kreuzer
574*4afb647cSTimo KreuzerLsinf_fma3_form_mantissa_f:
575*4afb647cSTimo Kreuzer    add          r11,rcx ;for exp of x
576*4afb647cSTimo Kreuzer    sub          rcx,52 ;rcx = no. of bits to shift in r10
577*4afb647cSTimo Kreuzer    cmp          rcx,0
578*4afb647cSTimo Kreuzer    jl           Lsinf_fma3_hsb_below_52_f
579*4afb647cSTimo Kreuzer    je           Lsinf_fma3_form_numbers_f
580*4afb647cSTimo Kreuzer    ;hsb above 52
581*4afb647cSTimo Kreuzer    mov         r8,r10                    ; previous contents of r8 not required
582*4afb647cSTimo Kreuzer    shr         r10,cl                    ; r10 = mantissa of x with hsb at 52
583*4afb647cSTimo Kreuzer    shr         r9,cl                     ; make space for bits from r10
584*4afb647cSTimo Kreuzer    sub         rcx,64
585*4afb647cSTimo Kreuzer    neg         rcx
586*4afb647cSTimo Kreuzer    shl         r8,cl
587*4afb647cSTimo Kreuzer    or          r9,r8                     ; r9 = mantissa bits of xx
588*4afb647cSTimo Kreuzer    jmp         Lsinf_fma3_form_numbers_f
589*4afb647cSTimo Kreuzer
590*4afb647cSTimo KreuzerALIGN 16
591*4afb647cSTimo KreuzerLsinf_fma3_hsb_below_52_f:
592*4afb647cSTimo Kreuzer    neg          rcx
593*4afb647cSTimo Kreuzer    mov          rax,r9
594*4afb647cSTimo Kreuzer    shl          r10,cl
595*4afb647cSTimo Kreuzer    shl          r9,cl
596*4afb647cSTimo Kreuzer    sub          rcx,64
597*4afb647cSTimo Kreuzer    neg          rcx
598*4afb647cSTimo Kreuzer    shr          rax,cl
599*4afb647cSTimo Kreuzer    or           r10,rax
600*4afb647cSTimo Kreuzer    shr          r8,cl
601*4afb647cSTimo Kreuzer    or           r9,r8
602*4afb647cSTimo Kreuzer
603*4afb647cSTimo KreuzerALIGN 16
604*4afb647cSTimo KreuzerLsinf_fma3_form_numbers_f:
605*4afb647cSTimo Kreuzer    add          r11,1023
606*4afb647cSTimo Kreuzer    btr          r10,52                   ; remove the implied bit
607*4afb647cSTimo Kreuzer    mov          rcx,r11
608*4afb647cSTimo Kreuzer    or           r10,rdx                  ; put the sign
609*4afb647cSTimo Kreuzer    shl          rcx,52
610*4afb647cSTimo Kreuzer    or           r10,rcx                  ; r10 <-- x
611*4afb647cSTimo Kreuzer    vmovq        xmm0,r10                 ; xmm0 <-- x
612*4afb647cSTimo Kreuzer    vmulsd       xmm0,xmm0,L_piby2_lead
613*4afb647cSTimo KreuzerLsinf_fma3_exit_s:
614*4afb647cSTimo Kreuzer    vmovq        rax,xmm4
615*4afb647cSTimo Kreuzer    and          rax,01h
616*4afb647cSTimo Kreuzer    cmp          rax,01h
617*4afb647cSTimo Kreuzer    jz           Lsinf_fma3_cos_piby4_compute
618*4afb647cSTimo Kreuzer
619*4afb647cSTimo KreuzerLsinf_fma3_sin_piby4_compute:
620*4afb647cSTimo Kreuzer;;    vmovapd      xmm1,__Lsinfarray+010h
621*4afb647cSTimo Kreuzer    vmovsd       xmm1,__Lsinfarray+010h
622*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm0,xmm0
623*4afb647cSTimo Kreuzer    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
624*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
625*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lsinfarray
626*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
627*4afb647cSTimo Kreuzer    vfmadd231sd  xmm0,xmm1,xmm3
628*4afb647cSTimo Kreuzer    jmp          Lsinf_fma3_exit_s_1
629*4afb647cSTimo Kreuzer
630*4afb647cSTimo KreuzerALIGN 16
631*4afb647cSTimo KreuzerLsinf_fma3_cos_piby4_compute:
632*4afb647cSTimo Kreuzer    vmovapd      xmm2,L_one
633*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm0,xmm0
634*4afb647cSTimo Kreuzer    vfmadd231sd  xmm2,xmm3,__Lcosfarray   ; xmm2 <-- 1 + c0 x^2
635*4afb647cSTimo Kreuzer    ; would simple Horner's be slower?
636*4afb647cSTimo Kreuzer    vmovsd       xmm1,__Lcosfarray+018h       ; xmm1 <-- c3
637*4afb647cSTimo Kreuzer    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h  ; xmm1 <--  c4 x^2+ c3
638*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h  ; xmm1 <--  (c4 x^2+ c3)x^2 + c2
639*4afb647cSTimo Kreuzer    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h  ; xmm1 <--  ((c4 x^2+ c3)x^2 + c2)x^2 + c1
640*4afb647cSTimo Kreuzer    vmulsd       xmm3,xmm3,xmm3               ; xmm3 <-- x^4
641*4afb647cSTimo Kreuzer    vmovdqa      xmm0,xmm2
642*4afb647cSTimo Kreuzer    vfmadd231sd  xmm0,xmm1,xmm3
643*4afb647cSTimo KreuzerLsinf_fma3_exit_s_1:
644*4afb647cSTimo Kreuzer    ; assuming FMA3 ==> AVX ==> SSE4.1
645*4afb647cSTimo Kreuzer    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
646*4afb647cSTimo Kreuzer    vpcmpeqq     xmm3,xmm4,XMMWORD PTR L_int_three
647*4afb647cSTimo Kreuzer    vorpd        xmm3,xmm2,xmm3
648*4afb647cSTimo Kreuzer    vandnpd      xmm3,xmm3,L_signbit
649*4afb647cSTimo Kreuzer    vxorpd       xmm0,xmm0,xmm3
650*4afb647cSTimo Kreuzer
651*4afb647cSTimo Kreuzer    vandnpd      xmm1,xmm5,L_signbit
652*4afb647cSTimo Kreuzer    vxorpd       xmm0,xmm1,xmm0
653*4afb647cSTimo KreuzerLsinf_fma3_return_sinf_s:
654*4afb647cSTimo Kreuzer    vcvtsd2ss    xmm0,xmm0,xmm0
655*4afb647cSTimo Kreuzer    StackDeallocate stack_size
656*4afb647cSTimo Kreuzer    ret
657*4afb647cSTimo Kreuzer
658*4afb647cSTimo KreuzerLsinf_fma3_naninf:
659*4afb647cSTimo Kreuzer    call         fname_special
660*4afb647cSTimo Kreuzer    StackDeallocate stack_size
661*4afb647cSTimo Kreuzer    ret
662*4afb647cSTimo Kreuzer
663*4afb647cSTimo Kreuzerfname endp
664*4afb647cSTimo KreuzerEND
665