xref: /reactos/sdk/lib/crt/math/libm_sse2/sin.asm (revision e4d572a4)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25;
26; An implementation of the sin function.
27;
28; Prototype:
29;
30;     double sin(double x);
31;
32;   Computes sin(x).
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Based on the NAG C implementation.
36;
37; If FMA3 hardware is available, an FMA3 implementation of sin will be used.
38
39
40.const
41ALIGN 16
42L_real_piby2_1          DQ 03ff921fb54400000h         ; piby2_1
43                        DQ 0
44L_real_piby2_1tail      DQ 03dd0b4611a626331h         ; piby2_1tail
45                        DQ 0
46L_real_piby2_2          DQ 03dd0b4611a600000h         ; piby2_2
47                        DQ 0
48L_real_piby2_2tail      DQ 03ba3198a2e037073h         ; piby2_2tail
49                        DQ 0
50ALIGN 16
51
52L_one           DQ 03FF0000000000000h, 03FF0000000000000h
53L_signbit       DQ 08000000000000000h, 00000000000000000h
54L_int_one       DQ 00000000000000001h, 00000000000000000h
55L_int_two       DQ 00000000000000002h, 00000000000000000h
56L_int_three     DQ 00000000000000003h, 00000000000000000h
57
58L_2_by_pi       DQ 03fe45f306dc9c883h     ; 2/pi
59L_one_half      DQ 03FE0000000000000h     ; .5
60L_one_sixth     DQ 03FC5555555555555h     ; .1666...
61L_two_to_neg_27 DQ 03e40000000000000h     ; 2^-27
62L_two_to_neg_13 DQ 03f20000000000000h     ; 2^-13
63L_piby4         DQ 03FE921FB54442D18h     ; pi/4
64L_small_arg_cw  DQ 0411E848000000000h     ; 5.e5, appropriate for CW
65L_small_arg_bdl DQ 0417312D000000000h     ; 2e7, works for BDL
66
67L__inf_mask_64  DQ 07FF0000000000000h     ; +Inf
68
69EXTRN __Lcosarray:QWORD
70EXTRN __Lsinarray:QWORD
71EXTRN __use_fma3_lib:DWORD
72
73; define local variable storage offsets
74p_temp          EQU 030h
75p_temp1         EQU 040h
76save_r10        EQU 050h
77dummy_space     EQU 060h
78stack_size      EQU 078h
79
80include fm.inc
81
82fname           TEXTEQU <sin>
83fname_special   TEXTEQU <_sin_special>
84
85;Define name and any external functions being called
86EXTERN          __remainder_piby2_forAsm   : PROC
87EXTERN          __remainder_piby2_fma3     : PROC
88EXTERN          __remainder_piby2_fma3_bdl : PROC
89EXTERN          fname_special      : PROC
90
91.code
92
93PUBLIC fname
94fname PROC FRAME
95    StackAllocate stack_size
96    .ENDPROLOG
97    cmp          DWORD PTR __use_fma3_lib, 0
98    jne          Lsin_fma3
99
100Lsin_sse2:
101    movd    rdx, xmm0
102    xorpd   xmm2, xmm2                                ; zeroed out for later use
103
104    mov     r10,rdx
105    mov     r8d, 1                     ; for determining region later on
106    btr     r10,63                     ; r10 <-- |x|
107    cmp     r10,L_piby4
108    jb      Lsin_sse2_absx_lt_piby4
109
110Lsin_sse2_absx_nlt_piby4:             ; common case
111    mov     r11,rdx
112    shr     r11,63
113    movd    xmm0,r10                      ; xmm0 <-- |x|
114    cmp     r10, QWORD PTR L_small_arg_cw
115    jae     Lsin_reduce_precise           ; Note NaN/Inf will branch
116
117; At this point we have |x| < L_small_arg_cw, which is currently 500000.
118; Note that if |x| were too large, conversion of npi2 to integer would fail.
119; We reduce  the argument to be in a range from -pi/4 to +pi/4
120; by subtracting multiples of pi/2
121    movapd  xmm2, xmm0
122    mulsd   xmm2, L_2_by_pi
123    movapd  xmm4, xmm0
124
125;      xexp  = ax >> EXPSHIFTBITS_DP64;
126    mov     r9, r10
127    shr     r9, 52                        ; >>EXPSHIFTBITS_DP64
128
129; How many pi/2 is |x| a multiple of?
130;        npi2  = (int)(x * twobypi + 0.5);
131    addsd   xmm2, L_one_half              ; npi2
132
133    movsd   xmm3, L_real_piby2_1
134    cvttpd2dq   xmm0, xmm2                ; convert npi2 to integer
135    movsd   xmm1, L_real_piby2_1tail
136    cvtdq2pd    xmm2, xmm0                ; npi2 back to double
137
138;  Subtract the multiple from x to get an extra-precision remainder
139;      rhead  = x - npi2 * piby2_1;
140    mulsd        xmm3, xmm2
141    subsd        xmm4, xmm3               ; rhead
142
143;      rtail  = npi2 * piby2_1tail;
144    mulsd   xmm1, xmm2                    ; rtail
145    movd    eax, xmm0                     ; eax <-- npi2
146
147;      GET_BITS_DP64(rhead-rtail, uy);
148; originally only rhead
149    movapd  xmm0, xmm4
150    subsd   xmm0, xmm1
151
152    movsd   xmm3, L_real_piby2_2
153    movd    rcx, xmm0                     ; rcx <-- rhead - rtail
154    movsd   xmm5, L_real_piby2_2tail      ; piby2_2tail
155
156;    xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
157;    xmm4=rhead, xmm5= temp for calc
158;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
159;   expdiff measures how close rhead - rtail is to |x|
160;   (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
161    shl     rcx, 1                        ; strip any sign bit
162    shr     rcx, 53                       ; >> EXPSHIFTBITS_DP64 +1
163    sub     r9, rcx                       ; expdiff
164
165;;      if (expdiff > 15)
166    cmp     r9, 15
167    jle     Lsin_sse2_cw_reduction_done
168
169;    Here the remainder is pretty small compared with x, which
170;    implies that x is a near multiple of pi/2
171;    (x matches the multiple to at least 15 bits)
172;    So we do another stage of argument reduction.
173
174;          t  = rhead;
175    movapd  xmm1, xmm4
176
177;          rtail  = npi2 * piby2_2;
178    mulsd   xmm3, xmm2
179
180;          rhead  = t - rtail;
181    mulsd   xmm5, xmm2                                ; npi2 * piby2_2tail
182    subsd   xmm4, xmm3                                ; rhead
183
184;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
185    subsd   xmm1, xmm4                                ; t - rhead
186    subsd   xmm1, xmm3                                ; -rtail
187    subsd   xmm5, xmm1                                ; rtail
188
189;      r = rhead - rtail;
190    movapd  xmm0, xmm4
191
192;HARSHA
193;xmm1=rtail
194    movapd  xmm1, xmm5                    ; xmm1 <-- copy of rtail
195    subsd   xmm0, xmm5
196
197
198;    xmm0=r, xmm4=rhead, xmm1=rtail
199Lsin_sse2_cw_reduction_done:
200;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
201;; if the input was close to a pi/2 multiple
202; The original NAG code missed this trick.
203; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
204; then the sin is either  ~ 1.0 or ~r, to within 53 bits.
205
206; Note:  Unfortunately this introduces two jcc instructions close to each
207; other and to other branches.  As r < 2^-13 should be rather uncommon, it
208; almost certainly costs more than it saves. - WAT
209;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
210;      region = npi2 & 3;
211
212    subsd   xmm4, xmm0                                ; rhead-r
213    subsd   xmm4, xmm1                                ; rr = (rhead-r) - rtail
214
215Lsin_piby4:
216; perform taylor series to calc sinx, sinx for |x| <= pi/4
217;  x2 = r * r;
218
219;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
220;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
221    movapd  xmm3, xmm0
222    movapd  xmm2, xmm0
223    mulsd   xmm2, xmm0                                ;x2
224
225    bt      eax,0
226    jc      Lsin_sse2_calc_cos
227
228;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
229; region 0 or 2 do a sin calculation
230    movsd   xmm3, __Lsinarray+50h                     ; s6
231    mulsd   xmm3, xmm2                                ; x2s6
232    movsd   xmm5, __Lsinarray+20h                     ; s3
233    movsd   QWORD PTR p_temp[rsp], xmm4               ; store xx
234    movapd  xmm1, xmm2                                ; move for x4
235    mulsd   xmm1, xmm2                                ; x4
236    movsd   QWORD PTR p_temp1[rsp], xmm0              ; store x
237    mulsd   xmm5, xmm2                                ; x2s3
238    movapd  xmm4, xmm0                                ; move for x3
239    addsd   xmm3, __Lsinarray+40h                     ; s5+x2s6
240    mulsd   xmm1, xmm2                                ; x6
241    mulsd   xmm3, xmm2                                ; x2(s5+x2s6)
242    mulsd   xmm4, xmm2                                ; x3
243    addsd   xmm5, __Lsinarray+10h                     ; s2+x2s3
244    mulsd   xmm5, xmm2                                ; x2(s2+x2s3)
245    addsd   xmm3, __Lsinarray+30h                     ; s4 + x2(s5+x2s6)
246    mulsd   xmm2, L_one_half                          ; 0.5 *x2
247    movsd   xmm0, QWORD PTR p_temp[rsp]               ; load xx
248    mulsd   xmm3, xmm1                                ; x6(s4 + x2(s5+x2s6))
249    addsd   xmm5, __Lsinarray                         ; s1+x2(s2+x2s3)
250    mulsd   xmm2, xmm0                                ; 0.5 * x2 *xx
251    addsd   xmm3, xmm5                                ; zs
252    mulsd   xmm4, xmm3                                ; *x3
253    subsd   xmm4, xmm2                                ; x3*zs - 0.5 * x2 *xx
254    addsd   xmm0, xmm4                                ; +xx
255    addsd   xmm0, QWORD PTR p_temp1[rsp]              ; +x
256
257    jmp     Lsin_sse2_adjust_region
258
259ALIGN 16
260Lsin_sse2_calc_cos:
261;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
262; region 1 or 3     - do a cos calculation
263;  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
264    mulsd   xmm4, xmm0                                ; x*xx
265    movsd   xmm5, L_one_half
266    movsd   xmm1, __Lcosarray+50h                     ; c6
267    movsd   xmm0, __Lcosarray+20h                     ; c3
268    mulsd   xmm5, xmm2                                ; r = 0.5 *x2
269    movapd  xmm3, xmm2                                ; copy of x2
270    movsd   QWORD PTR p_temp[rsp], xmm4               ; store x*xx
271    mulsd   xmm1, xmm2                                ; c6*x2
272    mulsd   xmm0, xmm2                                ; c3*x2
273    subsd   xmm5, L_one                               ; -t=r-1.0, trash r
274    mulsd   xmm3, xmm2                                ; x4
275    addsd   xmm1, __Lcosarray+40h                     ; c5+x2c6
276    addsd   xmm0, __Lcosarray+10h                     ; c2+x2C3
277    addsd   xmm5, L_one                               ; 1 + (-t), trash t
278    mulsd   xmm3, xmm2                                ; x6
279    mulsd   xmm1, xmm2                                ; x2(c5+x2c6)
280    mulsd   xmm0, xmm2                                ; x2(c2+x2C3)
281    movapd  xmm4, xmm2                                ; copy of x2
282    mulsd   xmm4, L_one_half  ; r recalculate
283    addsd   xmm1, __Lcosarray+30h                     ; c4 + x2(c5+x2c6)
284    addsd   xmm0, __Lcosarray                         ; c1+x2(c2+x2C3)
285    mulsd   xmm2, xmm2                                ; x4 recalculate
286    subsd   xmm5, xmm4                                ; (1 + (-t)) - r
287    mulsd   xmm1, xmm3                                ; x6(c4 + x2(c5+x2c6))
288    addsd   xmm0, xmm1                                ; zc
289    subsd   xmm4, L_one                               ; t relaculate
290    subsd   xmm5, QWORD PTR p_temp[rsp]               ; ((1 + (-t)) - r) - x*xx
291    mulsd   xmm0, xmm2                                ; x4 * zc
292    addsd   xmm0, xmm5                                ; x4 * zc + ((1 + (-t)) - r -x*xx)
293    subsd   xmm0, xmm4                                ; result - (-t)
294
295;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
296
297Lsin_sse2_adjust_region:
298; positive or negative
299;      switch (region)
300    shr     eax, 1
301    mov     ecx, eax
302    and     eax, r11d
303
304    not     ecx
305    not     r11d
306    and     ecx, r11d
307
308    or      eax, ecx
309    and     eax, 1
310    jnz     Lsin_sse2_cleanup
311
312;; if the original region 0, 1 and arg is negative, then we negate the result.
313;; if the original region 2, 3 and arg is positive, then we negate the result.
314    movapd  xmm2, xmm0
315    xorpd   xmm0, xmm0
316    subsd   xmm0, xmm2
317
318ALIGN 16
319Lsin_sse2_cleanup:
320    StackDeallocate stack_size
321    ret
322
323ALIGN 16
324Lsin_sse2_absx_lt_piby4:
325;          sin = sin_piby4(x, 0.0);
326
327;  x2 = r * r;
328    movapd  xmm2, xmm0
329    mulsd   xmm2, xmm0                                ; x2
330
331;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
332; region 0 - do a sin calculation
333;  zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));
334    movsd   xmm3, __Lsinarray+50h                     ; s6
335    mulsd   xmm3, xmm2                                ; x2s6
336    movsd   xmm5, __Lsinarray+20h                     ; s3
337    movapd  xmm1, xmm2                                ; move for x4
338    mulsd   xmm1, xmm2                                ; x4
339    mulsd   xmm5, xmm2                                ; x2s3
340    movapd  xmm4, xmm0                                ; move for x3
341    addsd   xmm3, __Lsinarray+40h                     ; s5+x2s6
342    mulsd   xmm1, xmm2                                ; x6
343    mulsd   xmm3, xmm2                                ; x2(s5+x2s6)
344    mulsd   xmm4, xmm2                                ; x3
345    addsd   xmm5, __Lsinarray+10h                     ; s2+x2s3
346    mulsd   xmm5, xmm2                                ; x2(s2+x2s3)
347    addsd   xmm3, __Lsinarray+30h                     ; s4 + x2(s5+x2s6)
348    mulsd   xmm3, xmm1                                ; x6(s4 + x2(s5+x2s6))
349    addsd   xmm5, __Lsinarray                         ; s1+x2(s2+x2s3)
350    addsd   xmm3, xmm5                                ; zs
351    mulsd   xmm4, xmm3                                ; *x3
352    addsd   xmm0, xmm4                                ; +x
353    jmp     Lsin_sse2_cleanup
354
355;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
356ALIGN 16
357Lsin_reduce_precise:
358;   Reduce x into range [-pi/4, pih/4]
359    cmp     r10,L__inf_mask_64
360    jae     Lsin_x_naninf
361    mov     QWORD PTR p_temp[rsp], r11
362    call    __remainder_piby2_forAsm
363    mov     r11, QWORD PTR p_temp[rsp]
364
365    ; At this point xmm0 has r, xmm1 has rr, rax has region
366
367    movapd  xmm4, xmm1                ; xmm4 <-- rr
368    jmp Lsin_piby4
369
370; xmm0 = x, xmm4 = xx, eax= region
371
372
373ALIGN 16
374Lsin_x_naninf:
375    call    fname_special
376    StackDeallocate stack_size
377    ret
378
379;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
380; From this point we assume that FMA3 and AVX hardware are present.
381
382ALIGN 16
383Lsin_fma3:
384    vmovq        r9,xmm0
385    mov          r10,r9                ; save x to get sign later
386    btr          r9,63                 ; r9 <-- |x|
387    cmp          r9,L_piby4
388    jae          Lsin_fma3_absx_nlt_piby4  ; Note that NaN will branch
389    cmp          r9,L_two_to_neg_13
390    jae          Lsin_fma3_calc_sin_for_absx_lt_piby4
391    cmp          r9,L_two_to_neg_27
392    jae          Lsin_fma3_compute_x_xxx_0_1666
393    StackDeallocate stack_size
394    ret                                ; sin x ~= x for |x| < 2^-27
395
396ALIGN 16
397Lsin_fma3_compute_x_xxx_0_1666:        ; |x| in [2^-27,2^-13]
398    vmulsd       xmm1,xmm0,xmm0        ; xmm1l <-- x*x
399    vmulsd       xmm1,xmm1,xmm0        ; xmm1l <-- x*x*x
400    vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6)
401    StackDeallocate stack_size
402    ret
403
404ALIGN 16
405Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4]
406    vmovsd       xmm5,__Lsinarray+050h
407    vmulsd       xmm3,xmm0,xmm0        ; xmm3l <-- x^2
408
409    vfmadd213sd  xmm5,xmm3,__Lsinarray+040h
410    vfmadd213sd  xmm5,xmm3,__Lsinarray+030h
411    vfmadd213sd  xmm5,xmm3,__Lsinarray+020h
412    vfmadd213sd  xmm5,xmm3,__Lsinarray+010h
413
414    vmulsd       xmm4,xmm0,xmm3        ; xmm4l <-- x^3
415    vfmadd213sd  xmm5,xmm3,__Lsinarray
416    vfmadd231sd  xmm0,xmm4,xmm5        ; xmm0l <-- x + x^3 p(x^2)
417
418    StackDeallocate stack_size
419    ret
420
421ALIGN 16
422Lsin_fma3_absx_nlt_piby4:  ; !(|x| < pi/4)
423    ; here r9 has |x|
424    cmp          r9,L__inf_mask_64
425    jae          Lsin_x_naninf
426;Lrange_reduce:                         ;; unused label
427
428    vmovq        xmm0,r9               ; xmm0 <-- |x|
429    cmp          r9,L_small_arg_bdl
430    jae          Lsin_fma3_do_general_arg_reduction
431
432    ; Note that __remainder_piby2_fma3 conventions are
433    ; on input
434    ;   |x| is in xmm0
435    ; on output
436    ;   r is in xmm0
437    ;   rr is in xmm1
438    ;   region of |x| is in rax
439
440    ; Boldo-Daumas-Li reduction for reasonably small |x|
441    call         __remainder_piby2_fma3_bdl
442Lsin_fma3_exit_s:
443    bt           rax,0
444    vmulsd       xmm3,xmm0,xmm0        ; xmm3 <-- x2 = x * x
445    jc           Lsin_fma3_calc_cos
446
447Lsin_fma3_calc_sin:                    ;; unused label
448    ; region 0 or 2
449    ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
450    vmovsd       xmm5,__Lsinarray+050h
451    vfmadd213sd  xmm5,xmm3,__Lsinarray+040h
452    vfmadd213sd  xmm5,xmm3,__Lsinarray+030h
453    vfmadd213sd  xmm5,xmm3,__Lsinarray+020h
454    vfmadd213sd  xmm5,xmm3,__Lsinarray+010h  ; xmm5 <-- r
455
456    vmulsd       xmm4,xmm0,xmm3        ; xmm4 <-- x3 = x*x*x
457    vmulsd       xmm2,xmm4,xmm5        ; xmm2 <-- x*x*x * r
458    vmulsd       xmm5,xmm1,L_one_half  ; xmm5 <-- .5*x*x
459    vsubsd       xmm2,xmm5,xmm2        ; xmm2 <-- .5*x*x - x*x*x*r
460    vmulsd       xmm2,xmm3,xmm2
461    vsubsd       xmm2,xmm2,xmm1
462    vfnmadd231sd xmm2, xmm4,__Lsinarray
463    vsubsd       xmm0,xmm0,xmm2
464    jmp          Lsin_fma3_exit_s_1
465
466ALIGN 16
467Lsin_fma3_calc_cos:
468    ; region 1 or 3
469    ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
470    vmovapd      xmm2,L_one
471    vmulsd       xmm5,xmm3,L_one_half  ; xmm5 <-- x*x*.5 == r
472    vsubsd       xmm4,xmm2,xmm5        ; xmm4 <-- t = 1. - x*x*.5
473    vsubsd       xmm2,xmm2,xmm4        ; 1-t
474    vsubsd       xmm2,xmm2,xmm5        ; xmm2 <-- (1-t) - r
475    vmovsd       xmm5,__Lcosarray+050h
476    vfnmadd231sd xmm2,xmm0,xmm1        ; (1.0 - t) - r) - x * xx) xmm2
477    vmulsd       xmm1,xmm3,xmm3        ; x2 * x2 xmm1
478    vfmadd213sd  xmm5,xmm3,__Lcosarray+040h
479    vfmadd213sd  xmm5,xmm3,__Lcosarray+030h
480    vfmadd213sd  xmm5,xmm3,__Lcosarray+020h
481    vfmadd213sd  xmm5,xmm3,__Lcosarray+010h
482    vfmadd213sd  xmm5,xmm3,__Lcosarray
483    vfmadd213sd  xmm5,xmm1,xmm2
484    vaddsd       xmm0,xmm5,xmm4
485
486Lsin_fma3_exit_s_1:
487    xor          r8,r8                    ; prepare r8 for cmov
488    and          r10,L_signbit            ; isolate original sign of x
489    bt           eax,1
490    cmovc        r8,L_signbit
491    xor          r8,r10
492    vmovq        xmm3,r8
493    vxorpd       xmm0,xmm0,xmm3
494
495    StackDeallocate stack_size
496    ret
497
498ALIGN 16
499Lsin_fma3_do_general_arg_reduction:
500    ; argument reduction for general x
501
502    ; NOTE: the BDL argument reduction routine does not touch r10,
503    ; but the general-purpose reduction does.
504    mov          QWORD PTR [save_r10+rsp], r10
505    call         __remainder_piby2_fma3
506    mov          r10, QWORD PTR [save_r10+rsp]
507    jmp          Lsin_fma3_exit_s
508
509fname endp
510END
511
512