xref: /reactos/sdk/lib/crt/math/libm_sse2/cos.asm (revision 10e7643c)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25;
26; An implementation of the cos function.
27;
28; Prototype:
29;
30;     double cos(double x);
31;
32;   Computes cos(x).
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Based on the NAG C implementation.
36;
37; If FMA3 hardware is available, an FMA3 implementation of cos will be used.
38
39.const
40ALIGN 16
41L_real_piby2_1          DQ 03ff921fb54400000h         ; piby2_1
42                        DQ 0
43L_real_piby2_1tail      DQ 03dd0b4611a626331h         ; piby2_1tail
44                        DQ 0
45L_real_piby2_2          DQ 03dd0b4611a600000h         ; piby2_2
46                        DQ 0
47L_real_piby2_2tail      DQ 03ba3198a2e037073h         ; piby2_2tail
48                        DQ 0
49
50ALIGN 16
51L_one           DQ 03FF0000000000000h, 03FF0000000000000h
52L_signbit       DQ 08000000000000000h, 00000000000000000h
53L_int_one       DQ 00000000000000001h, 00000000000000000h
54L_int_two       DQ 00000000000000002h, 00000000000000000h
55
56L_2_by_pi       DQ 03fe45f306dc9c883h     ; 2/pi
57L_one_half      DQ 03FE0000000000000h     ; .5
58L_neg_one_half  DQ 0bfe0000000000000h     ; - 0.5
59L_two_to_neg_27 DQ 03e40000000000000h     ; 2^-27
60L_two_to_neg_13 DQ 03f20000000000000h     ; 2^-13
61L_piby4         DQ 03FE921FB54442D18h     ; pi/4
62L_small_arg_cw  DQ 0411E848000000000h     ; 5.e5, appropriate for CW
63L_small_arg_bdl DQ 0417312D000000000h     ; 2e7, works for BDL
64L_sign_mask     DQ 07FFFFFFFFFFFFFFFh
65
66L__inf_mask_64  DQ 07FF0000000000000h     ; +Inf
67
68
69
70EXTRN __Lcosarray:QWORD
71EXTRN __Lsinarray:QWORD
72EXTRN __use_fma3_lib:DWORD
73
74; local storage offsets
75p_temp      EQU  020h                     ; temporary for get/put bits operation
76p_temp1     EQU  030h                     ; temporary for get/put bits operation
77dummy_space EQU  040h
78stack_size  EQU  068h
79
80include fm.inc
81
82fname         TEXTEQU <cos>
83fname_special TEXTEQU <_cos_special>
84
85;Define name and any external functions being called
86EXTERN           __remainder_piby2_forAsm   : PROC
87EXTERN           __remainder_piby2_fma3     : PROC
88EXTERN           __remainder_piby2_fma3_bdl : PROC
89EXTERN           fname_special              : PROC
90
91.code
92
93PUBLIC fname
94fname PROC FRAME
95    StackAllocate stack_size
96    .ENDPROLOG
97
98    cmp          DWORD PTR __use_fma3_lib, 0
99    jne          L_cos_fma3
100
101Lcos_sse2:
102    movd         rdx, xmm0
103    xorpd        xmm2, xmm2               ; zeroed out for later use
104
105    mov          r10, rdx
106    btr          r10, 63                  ; r10 <-- |x|
107    cmp          r10, L_piby4
108    jb           Lcos_sse2_absx_lt_piby4
109
110Lcos_absx_nlt_piby4:                      ; common case
111
112;  Here rdx has x, r10 has |x|
113    movd    xmm0, r10                     ; xmm0 <-- |x|
114
115    cmp     r10, QWORD PTR L_small_arg_cw
116    jae     Lcos_reduce_precise           ; Note NaN/Inf will branch
117
118; At this point we have |x| < L_small_arg_cw, which is currently 500000.
119; Note that if |x| were too large, conversion of npi2 to integer would fail.
120; We reduce  the argument to be in a range from -pi/4 to +pi/4
121; by subtracting multiples of pi/2
122    movapd  xmm2, xmm0
123    mulsd   xmm2, L_2_by_pi
124    movapd  xmm4, xmm0
125
126;      xexp  = ax >> EXPSHIFTBITS_DP64;
127    mov     r9, r10
128    shr     r9, 52                        ; >>EXPSHIFTBITS_DP64
129
130; How many pi/2 is |x| a multiple of?
131;      npi2  = (int)(x * twobypi + 0.5);
132    addsd   xmm2, L_one_half              ; npi2
133
134    movsd   xmm3, L_real_piby2_1
135    cvttpd2dq    xmm0, xmm2               ; convert npi2 to integer
136    movsd   xmm1, L_real_piby2_1tail
137    cvtdq2pd    xmm2, xmm0                ; and back to double.
138
139;  Subtract the multiple from x to get an extra-precision remainder
140;      rhead  = x - npi2 * piby2_1;
141    mulsd   xmm3, xmm2
142    subsd   xmm4, xmm3                    ; rhead
143
144;      rtail  = npi2 * piby2_1tail;
145    mulsd   xmm1, xmm2                    ; rtail
146    movd    eax, xmm0                     ; eax <-- npi2
147
148;      GET_BITS_DP64(rhead-rtail, uy);
149; originally only rhead
150    movapd  xmm0, xmm4
151    subsd   xmm0, xmm1
152
153    movsd   xmm3, L_real_piby2_2
154    movd    rcx, xmm0                     ; rcx <-- rhead - rtail
155    movsd   xmm5, L_real_piby2_2tail      ; piby2_2tail
156
157;    xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
158;    xmm4=rhead xmm5= temp for calc
159;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
160;   expdiff measures how close rhead - rtail is to |x|
161;   (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
162    shl     rcx, 1                        ; strip any sign bit
163    shr     rcx, 53                       ; >> EXPSHIFTBITS_DP64 +1
164    sub     r9, rcx                       ; expdiff
165
166;;      if (expdiff > 15)
167    cmp     r9, 15
168    jle     Lcos_sse2_cw_reduction_done
169
170;   Here the remainder is pretty small compared with x, which
171;   implies that x is a near multiple of pi/2
172;   (x matches the multiple to at least 15 bits)
173;   So we do another stage of argument reduction.
174
175;          t  = rhead;
176    movapd  xmm1, xmm4
177
178;          rtail  = npi2 * piby2_2;
179    mulsd   xmm3, xmm2
180
181;          rhead  = t - rtail;
182    mulsd   xmm5, xmm2                    ; npi2 * piby2_2tail
183    subsd   xmm4, xmm3                    ; rhead
184
185;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
186    subsd   xmm1, xmm4                    ; t - rhead
187    subsd   xmm1, xmm3                    ; -rtail
188    subsd   xmm5, xmm1                    ; rtail
189
190;      r = rhead - rtail;
191    movapd  xmm0, xmm4
192
193;HARSHA
194;xmm1=rtail
195    movapd  xmm1, xmm5                    ; xmm1 <-- copy of rtail
196    subsd   xmm0, xmm5
197
198;    xmm0=r, xmm4=rhead, xmm1=rtail
199Lcos_sse2_cw_reduction_done:
200;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
201;; if the input was close to a pi/2 multiple
202; The original NAG code missed this trick.
203; If the input is very close to n*pi/2 after reduction, so  r < 2^-27,
204; then the cos is either ~ 1.0 or ~r, to within 53 bits.
205
206; NOTE: Unfortunately, this introduces two jcc instructions close to each
207; other and to other branches.  As r < 2^-13 should be rather uncommon,
208; the problems for branch prediction outweigh the computational savings. - WAT
209;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
210;      region = npi2 & 3;
211    subsd   xmm4, xmm0                    ; rhead-r
212    subsd   xmm4, xmm1                    ; rr = (rhead-r) - rtail
213
214Lcos_piby4:
215; perform taylor series to calc sinx or cosx
216;  x2 = r * r;
217
218;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path
219;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
220    movapd  xmm3, xmm0
221    movapd  xmm2, xmm0
222    mulsd   xmm2, xmm0                                ;x2
223
224    bt      eax,0
225    jnc     Lcos_sse2_calc_cos
226
227;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
228; region 1 or 3 do a sin calculation
229    movsd   xmm3, __Lsinarray+50h                     ; s6
230    mulsd   xmm3, xmm2                                ; x2s6
231    movsd   xmm5, __Lsinarray+20h                     ; s3
232    movsd   QWORD PTR p_temp[rsp], xmm4               ; store xx
233    movapd  xmm1, xmm2                                ; move for x4
234    mulsd   xmm1, xmm2                                ; x4
235    movsd   QWORD PTR p_temp1[rsp], xmm0              ; store x
236    mulsd   xmm5, xmm2                                ; x2s3
237    movapd  xmm4, xmm0                                ; move for x3
238    addsd   xmm3, __Lsinarray+40h                     ; s5+x2s6
239    mulsd   xmm1, xmm2                                ; x6
240    mulsd   xmm3, xmm2                                ; x2(s5+x2s6)
241    mulsd   xmm4, xmm2                                ; x3
242    addsd   xmm5, __Lsinarray+10h                     ; s2+x2s3
243    mulsd   xmm5, xmm2                                ; x2(s2+x2s3)
244    addsd   xmm3, __Lsinarray+30h                     ; s4 + x2(s5+x2s6)
245    mulsd   xmm2, L_one_half                              ; 0.5 *x2
246    movsd   xmm0, QWORD PTR p_temp[rsp]               ; load xx
247    mulsd   xmm3, xmm1                                ; x6(s4 + x2(s5+x2s6))
248    addsd   xmm5, __Lsinarray                         ; s1+x2(s2+x2s3)
249    mulsd   xmm2, xmm0                                ; 0.5 * x2 *xx
250    addsd   xmm3, xmm5                                ; zs
251    mulsd   xmm4, xmm3                                ; *x3
252    subsd   xmm4, xmm2                                ; x3*zs - 0.5 * x2 *xx
253    addsd   xmm0, xmm4                                ; +xx
254    addsd   xmm0, QWORD PTR p_temp1[rsp]              ; +x
255
256    jmp     Lcos_sse2_adjust_region
257
258ALIGN 16
259Lcos_sse2_calc_cos:
260;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
261; region 0 or 2     - do a cos calculation
262;  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
263    mulsd   xmm4, xmm0                                ; x*xx
264    movsd   xmm5, L_one_half
265    movsd   xmm1, __Lcosarray+50h                     ; c6
266    movsd   xmm0, __Lcosarray+20h                     ; c3
267    mulsd   xmm5, xmm2                                ; r = 0.5 *x2
268    movapd  xmm3, xmm2                                ; copy of x2
269    movsd   QWORD PTR p_temp[rsp], xmm4               ; store x*xx
270    mulsd   xmm1, xmm2                                ; c6*x2
271    mulsd   xmm0, xmm2                                ; c3*x2
272    subsd   xmm5, L_one                               ; -t=r-1.0, trash r
273    mulsd   xmm3, xmm2                                ; x4
274    addsd   xmm1, __Lcosarray+40h                     ; c5+x2c6
275    addsd   xmm0, __Lcosarray+10h                     ; c2+x2C3
276    addsd   xmm5, L_one                               ; 1 + (-t), trash t
277    mulsd   xmm3, xmm2                                ; x6
278    mulsd   xmm1, xmm2                                ; x2(c5+x2c6)
279    mulsd   xmm0, xmm2                                ; x2(c2+x2C3)
280    movapd  xmm4, xmm2                                ; copy of x2
281    mulsd   xmm4, L_one_half                              ; r recalculate
282    addsd   xmm1, __Lcosarray+30h                     ; c4 + x2(c5+x2c6)
283    addsd   xmm0, __Lcosarray                         ; c1+x2(c2+x2C3)
284    mulsd   xmm2, xmm2                                ; x4 recalculate
285    subsd   xmm5, xmm4                                ; (1 + (-t)) - r
286    mulsd   xmm1, xmm3                                ; x6(c4 + x2(c5+x2c6))
287    addsd   xmm0, xmm1                                ; zc
288    subsd   xmm4, L_one                               ; t relaculate
289    subsd   xmm5, QWORD PTR p_temp[rsp]               ; ((1 + (-t)) - r) - x*xx
290    mulsd   xmm0, xmm2                                ; x4 * zc
291    addsd   xmm0, xmm5                                ; x4 * zc + ((1 + (-t)) - r -x*xx)
292    subsd   xmm0, xmm4                                ; result - (-t)
293
294;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295
296Lcos_sse2_adjust_region:
297;      switch (region)
298    add     eax, 1
299    and     eax, 2
300    jz      Lcos_sse2_cleanup
301
302;; if the original region 1 or 2 then we negate the result.
303    movapd  xmm2, xmm0
304    xorpd   xmm0, xmm0
305    subsd   xmm0, xmm2
306
307ALIGN 16
308Lcos_sse2_cleanup:
309    StackDeallocate stack_size
310    ret
311
312
313
314
315
316
317ALIGN 16
318Lcos_sse2_absx_lt_piby4:
319;          cos = cos_piby4(x, 0.0);
320
321;  x2 = r * r;
322    cmp     r10, L_two_to_neg_13
323    jb      Lcos_sse2_x_small
324    movapd  xmm2, xmm0
325    mulsd   xmm2, xmm0                                ; x2
326
327;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
328; region 0 - do a cos calculation
329;  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
330    movsd   xmm1, __Lcosarray+10h                     ; c2
331    movapd  xmm4, xmm2                                ; move for x4
332    mulsd   xmm4, xmm2                                ; x4
333    movsd   xmm3, __Lcosarray+30h                     ; c4
334    mulsd   xmm1, xmm2                                ; c2x2
335    movsd   xmm5, __Lcosarray+50h                     ; c6
336    mulsd   xmm3, xmm2                                ; c4x2
337    movapd  xmm0, xmm4                                ; move for x8
338    mulsd   xmm5, xmm2                                ; c6x2
339    mulsd   xmm0, xmm4                                ; x8
340    addsd   xmm1, __Lcosarray                         ; c1 + c2x2
341    mulsd   xmm1, xmm4                                ; c1x4 + c2x6
342    addsd   xmm3, __Lcosarray+20h                     ; c3 + c4x2
343    mulsd   xmm2, L_neg_one_half                      ; -0.5x2, destroy xmm2
344    addsd   xmm5, __Lcosarray+40h                     ; c5 + c6x2
345    mulsd   xmm3, xmm0                                ; c3x8 + c4x10
346    mulsd   xmm4, xmm0                                ; x12
347    mulsd   xmm4, xmm5                                ; c5x12 + c6x14
348
349    movsd   xmm0, L_one
350    addsd   xmm1, xmm3                                ; c1x4 + c2x6 + c3x8 + c4x10
351    movapd  xmm3, xmm2                                ; preserve -0.5x2
352    addsd   xmm2, xmm0                                ; t = 1 - 0.5x2
353    subsd   xmm0, xmm2                                ; 1-t
354    addsd   xmm0, xmm3                                ; (1-t) - r
355    addsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
356    addsd   xmm0, xmm1                                ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
357    addsd   xmm0, xmm2                                ; 1 - 0.5x2 + above
358
359    StackDeallocate stack_size
360    ret
361
362ALIGN 16
363Lcos_sse2_x_small:
364    movsd   xmm2, xmm0
365    movsd   xmm0, L_one
366    cmp     r10, L_two_to_neg_27
367    jb      Lcos_sse2_x_smaller
368    mulsd   xmm2, xmm2
369    mulsd   xmm2, L_one_half
370    subsd   xmm0, xmm2
371    StackDeallocate stack_size
372    ret
373
374ALIGN 16
375Lcos_sse2_x_smaller:
376    movsd   xmm0, L_one
377    addsd   xmm0, L_int_one     ; really adding smallest subnormal; set inexact
378    StackDeallocate stack_size
379    ret
380
381ALIGN 16
382Lcos_reduce_precise:
383;   Reduce x into range [-pi/4, pi/4]
384    cmp     r10, L__inf_mask_64
385    jae     Lcos_x_naninf
386    call    __remainder_piby2_forAsm
387
388    ; At this point xmm0 has r, xmm1 has rr, rax has region
389
390    movapd  xmm4, xmm1                ; xmm4 <-- rr
391    jmp     Lcos_piby4
392
393; xmm0 = x, xmm4 = xx, eax= region
394
395
396ALIGN 16
397Lcos_x_naninf:
398    call    fname_special
399    StackDeallocate stack_size
400    ret
401
402;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
403; From this point we assume that FMA3 and AVX hardware are present.
404
405ALIGN 16
406L_cos_fma3:
407    vmovq        r9,xmm0
408    mov          rax,r9
409    and          r9,L_sign_mask           ; clear sign
410
411Lcos_early_exit_s_1:                   ;; unused label
412    cmp          r9,L_piby4
413    jg           Lcos_early_exit_s     ; Note that NaN will branch
414    cmp          r9,L_two_to_neg_13
415    jge          Lcompute_cos_pyby_4
416    cmp          r9,L_two_to_neg_27
417    jge          Lcompute_1_xx_5
418    vmovq        xmm0,L_one               ; for tiniest args, cos is 1
419    jmp          Lreturn_no_restore
420
421Lcompute_1_xx_5:
422    vmulsd       xmm1,xmm0,L_one_half     ; xmm1l <-- .5*x
423    vfnmadd213sd xmm0,xmm1,L_one          ; xmm0l <-- 1.0 - (.5*x)*x
424    jmp          Lreturn_no_restore
425
426Lcompute_cos_pyby_4:
427    ; make sure this is accurate enough
428    ; note that x^2 can't be all that close to 1 here
429    vmulsd       xmm3,xmm0,xmm0           ; xmm3 <-- xx = x*x
430    vmovapd      xmm0,__Lcosarray+050h    ; xmm0 <-- c5
431    vfmadd213sd  xmm0,xmm3,__Lcosarray+040h  ; xmm0 <-- c5*xx + c4
432    vfmadd213sd  xmm0,xmm3,__Lcosarray+030h  ; xmm0 <-- (c5*xx + c4)*xx + c3
433    vfmadd213sd  xmm0,xmm3,__Lcosarray+020h
434    vfmadd213sd  xmm0,xmm3,__Lcosarray+010h
435    vfmadd213sd  xmm0,xmm3,__Lcosarray
436    vfmsub213sd  xmm0,xmm3,L_one_half
437    vfmadd213sd  xmm0,xmm3,L_one
438
439    jmp          Lreturn_no_restore
440
441Lcos_early_exit_s:
442    mov          r8,L__inf_mask_64
443    and          rax,r8
444    cmp          rax, r8
445    jz           Lcos_x_naninf
446
447Lrange_reduce:
448    vmovq        xmm0,r9               ; r9 <-- |x|
449    cmp          r9,L_small_arg_bdl
450    jae          Lcos_remainder_piby2
451
452    ; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
453    ; on input
454    ;   x is in xmm0
455    ; on output
456    ;   r is in xmm0
457    ;   rr is in xmm1
458    ;   region is in rax
459
460    ; Boldo-Daumas-Li reduction for reasonably small |x|
461    call         __remainder_piby2_fma3_bdl
462
463;;      if region is 0 or 2    do a cos calc.
464;;      if region is 1 or 3    do a sin calc.
465Lcos_exit_s:
466    bt           rax,0
467    jc           Lsin_piby4_compute
468
469Lcos_piby4_compute:                    ;; unused label
470    ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
471    vmovapd      xmm2,L_one
472    vmulsd       xmm3,xmm0,xmm0        ; xmm3 <-- x * x
473    vmulsd       xmm5,xmm3,L_one_half      ; xmm5 <-- x*x*.5 == r
474    vsubsd       xmm4,xmm2,xmm5        ; xmm4 <-- t = 1. - x*x*.5
475    vsubsd       xmm2,xmm2,xmm4        ; 1-t
476    vsubsd       xmm2,xmm2,xmm5        ; xmm2 <-- (1-t) - r
477    vmovapd      xmm5,__Lcosarray+040h
478    vfnmadd231sd xmm2,xmm0,xmm1        ; (1.0 - t) - r) - x * xx) xmm2
479    vmulsd       xmm1,xmm3,xmm3           ; x2 * x2 xmm1
480    vfmadd231sd  xmm5,xmm3,__Lcosarray+050h
481    vfmadd213sd  xmm5,xmm3,__Lcosarray+030h
482    vfmadd213sd  xmm5,xmm3,__Lcosarray+020h
483    vfmadd213sd  xmm5,xmm3,__Lcosarray+010h
484    vfmadd213sd  xmm5,xmm3,__Lcosarray
485    vfmadd213sd  xmm5,xmm1,xmm2
486    vaddsd       xmm0,xmm5,xmm4
487
488    jmp          Lcos_exit_s_1
489
490ALIGN 16
491Lsin_piby4_compute:
492    ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
493    vmovapd      xmm5,__Lsinarray+040h
494    vmulsd       xmm3,xmm0,xmm0        ; xmm3 <-- x2 = x * x
495    vfmadd231sd  xmm5,xmm3,__Lsinarray+050h
496    vfmadd213sd  xmm5,xmm3,__Lsinarray+030h
497    vfmadd213sd  xmm5,xmm3,__Lsinarray+020h
498    vfmadd213sd  xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
499
500    vmulsd       xmm4,xmm0,xmm3        ; xmm4 <-- x3 = x*x*x
501    vmulsd       xmm2,xmm4,xmm5        ; xmm2 <-- x*x*x * r
502    vmulsd       xmm5,xmm1,L_one_half      ; xmm5 <-- .5*x*x
503    vsubsd       xmm2,xmm5,xmm2        ; xmm2 <-- .5*x*x - x*x*x*r
504    vmulsd       xmm2,xmm3,xmm2
505    vsubsd       xmm2,xmm2,xmm1
506    vfnmadd231sd xmm2, xmm4,__Lsinarray
507    vsubsd       xmm0,xmm0,xmm2
508
509Lcos_exit_s_1:
510    xor          r8,r8
511    add          eax, 1
512    and          eax, 2
513    cmovnz       r8, L_signbit
514    vmovq        xmm3,r8
515    vxorpd       xmm0,xmm0,xmm3
516
517Lreturn_restore_regs:
518    StackDeallocate stack_size
519    ret
520
521Lreturn_no_restore:
522    StackDeallocate stack_size
523    ret
524
525ALIGN 16
526Lcos_remainder_piby2:
527    ; argument reduction for general x
528    call         __remainder_piby2_fma3
529    jmp          Lcos_exit_s
530
531
532fname         endp
533END
534