xref: /reactos/sdk/lib/crt/math/libm_sse2/sinf.asm (revision 4afb647c)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25;
26; An implementation of the sinf function.
27;
28; Prototype
29;
30;     float sinf(float x);
31;
32;   Computes sinf(x).
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Based on the NAG C implementation.
36;
37
.const
ALIGN 16
; Each double constant is stored twice so the value fills a full XMM register.
L_signbit        DQ 08000000000000000h    ; sign bit of a double
                 DQ 08000000000000000h
L_sign_mask      DQ 07FFFFFFFFFFFFFFFh    ; everything but the sign bit: |x| mask
                 DQ 07FFFFFFFFFFFFFFFh
L_one            DQ 03FF0000000000000h    ; 1.0
                 DQ 03FF0000000000000h
L_int_three      DQ 00000000000000003h    ; integer 3: region (quadrant) mask
                 DQ 00000000000000003h
L_one_half       DQ 03FE0000000000000h    ; 0.5
                 DQ 03FE0000000000000h
L_twobypi        DQ 03FE45F306DC9C883h    ; 2/pi
                 DQ 03FE45F306DC9C883h
L_piby2_1        DQ 03FF921FB54400000h    ; pi/2, leading bits (truncated mantissa)
                 DQ 03FF921FB54400000h
L_one_sixth      DQ 03FC5555555555555h    ; 1/6
                 DQ 03FC5555555555555h
L_piby2_1tail    DQ 03DD0B4611A626331h    ; pi/2 - L_piby2_1, rounded
                 DQ 03DD0B4611A626331h
L_piby2_2        DQ 03dd0b4611a600000h    ; next chunk of pi/2 bits after L_piby2_1
                 DQ 03dd0b4611a600000h
L_piby2_2tail    DQ 03ba3198a2e037073h    ; pi/2 - L_piby2_1 - L_piby2_2
                 DQ 03ba3198a2e037073h
L_inf_mask_32    DD 07F800000h            ; single-precision exponent mask (Inf/NaN test)
                 DD 07F800000h
                 DQ 07F8000007F800000h
L_int_two        DQ 00000000000000002h    ; integer 2 (region compare)
                 DQ 00000000000000002h
L_piby2_lead     DQ 03ff921fb54442d18h    ; pi/2 correctly rounded to double
                 DQ 03ff921fb54442d18h
L_piby4          DQ 03fe921fb54442d18h    ; pi/4 correctly rounded to double
                 DQ 03fe921fb54442d18h
L_mask_3f2       DQ 03f20000000000000h    ; 2^-13 (small-argument threshold)
                 DQ 03f20000000000000h
L_mask_3f8       DQ 03f80000000000000h    ; 2^-7 (smaller-argument threshold)
                 DQ 03f80000000000000h

; Do these really need to be different?
L_large_x_fma3   DQ 04170008AC0000000h     ; 16779436
L_large_x_sse2   DQ 0416E848000000000h     ; 16000000

EXTRN __Lcosfarray:QWORD                   ; cosf polynomial coefficients c0..c4 (per usage below)
EXTRN __Lsinfarray:QWORD                   ; sinf polynomial coefficients s1..s4 (per usage below)
EXTRN __use_fma3_lib:DWORD                 ; nonzero selects the FMA3 code path
EXTRN __L_2_by_pi_bits:BYTE                ; byte table of 2/pi bits for large-argument reduction

; define local variable storage offsets
p_temp              EQU 010h     ; temporary for get/put bits operation
p_temp1             EQU 018h     ; temporary for get/put bits operation
region              EQU 020h     ; pointer to region for remainder_piby2
r                   EQU 028h     ; pointer to r for remainder_piby2
dummy_space         EQU 040h

stack_size          EQU 058h

include fm.inc

fname           TEXTEQU <sinf>
fname_special   TEXTEQU <_sinf_special>

;Define name and any external functions being called
EXTRN           __remainder_piby2d2f_forC : PROC    ; NEAR
EXTERN          fname_special        : PROC
.code
ALIGN 16
PUBLIC fname
;-----------------------------------------------------------------------
; float sinf(float x)
; ABI:   Microsoft x64 (argument in xmm0, result in xmm0)
;
; Dispatches at runtime between an SSE2 path and an FMA3 path based on
; __use_fma3_lib.  The input is widened to double, reduced into
; [-pi/4, pi/4] by subtracting multiples of pi/2 (tracking the quadrant
; "region"), evaluated with minimax sin/cos polynomials, sign-corrected,
; and rounded back to single precision.  NaN/Inf inputs go to
; fname_special.
;
; FIX: after the __remainder_piby2d2f_forC call in the SSE2 large-x
; path, the reduced argument must be loaded into xmm0 (which the
; following Taylor code consumes and which is volatile across the
; call), not xmm1.
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Lsinf_fma3

Lsinf_sse2:

    xorpd   xmm2, xmm2                                ; zeroed out for later use

;;  if NaN or inf
    movd    edx, xmm0
    mov     eax, 07f800000h
    mov     r10d, eax
    and     r10d, edx
    cmp     r10d, eax
    jz      Lsinf_sse2_naninf

; GET_BITS_DP64(x, ux);
; get the input value to an integer register.
    cvtss2sd     xmm0, xmm0               ; convert input to double.
    movd         rdx, xmm0                ; rdx is ux

;  ax = (ux & ~SIGNBIT_DP64);
    mov     r10, rdx
    btr     r10, 63                                   ; r10 is ax
    mov     r8d, 1                                    ; for determining region later on

;;  if (ax <= 0x3fe921fb54442d18)  abs(x) <= pi/4
    mov     rax, 03fe921fb54442d18h
    cmp     r10, rax
    jg      Lsinf_absx_gt_piby4

;;      if (ax < 0x3f80000000000000)  abs(x) < 2.0^(-7)
    mov     rax, 3f80000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_small

;;          if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
    mov     rax, 3f20000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_smaller

;                  sinf = x;  (to this precision, for |x| < 2^-13)
    jmp     Lsinf_sse2_cleanup

ALIGN 16
Lsinf_sse2_smaller:
;   sinf = x - x^3 * 0.1666666666666666666;
    movsd   xmm2, xmm0
    movsd   xmm4, QWORD PTR L_one_sixth   ; 0.1666666666666666666
    mulsd   xmm2, xmm2                    ; x^2
    mulsd   xmm2, xmm0                    ; x^3
    mulsd   xmm2, xmm4                    ; x^3 * 0.1666666666666666666
    subsd   xmm0, xmm2                    ; x - x^3 * 0.1666666666666666666
    jmp     Lsinf_sse2_cleanup

ALIGN 16
Lsinf_sse2_small:
    movsd   xmm2, xmm0                    ; x2 = r * r;
    mulsd   xmm2, xmm0                    ; x2

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2     - do a sinf calculation
;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
    mulsd   xmm1, xmm2                                ; s4x2
    movsd   xmm4, xmm2                                ; move for x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm3, xmm0                                ; move for x3
    mulsd   xmm5, xmm2                                ; s2x2
    mulsd   xmm3, xmm2                                ; x3
    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
    mulsd   xmm1, xmm4                                ; s3x4+s4x6
    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_cleanup

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_absx_gt_piby4:
;  xneg = (ax != ux);
    cmp     rdx, r10
    mov     r11d, 0
;;  if (xneg) x = -x;
    jz      Lsinf_sse2_reduce_moderate

    mov     r11d, 1
    subsd   xmm2, xmm0                    ; xmm2 was zeroed above: 0 - x = -x
    movsd   xmm0, xmm2

Lsinf_sse2_reduce_moderate:
;;  if (x < 16000000.0)   (L_large_x_sse2)
    cmp     r10, QWORD PTR L_large_x_sse2
    jae     Lsinf_sse2_reduce_large

; reduce  the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
    movsd   xmm2, xmm0
    movsd   xmm3, QWORD PTR L_twobypi
    movsd   xmm4, xmm0
    movsd   xmm5, QWORD PTR L_one_half              ; .5
    mulsd   xmm2, xmm3

;/* How many pi/2 is x a multiple of? */
;      xexp  = ax >> EXPSHIFTBITS_DP64;
    mov     r9, r10
    shr     r9, 52                                    ; >>EXPSHIFTBITS_DP64

;        npi2  = (int)(x * twobypi + 0.5);
    addsd   xmm2, xmm5                        ; npi2

    movsd   xmm3, QWORD PTR L_piby2_1
    cvttpd2dq  xmm0, xmm2                             ; convert to integer
    movsd   xmm1, QWORD PTR L_piby2_1tail
    cvtdq2pd   xmm2, xmm0                             ; and back to double.

;      /* Subtract the multiple from x to get an extra-precision remainder */
;      rhead  = x - npi2 * piby2_1;
    mulsd   xmm3, xmm2
    subsd   xmm4, xmm3                                ; rhead

;      rtail  = npi2 * piby2_1tail;
    mulsd   xmm1, xmm2
    movd    eax, xmm0                                 ; eax <-- npi2 (region)

;      GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
    movsd   xmm0, xmm4
    subsd   xmm0, xmm1

    movsd   xmm3, QWORD PTR L_piby2_2
    movd    rcx, xmm0
    movsd   xmm5, QWORD PTR L_piby2_2tail

;    xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl     rcx, 1                                    ; strip any sign bit
    shr     rcx, 53                                   ; >> EXPSHIFTBITS_DP64 +1
    sub     r9, rcx                                   ; expdiff

;;      if (expdiff > 15)
    cmp     r9, 15
    jle     Lsinf_sse2_expdiff_le_15

;       The remainder is pretty small compared with x, which
;       implies that x is a near multiple of pi/2
;       (x matches the multiple to at least 15 bits)
;       Redo the reduction with the next two chunks of pi/2 bits.
;          t  = rhead;
    movsd   xmm1, xmm4

;          rtail  = npi2 * piby2_2;
    mulsd   xmm3, xmm2

;          rhead  = t - rtail;
    mulsd   xmm5, xmm2                                ; npi2 * piby2_2tail
    subsd   xmm4, xmm3                                ; rhead

;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd   xmm1, xmm4                                ; t - rhead
    subsd   xmm1, xmm3                                ; -rtail
    subsd   xmm5, xmm1                                ; rtail

;      r = rhead - rtail;
    movsd   xmm0, xmm4

;HARSHA
;xmm1=rtail
    movsd   xmm1, xmm5
    subsd   xmm0, xmm5

;    xmm0=r, xmm4=rhead, xmm1=rtail

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_expdiff_le_15:
    cmp     rcx, 03f2h                                ; is |r| >= 2^-13 ?
    jge     Lsinf_sse2_calc_sincosf_piby4             ; use taylor series if so
    cmp     rcx, 03deh                                ; if r really small.
    jle     Lsinf_sse2_r_very_small                   ; then sinf(r) ~ r or 1

    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                                ; xmm2 <-- r^2

;;      if region is 1 or 3 (odd) do a cosf calc, else fall through to sinf.
    and     r8d, eax
    jnz     Lsinf_sse2_small_calc_sin

; region 0 or 2 do a sinf calculation
; use simple polynomial
;              x - x*x*x*0.166666666666666666;
    movsd   xmm3, QWORD PTR L_one_sixth
    mulsd   xmm3, xmm0                                ; * x
    mulsd   xmm3, xmm2                                ; * x^2
    subsd   xmm0, xmm3                                ; xs
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_small_calc_sin:
; region 1 or 3 do a cosf calculation
; use simple polynomial
;              1.0 - x*x*0.5;
    movsd   xmm0, QWORD PTR L_one  ; 1.0
    mulsd   xmm2, QWORD PTR L_one_half              ; 0.5 *x^2
    subsd   xmm0, xmm2                                ; xc
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_r_very_small:
;;      if region is 0 or 2    do a sinf calc. (sinf ~ x; xmm0 already holds r)
    and     r8d, eax
    jz      Lsinf_sse2_adjust_region

    movsd   xmm0, QWORD PTR L_one  ; cosf(r) is a 1
    jmp     Lsinf_sse2_adjust_region

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_sse2_reduce_large:
;      Reduce x into range [-pi/4, pi/4]
;      __remainder_piby2d2f_forC(x, &r, &region);
;      (Win64 call: arg1 = rcx = bits of x, arg2 = rdx = &r, arg3 = r8 = &region;
;       r11 holds xneg and must survive the call, so park it in p_temp.)

    mov          QWORD PTR p_temp[rsp], r11
    lea          rdx, QWORD PTR r[rsp]
    lea          r8,  QWORD PTR region[rsp]
    movd         rcx, xmm0
    call         __remainder_piby2d2f_forC
    mov          r11, QWORD PTR p_temp[rsp]
    mov          r8d, 1                   ; for determining region later on
    movsd        xmm0, QWORD PTR r[rsp]   ; x (must be xmm0: consumed below, and
                                          ;  xmm0 is volatile across the call)
    mov          eax,  DWORD PTR region[rsp] ; region

; xmm0 = x, r8d = 1, eax= region
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; perform taylor series to calc sinfx, cosfx
Lsinf_sse2_calc_sincosf_piby4:
;  x2 = r * r;
    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                                ; x2

;;      if region is 1 or 3, do a cosf calc.
    and     r8d, eax
    jnz     Lsinf_sse2_do_cosf_calc

; region is 0 or 2: do a sinf calc.
;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
Lsinf_sse2_do_sinf_calc:
    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
    mulsd   xmm1, xmm2                                ; s4x2
    movsd   xmm4, xmm2                                ; move for x4
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
    mulsd   xmm5, xmm2                                ; s2x2
    movsd   xmm3, xmm0                                ; move for x3
    mulsd   xmm3, xmm2                                ; x3
    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
    mulsd   xmm1, xmm4                                ; s3x4+s4x6
    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_do_cosf_calc:

; region 1 or 3     - do a cosf calculation
;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
    movsd   xmm4, xmm2                                ; move for x4
    mulsd   xmm1, xmm2                                ; c4x2
    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
    mulsd   xmm3, xmm2                                ; c2x2
    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
    addsd   xmm0, QWORD PTR L_one                     ; 1 - 0.5x2
    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


Lsinf_sse2_adjust_region:
; positive or negative
;      switch (region)
; bit0 of eax below = (region>>1) XNOR xneg; set => result sign already right
    shr     eax, 1
    mov     ecx, eax
    and     eax, r11d

    not     ecx
    not     r11d
    and     ecx, r11d

    or      eax, ecx
    and     eax, 1
    jnz     Lsinf_sse2_cleanup

;; if the original region 0, 1 and arg is negative, then we negate the result.
;; if the original region 2, 3 and arg is positive, then we negate the result.
    movsd   xmm2, xmm0
    xorpd   xmm0, xmm0
    subsd   xmm0, xmm2


Lsinf_sse2_cleanup:
    cvtsd2ss xmm0, xmm0                   ; round result back to single
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_sse2_naninf:
    call    fname_special
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_fma3:
    vmovd        eax,xmm0
    mov          r8d,L_inf_mask_32
    and          eax,r8d
    cmp          eax, r8d
    jz           Lsinf_fma3_naninf

    vcvtss2sd    xmm5,xmm0,xmm0
    vmovq        r9,xmm5
    btr          r9,63                    ; r9 <-- |x|
    cmp          r9,L_piby4
    jg           Lsinf_fma3_range_reduce

    cmp          r9,L_mask_3f8            ; 2^-7
    jge          Lsinf_fma3_compute_sinf_piby_4

    cmp          r9,L_mask_3f2            ; 2^-13
    jge          Lsinf_fma3_compute_x_xxx_0_1666

    ; Here |x| < 2^-13; just return sin x ~ x
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_fma3_compute_x_xxx_0_1666:
    ; Here |x| < 2^-7; return sin x ~ x - 1/6 x^3
    vmulsd       xmm1,xmm5,xmm5           ; xmm1 <-- x^2
    vmulsd       xmm0,xmm1,xmm5           ; xmm0 <-- x^3
    vfnmadd132sd xmm0,xmm5,L_one_sixth    ; xmm0 <-- x - x^3*0.166666666666666666
    jmp          Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_compute_sinf_piby_4:
    vmovapd      xmm0,xmm5
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0           ; xmm3 <-- x^2
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+08h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3          ; x + x^3*poly
    jmp          Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_range_reduce:
    vmovq        xmm0,r9                  ; xmm0 <-- |x|
    cmp          r9,L_large_x_fma3
    jge          Lsinf_fma3_reduce_large

Lsinf_fma3_sinf_reduce_moderate:
    vandpd       xmm1,xmm0,L_sign_mask    ; xmm1 <-- |x| (already positive; a move would do)
    vmovapd      xmm2,L_twobypi
    vfmadd213sd  xmm2,xmm1,L_one_half     ; xmm2 <-- |x|*2/pi + 0.5
    vcvttpd2dq   xmm2,xmm2                ; npi2 = (int)(|x|*2/pi + 0.5)
    vpmovsxdq    xmm1,xmm2
    vandpd       xmm4,xmm1,L_int_three    ; xmm4 <-- region = npi2 & 3
    vshufps      xmm1 ,xmm1,xmm1,8
    vcvtdq2pd    xmm1,xmm1                ; xmm1 <-- (double)npi2
    vmovdqa      xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1      ; xmm2 <-- rhead = |x| - npi2*piby2_1
    vmulsd       xmm3,xmm1,L_piby2_1tail  ; xmm3 <-- rtail
    vsubsd       xmm0,xmm2,xmm3           ; xmm0 <-- r_1
    vsubsd       xmm2,xmm2,xmm0
    vsubsd       xmm1,xmm2,xmm3           ; xmm1 <-- rr_1
    jmp          Lsinf_fma3_exit_s

ALIGN 16
Lsinf_fma3_reduce_large:
; Payne-Hanek style reduction using stored bits of 2/pi
    lea          r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) - 1023
    vmovq        r11,xmm0
    mov          rcx,r11
    shr          r11,52
    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3)
    mov          r10,r11
    shr          r10,3
    sub          r10,134 ;r10 = -last
    neg          r10 ;r10 = last = 134 - (xexp >> 3)
    ;load 64 bits of 2_by_pi
    mov          rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl          rcx,12
    shr          rcx,12 ;rcx = mantissa part of input x
    bts          rcx,52 ;add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add          r10,8 ;increment to next 8 bytes of 2_by_pi
    vmovdqu      xmm0,XMMWORD PTR[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul          rcx
    mov          r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    ;resexp = xexp & 7
    and          r11,7                    ; r11 <-- resexp = last 3 bits
    psrldq       xmm0,8
    add          rax,r10                  ; add the previous carry
    adc          rdx,0
    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    add          r10,rax                  ; r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 - resexp)
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov          rcx,54
    mov          rax,r10
    sub          rcx,r11
    xor          rdx,rdx ;rdx = sign of result (0 => +; set below if we round up)
    shr          rax,cl                   ; CF = point_five bit
    jnc          Lsinf_fma3_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    not          r10
    not          r9
    not          r8
    mov          rdx,08000000000000000h

Lsinf_fma3_no_point_five_f:
    adc          rax,0                    ; add point_five into region
    and          rax,3
    vmovd        xmm4,eax ;store region to xmm4
    ;calculate the number of integer bits and zero them out
    mov          rcx,r11
    add          rcx,10                   ; rcx <-- no. of integer bits
    shl          r10,cl
    shr          r10,cl                   ; r10 contains only mant bits
    sub          rcx,64                   ; form the exponent
    mov          r11,rcx
    ;find the highest set bit
    bsr          rcx,r10
    jnz          Lsinf_fma3_form_mantissa_f
    mov          r10,r9
    mov          r9,r8
    mov          r8,0
    bsr          rcx,r10                  ; rcx <-- hsb
    sub          r11,64

Lsinf_fma3_form_mantissa_f:
    add          r11,rcx ;for exp of x
    sub          rcx,52 ;rcx = no. of bits to shift in r10
    cmp          rcx,0
    jl           Lsinf_fma3_hsb_below_52_f
    je           Lsinf_fma3_form_numbers_f
    ;hsb above 52
    mov         r8,r10                    ; previous contents of r8 not required
    shr         r10,cl                    ; r10 = mantissa of x with hsb at 52
    shr         r9,cl                     ; make space for bits from r10
    sub         rcx,64
    neg         rcx
    shl         r8,cl
    or          r9,r8                     ; r9 = mantissa bits of xx
    jmp         Lsinf_fma3_form_numbers_f

ALIGN 16
Lsinf_fma3_hsb_below_52_f:
    neg          rcx
    mov          rax,r9
    shl          r10,cl
    shl          r9,cl
    sub          rcx,64
    neg          rcx
    shr          rax,cl
    or           r10,rax
    shr          r8,cl
    or           r9,r8

ALIGN 16
Lsinf_fma3_form_numbers_f:
    add          r11,1023                 ; bias the exponent
    btr          r10,52                   ; remove the implied bit
    mov          rcx,r11
    or           r10,rdx                  ; put the sign
    shl          rcx,52
    or           r10,rcx                  ; r10 <-- x
    vmovq        xmm0,r10                 ; xmm0 <-- x
    vmulsd       xmm0,xmm0,L_piby2_lead   ; scale fraction of pi/2 back to radians
Lsinf_fma3_exit_s:
    vmovq        rax,xmm4
    and          rax,01h
    cmp          rax,01h
    jz           Lsinf_fma3_cos_piby4_compute

Lsinf_fma3_sin_piby4_compute:
;;    vmovapd      xmm1,__Lsinfarray+010h
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3           ; x + x^3*poly
    jmp          Lsinf_fma3_exit_s_1

ALIGN 16
Lsinf_fma3_cos_piby4_compute:
    vmovapd      xmm2,L_one
    vmulsd       xmm3,xmm0,xmm0
    vfmadd231sd  xmm2,xmm3,__Lcosfarray   ; xmm2 <-- 1 + c0 x^2
    ; would simple Horner's be slower?
    vmovsd       xmm1,__Lcosfarray+018h       ; xmm1 <-- c3
    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h  ; xmm1 <--  c4 x^2+ c3
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h  ; xmm1 <--  (c4 x^2+ c3)x^2 + c2
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h  ; xmm1 <--  ((c4 x^2+ c3)x^2 + c2)x^2 + c1
    vmulsd       xmm3,xmm3,xmm3               ; xmm3 <-- x^4
    vmovdqa      xmm0,xmm2
    vfmadd231sd  xmm0,xmm1,xmm3
Lsinf_fma3_exit_s_1:
    ; assuming FMA3 ==> AVX ==> SSE4.1
    ; Final sign = (region in {2,3}) XOR (input x < 0).  Both masks below are
    ; built with vandnpd (i.e. inverted), and the two inversions cancel in the
    ; xor chain: (~a)^(~b) == a^b.
    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
    vpcmpeqq     xmm3,xmm4,XMMWORD PTR L_int_three
    vorpd        xmm3,xmm2,xmm3
    vandnpd      xmm3,xmm3,L_signbit
    vxorpd       xmm0,xmm0,xmm3

    vandnpd      xmm1,xmm5,L_signbit
    vxorpd       xmm0,xmm1,xmm0
Lsinf_fma3_return_sinf_s:
    vcvtsd2ss    xmm0,xmm0,xmm0           ; round result back to single
    StackDeallocate stack_size
    ret

Lsinf_fma3_naninf:
    call         fname_special
    StackDeallocate stack_size
    ret

fname endp
END
665