xref: /reactos/sdk/lib/crt/math/libm_sse2/cosf.asm (revision 77e6348f)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25; An implementation of the cosf function.
26;
27; Prototype:
28;
29;     float cosf(float x);
30;
31;   Computes cosf(x).
32;   Based on the NAG C implementation.
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Original Author: Harsha Jagasia
36
37.const
38ALIGN 16
39L_real_one                 DQ 03ff0000000000000h      ; 1.0
40                           DQ 0                          ; for alignment
41L_one_half                 DQ 03fe0000000000000h      ; 0.5
42                           DQ 0
43L_2bypi                    DQ 03fe45f306dc9c883h      ; 2./pi
44                           DQ 0
45L_one_sixth                DQ 03fc5555555555555h      ; 0.166666666666
46                           DQ 0
47L_piby2                    DQ 03fe921fb54442d18h
48                           DQ 0
49L_piby2_1                  DQ 03ff921fb54400000h     ; piby2_1
50                           DQ 0
51L_piby2_1tail              DQ 03dd0b4611a626331h     ; piby2_1tail
52                           DQ 0
53L_piby2_2                  DQ 03dd0b4611a600000h     ; piby2_2
54                           DQ 0
55L_piby2_2tail              DQ 03ba3198a2e037073h     ; piby2_2tail
56                           DQ 0
57L_large_x_sse2             DQ 0411E848000000000h     ; 5e5
58                           DQ 0
59L_large_x_fma3             DQ 041E921FB60000000h     ; 3.37325952e9
60                           DQ 0
61L_sign_mask                DQ 07FFFFFFFFFFFFFFFh
62                           DQ 07FFFFFFFFFFFFFFFh
63L__int_three               DQ 00000000000000003h
64                           DQ 00000000000000003h
65L__min_norm_double         DQ 00010000000000000h
66                           DQ 00010000000000000h
67L_two_to_neg_7             DQ 03f80000000000000h
68                           DQ 0
69L_two_to_neg_13            DQ 03f20000000000000h
70                           DQ 0
71L_inf_mask_32              DD 07F800000h
72                           DQ 0
73
74fname           TEXTEQU <cosf>
75fname_special   TEXTEQU <_cosf_special>
76
77;Define name and any external functions being called
78EXTERN           __remainder_piby2d2f_forAsm : PROC    ; NEAR
79EXTERN           __remainder_piby2_fma3_bdl  : PROC   ; NEAR
80EXTERN           __remainder_piby2_fma3      : PROC   ; NEAR
81EXTERN           fname_special      : PROC
82EXTERN           _set_statfp        : PROC
83
84
85EXTRN __Lcosfarray:QWORD
86EXTRN __Lsinfarray:QWORD
87EXTRN __use_fma3_lib:DWORD
88
89; define local variable storage offsets
90p_temp           equ        020h          ; temporary for get/put bits operation
91p_temp1          equ        030h          ; temporary for get/put bits operation
92dummy_space      EQU        040h
93stack_size       EQU        068h
94
95include fm.inc
96
97.code
98
99ALIGN 16
100PUBLIC fname
101fname PROC FRAME
102    StackAllocate stack_size
103    .ENDPROLOG
104    cmp          DWORD PTR __use_fma3_lib, 0
105    jne          Lcosf_fma3
106
107Lcosf_sse2:
108
109    xorpd        xmm2, xmm2               ; zeroed out for later use
110
111;;  if NaN or inf
112    movd         edx, xmm0
113    mov          eax, 07f800000h
114    mov          r10d, eax
115    and          r10d, edx
116    cmp          r10d, eax
117    jz           Lcosf_sse2_naninf
118
119    cvtss2sd     xmm0, xmm0
120    movd         rdx, xmm0
121
122;  ax = (ux & ~SIGNBIT_DP64);
123    mov          r10, rdx
124    btr          r10, 63                  ; r10 <-- |x|
125    mov          r8d, 1                   ; for determining region later on
126
127    movapd       xmm1, xmm0               ; xmm1 <-- copy of x
128
129
130;;  if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
131    mov          rax, 03fe921fb54442d18h
132    cmp          r10, rax
133    jg           Lcosf_sse2_absx_gt_piby4
134
135;          *c = cos_piby4(x, 0.0);
136    movapd       xmm2, xmm0
137    mulsd        xmm2, xmm2        ;x^2
138    xor          eax, eax
139    mov          rdx, r10
140    movsd        xmm5, QWORD PTR L_one_half
141    jmp          Lcosf_sse2_calc_sincosf_piby4        ; done
142
143
144ALIGN 16
145Lcosf_sse2_absx_gt_piby4:
146; reduce  the argument to be in a range from -pi/4 to +pi/4
147; by subtracting multiples of pi/2
148;  xneg = (ax != ux);
149    movd         xmm0, r10                ; xmm0 <-- |x|
150    cmp          r10, QWORD PTR L_large_x_sse2
151    jae          Lcosf_sse2_reduce_precise
152
153;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
154; xmm0=abs(x), xmm1=x
155;/* How many pi/2 is x a multiple of? */
156
157    movapd       xmm2, xmm0
158    movsd        xmm3, QWORD PTR L_2bypi
159    movapd       xmm4, xmm0
160    movsd        xmm5, QWORD PTR L_one_half
161    mulsd        xmm2, xmm3
162
163;   movsd        xmm5, QWORD PTR L_one_half
164;   movapd       xmm2, xmm0
165;   mulsd        xmm2, QWORD PTR L_2bypi
166;   movapd       xmm4, xmm0
167
168    mov     r9, r10
169    shr     r9, 52                        ; r9 <-- biased exponent of x
170
171;        npi2  = (int)(x * twobypi + 0.5);
172    addsd   xmm2, xmm5                          ; npi2
173
174    movsd        xmm3, QWORD PTR L_piby2_1      ; piby2_1
175    cvttpd2dq    xmm0, xmm2                     ; xmm0 <-- npi2
176    movsd        xmm1, QWORD PTR L_piby2_1tail  ; piby2_1tail
177    cvtdq2pd     xmm2, xmm0                     ; xmm2 <-- (double)npi2
178
179;    Subtract the multiple from x to get an extra-precision remainder
180;      rhead  = x - npi2 * piby2_1;
181
182    mulsd        xmm3, xmm2                     ; use piby2_1
183    subsd        xmm4, xmm3                     ; rhead
184
185;      rtail  = npi2 * piby2_1tail;
186    mulsd        xmm1, xmm2                     ; rtail
187    movd         eax, xmm0
188
189; GET_BITS_DP64(rhead-rtail, uy);
190; originally only rhead
191    movapd       xmm0, xmm4
192    subsd        xmm0, xmm1
193
194    movsd        xmm3, QWORD PTR L_piby2_2      ; piby2_2
195    movd         rcx, xmm0                      ; rcx <-- rhead-rtail
196    movsd        xmm5, QWORD PTR L_piby2_2tail  ; piby2_2tail
197
198;      region = npi2 & 3;
199;    and        eax, 3
200;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
201    shl          rcx, 1                         ; strip any sign bit
202    shr          rcx, 53                        ; >> EXPSHIFTBITS_DP64 +1
203    sub          r9, rcx                        ; expdiff
204
205;;      if (expdiff > 15)
206    cmp          r9, 15
207    jle          Lcosf_sse2_expdiff_le_15
208
209; The remainder is pretty small compared with x, which
210; implies that x is a near multiple of pi/2
211; (x matches the multiple to at least 15 bits)
212;          t  = rhead;
213    movapd       xmm1, xmm4
214
215;          rtail  = npi2 * piby2_2;
216    mulsd        xmm3, xmm2
217
218;          rhead  = t - rtail;
219    mulsd        xmm5, xmm2                     ; npi2 * piby2_2tail
220    subsd        xmm4, xmm3                     ; rhead
221
222;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
223    subsd        xmm1, xmm4                     ; t - rhead
224    subsd        xmm1, xmm3                     ; -rtail
225    subsd        xmm5, xmm1                     ; rtail
226
227;      r = rhead - rtail;
228    movapd       xmm0, xmm4
229
230;HARSHA
231;xmm1=rtail
232    movapd       xmm1, xmm5
233    subsd        xmm0, xmm5
234
235;    xmm0=r, xmm4=rhead, xmm1=rtail
236
237;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
238Lcosf_sse2_expdiff_le_15:
239    cmp          rcx, 03f2h                     ; is r < 2^-13 ?
240    jge          Lcosf_sse2_calc_sincosf_piby4  ; use taylor series if not
241    cmp          rcx, 03deh                     ; is r < 2^-33 ?
242    jle          Lcosf_sse2_r_very_small        ; then cosf(r) ~ 1 or r
243
244    movapd       xmm2, xmm0
245    mulsd        xmm2, xmm0                     ; xmm2 <-- x^2
246
247;;      if region is 1 or 3 do a sinf calc.
248    and          r8d, eax
249    jz           Lcosf_sse2_r_small_calc_sin
250
251Lcosf_sse2_r_small_calc_cos:
252; region 1 or 3
253; use simply polynomial
254;              *s = x - x*x*x*0.166666666666666666;
255    movsd        xmm3, QWORD PTR L_one_sixth
256    mulsd        xmm3, xmm0                     ; * x
257    mulsd        xmm3, xmm2                     ; * x^2
258    subsd        xmm0, xmm3                     ; xs
259    jmp          Lcosf_sse2_adjust_region
260
261ALIGN 16
262Lcosf_sse2_r_small_calc_sin:
263; region 0 or 2
264;              cos = 1.0 - x*x*0.5;
265    movsd        xmm0, QWORD PTR L_real_one     ; 1.0
266    mulsd        xmm2, QWORD PTR L_one_half     ; 0.5 *x^2
267    subsd        xmm0, xmm2
268    jmp          Lcosf_sse2_adjust_region
269
270ALIGN 16
271Lcosf_sse2_r_very_small:
272; then sin(r) = r
273; if region is 1 or 3    do a sin calc.
274    and          r8d, eax
275    jnz          Lcosf_sse2_adjust_region
276
277    movsd        xmm0, QWORD PTR L_real_one  ; cosf(r) is a 1
278    ; By this point, calculations should already have set inexact
279    jmp          Lcosf_sse2_adjust_region
280
281;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282ALIGN 16
283Lcosf_sse2_reduce_precise:
284;      Reduce abs(x) into range [-pi/4, pi/4]
285;      remainder_piby2d2f(ax, &r, &region);
286    mov          QWORD PTR p_temp[rsp], rdx     ; save ux for use later
287    mov          QWORD PTR p_temp1[rsp], r10    ; save ax for use later
288
289    call         __remainder_piby2d2f_forAsm
290    mov          rdx, QWORD PTR p_temp[rsp]     ; restore ux for use later
291    mov          r10, QWORD PTR p_temp1[rsp]    ; restore ax for use later
292    mov          r8d, 1                         ; for determining region later
293
294    ; Reduced argument is in xmm0.  No second word; after all, we started in
295    ; single precision.  Region is in rax.
296    movapd       xmm1, xmm0
297    movsd        xmm5, QWORD PTR L_one_half
298
299    jmp          Lcosf_sse2_calc_sincosf_piby4
300
301
302; done with reducing the argument.  Now perform the sin/cos calculations.
303ALIGN 16
304Lcosf_sse2_calc_sincosf_piby4:
305    movapd       xmm2, xmm0
306    mulsd        xmm2, xmm0                     ; x^2
307
308;;       if region is 0 or 2, do a cosf calc
309    and          r8d, eax
310    jz           Lcosf_sse2_do_cosf_calc
311;   region is 1 or 3: do a sinf calc.
312Lcosf_sse2_do_sinf_calc:
313    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
314    mulsd   xmm1, xmm2                                ; s4x2
315    movsd   xmm4, xmm2                                ; move for x4
316    mulsd   xmm4, xmm2                                ; x4
317    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
318    mulsd   xmm5, xmm2                                ; s2x2
319    movsd   xmm3, xmm0                                ; move for x3
320    mulsd   xmm3, xmm2                                ; x3
321    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
322    mulsd   xmm1, xmm4                                ; s3x4+s4x6
323    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
324    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
325    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
326    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
327    jmp     Lcosf_sse2_adjust_region
328
329ALIGN 16
330Lcosf_sse2_do_cosf_calc:
331; region 0 or 2     - do a cos calculation
332;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
333;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
334    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
335    movsd   xmm4, xmm2                                ; move for x4
336    mulsd   xmm1, xmm2                                ; c4x2
337    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
338    mulsd   xmm4, xmm2                                ; x4
339    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
340    mulsd   xmm3, xmm2                                ; c2x2
341    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
342    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
343    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
344    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
345    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
346    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
347    addsd   xmm0, QWORD PTR L_real_one                ; 1 - 0.5x2
348    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
349
350Lcosf_sse2_adjust_region:
351; xmm1 is cos or sin, relies on previous sections to
352;      switch (region)
353    add          eax, 1
354    and          eax, 2
355    jz           Lcosf_sse2_cleanup
356;; if region 1 or 2 then we negate the result.
357    xorpd        xmm2, xmm2
358    subsd        xmm2, xmm0
359    movapd       xmm0, xmm2
360
361ALIGN 16
362Lcosf_sse2_cleanup:
363    cvtsd2ss     xmm0, xmm0
364    StackDeallocate stack_size
365    ret
366
367
368Lcosf_sse2_naninf:
369    call         fname_special
370    StackDeallocate stack_size
371    ret
372
373
374ALIGN 16
375Lcosf_fma3:
376    vmovd        eax,xmm0
377    mov          r8d,L_inf_mask_32
378    and          eax,r8d
379    cmp          eax, r8d
380    jz           Lcosf_fma3_naninf
381
382    vcvtss2sd    xmm5,xmm0,xmm0
383    vmovq        r9,xmm5
384    btr          r9,63                    ;clear sign
385
386    cmp          r9,L_piby2
387    jg           Lcosf_fma3_range_reduce
388    cmp          r9,L_two_to_neg_7
389    jge          Lcosf_fma3_compute_cosf_piby_4
390    cmp          r9,L_two_to_neg_13
391    jge          Lcosf_fma3_compute_1_xx_5
392
393    vmovq        xmm0,QWORD PTR L_real_one
394    ; Here we need to set inexact
395    vaddsd       xmm0,xmm0,L__min_norm_double  ; this will set inexact
396    jmp          Lcosf_fma3_return
397
398ALIGN 16
399Lcosf_fma3_compute_1_xx_5:
400    vmulsd       xmm0,xmm5,QWORD PTR L_one_half
401    vfnmadd213sd xmm0,xmm5,L_real_one           ; xmm9 1.0 - x*x*(double2)0.5
402    jmp          Lcosf_fma3_return
403
404ALIGN 16
405Lcosf_fma3_compute_cosf_piby_4:
406    movsd        xmm0,xmm5
407    vmovapd      xmm2,L_real_one
408    vmulsd       xmm3,xmm0,xmm0
409    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
410    vsubsd       xmm2,xmm2,xmm1
411    vmovsd       xmm1,__Lcosfarray+018h
412    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h
413    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h
414    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h
415    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
416    vmovdqa      xmm0,xmm2
417    vfmadd231sd  xmm0,xmm1,xmm3
418    jmp          Lcosf_fma3_return
419
420ALIGN 16
421Lcosf_fma3_range_reduce:
422    vmovq        xmm0,r9                        ; xmm0 <-- |x|
423    cmp          r9,L_large_x_fma3
424    jge          Lcosf_reduce_precise
425
426;cosff_range_e_5_s:
427    vandpd       xmm1,xmm0,L_sign_mask
428    vmovapd      xmm2,L_2bypi
429    vfmadd213sd  xmm2,xmm1,L_one_half
430    vcvttpd2dq   xmm2,xmm2
431    vpmovsxdq    xmm1,xmm2
432    vandpd       xmm4,xmm1,L__int_three         ; region xmm4
433    vshufps      xmm1 ,xmm1,xmm1,8
434    vcvtdq2pd    xmm1,xmm1
435    vmovdqa      xmm2,xmm0
436    vfnmadd231sd xmm2,xmm1,L_piby2_1            ; xmm2 rhead
437    vmulsd       xmm3,xmm1,L_piby2_1tail        ; xmm3 rtail
438    vsubsd       xmm0,xmm2,xmm3                 ; r_1  xmm0
439    vsubsd       xmm2,xmm2,xmm0
440    vsubsd       xmm1,xmm2,xmm3
441    vmovq        rax,xmm4
442    jmp          Lcosf_exit_s
443
444ALIGN 16
445Lcosf_reduce_precise:
446
447    vmovq        xmm0,r9               ; r9 <-- |x|
448    cmp          r9,L_large_x_fma3
449    jge          Lcos_remainder_piby2
450
451    ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
452    ; have the following conventions:
453    ; on input
454    ;   x is in xmm0
455    ; on output
456    ;   r is in xmm0
457    ;   rr is in xmm1
458    ;   region is in rax
459    ; The _bdl routine is guaranteed not to touch r10
460
461Lcos_remainder_piby2_small: ;; unused label
462    ; Boldo-Daumas-Li reduction for reasonably small |x|
463    call         __remainder_piby2_fma3_bdl
464    jmp          Lcosf_exit_s
465
466ALIGN 16
467Lcos_remainder_piby2:
468    ; argument reduction for general x
469    call         __remainder_piby2_fma3
470Lcosf_exit_s:
471    bt           rax,0
472    jnc          Lcosf_piby4_compute
473
474;sinf_piby4_compute:
475;   vmovapd      xmm1,__Lsinfarray+010h
476    vmovsd       xmm1,__Lsinfarray+010h
477    vmulsd       xmm3,xmm0,xmm0
478    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
479    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
480    vfmadd213sd  xmm1,xmm3,__Lsinfarray
481    vmulsd       xmm3,xmm0,xmm3                 ; xmm3 <-- x^3
482    vfmadd231sd  xmm0,xmm1,xmm3
483    jmp          Lcosf_fma3_adjust_sign
484
485ALIGN 16
486Lcosf_piby4_compute:
487    vmovapd      xmm2,L_real_one
488    vmulsd       xmm3,xmm0,xmm0
489    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
490    vsubsd       xmm2,xmm2,xmm1
491    vmovsd       xmm1,__Lcosfarray+018h
492    vfmadd231sd  xmm1 ,xmm3,__Lcosfarray+020h
493    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+010h
494    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+008h
495    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
496    vmovdqa      xmm0, xmm2
497    vfmadd231sd  xmm0 ,xmm1,xmm3
498
499Lcosf_fma3_adjust_sign:
500    ; assuming FMA3 ==> AVX ==> SSE4.1
501;    vpcmpeqq     xmm1,xmm4,XMMWORD PTR L_int_one
502;    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
503;    vorpd        xmm3,xmm2,xmm1
504
505;    vandpd       xmm3,xmm3,L_signbit
506
507    add          rax,1                    ; 1,2 --> 2,3
508    shr          rax,1                    ; 2,3 --> 1
509    shl          rax,63                   ; 1 --> sign bit
510    vmovq        xmm3,rax
511
512    vxorpd       xmm0,xmm0,xmm3
513
514Lcosf_fma3_return:
515    vcvtsd2ss    xmm0,xmm0,xmm0
516    StackDeallocate stack_size
517    ret
518
519Lcosf_fma3_naninf:
520    call         fname_special
521    StackDeallocate stack_size
522    ret
523
524fname  endp
525END
526