xref: /reactos/sdk/lib/crt/math/libm_sse2/tan.asm (revision 299e4305)
1;
2;
3; MIT License
4; -----------
5;
6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
7;
8; Permission is hereby granted, free of charge, to any person obtaining a copy
9; of this Software and associated documentaon files (the "Software"), to deal
10; in the Software without restriction, including without limitation the rights
11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12; copies of the Software, and to permit persons to whom the Software is
13; furnished to do so, subject to the following conditions:
14;
15; The above copyright notice and this permission notice shall be included in
16; all copies or substantial portions of the Software.
17;
18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24; THE SOFTWARE.
25;
26; An implementation of the tan function.
27;
28; Prototype:
29;
30;     double tan(double x);
31;
32;   Computes tan(x).
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Based on the NAG C implementation.
36;
37; If FMA3 hardware is present, it will be used for the calculation.
38;
39
40.const
41ALIGN 16
42L_signbit      DQ 08000000000000000h
43               DQ 08000000000000000h ; duplicate for pd
44
45L_sign_mask    DQ 07FFFFFFFFFFFFFFFh
46               DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd
47
48L_int_one      DQ 00000000000000001h
49               DQ 00000000000000001h ; duplicate for pd
50
51L_twobypi      DQ 03FE45F306DC9C883h
52               DQ 03FE45F306DC9C883h ; duplicate for pd
53
54L_point_333    DQ 03FD5555555555555h; 1/3
55               DQ 03FD5555555555555h ; duplicate for pd
56
57L_tan_p0       DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0
58               DQ 03FD7D50F6638564Ah ; duplicate for pd
59
60L_tan_p2       DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1
61               DQ 0BF977C24C7569ABBh ; duplicate for pd
62
63L_tan_p4       DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3
64               DQ 03F2D5DAF289C385Ah ; duplicate for pd
65
66L_tan_q0       DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1
67               DQ 03FF1DFCB8CAA40B8h ; duplicate for pd
68
69L_tan_q2       DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0
70               DQ 0BFE08046499EB90Fh ; duplicate for pd
71
72L_tan_q4       DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1
73               DQ 03F9AB0F4F80A0ACFh ; duplicate for pd
74
75L_tan_q6       DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3
76               DQ 0BF2E7517EF6D98F8h ; duplicate for pd
77
78L_half_mask    DQ 0ffffffff00000000h
79               DQ 0ffffffff00000000h ; duplicate for pd
80
81L_piby4_lead   DQ 03FE921FB54442D18h ; pi/4, high part
82               DQ 03FE921FB54442D18h ; duplicate for pd
83
84L_piby4_tail   DQ 03C81A62633145C06h ; pi/4, low parft
85               DQ 03C81A62633145C06h ; duplicate for pd
86
87; Different parts of argument reduction need different versions of pi/2
88
89L_piby2_1      DQ 03FF921FB54400000h ; pi/2, high 33 bits
90L_piby2_1tail  DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps...
91L_piby2_2      DQ 03DD0B4611A600000h ; pi/2, second 33 bits
92L_piby2_2tail  DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps...
93L_piby2_3      DQ 03BA3198A2E000000h ; pi/2, third 33 bits
94L_piby2_3tail  DQ 0397B839A252049C1h ; pi/2, fourth 53 bits
95
96; end of pi/2 versions
97
98L_two_to_neg_27     DQ 03e40000000000000h ; 2^-27
99L_two_to_neg_13     DQ 03f20000000000000h ; 2^-13
100
101L_inf_mask_64  DQ 07FF0000000000000h
102L_point_five   DQ 03FE0000000000000h
103L_point_68     DQ 03FE5C28F5C28F5C3h ; .68
104L_n_point_68   DQ 0BFE5C28F5C28F5C3h ; -.68
105
106L_zero         DQ -0000000000000000h ; 0.0
107L_one          DQ 03FF0000000000000h ; 1.0
108L_n_one        DQ 0BFF0000000000000h ; -1.0
109L_two          DQ 04000000000000000h ; 2.0
110
111L_moderate_arg_cw  DQ 0411E848000000000h ;  5.e5
112L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
113
114fname         TEXTEQU <tan>
115fname_special TEXTEQU <_tan_special>
116
117; local storage offsets
118save_xmm6       EQU 020h
119save_xmm7       EQU 030h
120store_input     EQU 040h
121save_r10        EQU 050h
122dummy_space     EQU 060h
123stack_size      EQU 088h
124
125include fm.inc
126
127EXTERN           __use_fma3_lib:DWORD
128EXTERN           fname_special      : PROC
129EXTERN           __remainder_piby2_fma3 : PROC
130EXTERN           __remainder_piby2_fma3_bdl : PROC
131EXTERN           __remainder_piby2_forAsm : PROC
132EXTERN           _set_statfp   : PROC
133
134.code
135ALIGN 16
136PUBLIC fname
137fname PROC FRAME
138    StackAllocate stack_size
139    SaveXmm      xmm6, save_xmm6
140    SaveXmm      xmm7, save_xmm7
141    .ENDPROLOG
142    cmp          DWORD PTR __use_fma3_lib, 0
143    jne          Ltan_fma3
144
145Ltan_sse2:
146    movd      rdx, xmm0                          ; really movq
147    movaps    xmm6, xmm0
148    mov       rcx, rdx
149    btr       rcx, 63                            ; rcx <-- |x|
150
151    cmp       rcx, L_piby4_lead
152    ja        Ltan_abs_x_nle_pio4               ; branch if > pi/4 or NaN
153
154
155    cmp       rcx, L_two_to_neg_13
156    jae       Ltan_abs_x_ge_two_to_neg_13
157
158    cmp       rcx, L_two_to_neg_27
159    jae       Labs_x_ge_two_to_neg_27
160
161    ; At this point tan(x) ~= x; if it's not exact, set the inexact flag
162
163    test      rcx, rcx
164    je        Ltan_return
165
166    mov       ecx, 20h                    ; ecx <-- AMD_F_INEXACT
167    call      _set_statfp
168    movaps    xmm0, xmm6                  ; may be redundant, but xmm0 <-- x
169
170    RestoreXmm   xmm7, save_xmm7
171    RestoreXmm   xmm6, save_xmm6
172    StackDeallocate stack_size
173    ret       0
174
175Labs_x_ge_two_to_neg_27:
176
177    mulsd     xmm0, xmm0
178    mulsd     xmm0, xmm6
179    mulsd     xmm0, QWORD PTR L_point_333
180
181    addsd     xmm0, xmm6
182
183    RestoreXmm   xmm7, save_xmm7
184    RestoreXmm   xmm6, save_xmm6
185    StackDeallocate stack_size
186    ret       0
187
188Ltan_abs_x_ge_two_to_neg_13:
189    xorps     xmm1, xmm1                  ; xmm1 <-- xx = 0
190    xor       r8d, r8d                    ; r8 <-- recip flag = 0
191    call      _tan_piby4
192
193Ltan_return:
194    RestoreXmm   xmm7, save_xmm7
195    RestoreXmm   xmm6, save_xmm6
196    StackDeallocate stack_size
197    ret       0
198
199Ltan_abs_x_nle_pio4:
200
201    cmp       rcx, L_inf_mask_64          ; |x| uint >= +inf as uint ?
202    jnae       Ltan_x_is_finite
203
204    call      fname_special
205    RestoreXmm   xmm7, save_xmm7
206    RestoreXmm   xmm6, save_xmm6
207    StackDeallocate stack_size
208    ret
209
210ALIGN 16
211Ltan_x_is_finite:
212    xor       r8d, r8d
213    xor       r10, r10
214    cmp       rcx, rdx
215    setne     r10b                         ; r10 <-- x was negative flag
216    andpd     xmm6, L_sign_mask
217
218    movsd   xmm0, QWORD PTR L_moderate_arg_cw        ; currently 5e5
219    comisd  xmm0, xmm6
220    jbe     Ltan_x_is_very_large
221
222Ltan_x_is_moderate:                                  ; unused label
223
224    ; For these arguments we do a Cody-Waite reduction, subtracting the
225    ; appropriate multiple of pi/2, using extra precision where x is close
226    ; to an exact multiple of pi/2
227    ; We special-case region setting for |x| <= 9pi/4
228    ; It seems strange that this speeds things up, but it does
229
230    mov       rdx, rcx
231
232    mov       rax, 4616025215990052958           ; 400f6a7a2955385eH (5pi/4)
233    shr       rdx, 52                            ; rdx <-- xexp
234    cmp       rcx, rax
235    ja        Labs_x_gt_5pio4
236
237    mov       rax, 4612488097114038738           ; 4002d97c7f3321d2H (3pi/4)
238    cmp       rcx, rax
239    seta      r8b
240    inc       r8d                                ; r8d <-- region (1 or 2)
241    jmp       Lhave_region
242
243Labs_x_gt_5pio4:
244    mov       rax, 4619644535898419899           ; 401c463abeccb2bbH (9pi/4)
245    cmp       rcx, rax
246    ja        Lneed_region_computation
247    mov       rax, 4617875976460412789           ; 4015fdbbe9bba775H (7pi/4)
248    cmp       rcx, rax
249    seta      r8b
250    add       r8d, 3                             ; r8d <-- region (3 or 4)
251    jmp       Lhave_region
252
253ALIGN 16
254Lneed_region_computation:
255    movaps    xmm0, xmm6
256    mulsd     xmm0, QWORD PTR L_twobypi
257    addsd     xmm0, QWORD PTR L_point_five
258    cvttsd2si r8d, xmm0                          ; r8d <-- region
259
260Lhave_region:
261    movd      xmm3, r8d
262    cvtdq2pd  xmm3, xmm3
263
264    movaps    xmm2, xmm3
265    movaps    xmm0, xmm3
266    mulsd     xmm0, QWORD PTR L_piby2_1
267    mulsd     xmm2, QWORD PTR L_piby2_1tail ; xmm2 < rtail = npi2 * piby2_1tail
268    subsd     xmm6, xmm0                    ; xmm6 <-- rhead = x - npi2*piby2_1
269
270    ; If x is not too close to multiple of pi/2,
271    ; we're essentially done with reduction
272    ; If the exponent of rhead is not close to that of x,
273    ; then most of x has been subtracted away in computing rhead;
274    ; i.e., x is close to a multiple of pi/2.
275
276    movd      rax, xmm6
277    shr       rax, 52
278    and       eax, 2047
279    sub       rdx, rax                      ; rdx <-- exp diff of x vs rhead
280
281    cmp       rdx, 15
282    jbe       Ltan_have_rhead_rtail
283
284    ; Oops, x is almost a multiple of pi/2.  Compute more bits of reduced x
285
286    ;   t = rhead;
287    ;   rtail = npi2 * piby2_2;
288    ;   rhead  = t - rtail;
289    ;   rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
290
291    movaps    xmm1, xmm6
292    movaps    xmm0, xmm3
293
294    movaps    xmm2, xmm3
295    mulsd     xmm0, QWORD PTR L_piby2_2
296    mulsd     xmm2, QWORD PTR L_piby2_2tail
297    subsd     xmm6, xmm0
298    subsd     xmm1, xmm6
299    subsd     xmm1, xmm0
300    subsd     xmm2, xmm1
301
302    cmp       rdx, 48
303    jbe       Ltan_have_rhead_rtail         ; We've done enough
304
305    ; Wow, x is REALLY close to a multiple of pi/2.  Compute more bits.
306
307    ;   t = rhead;
308    ;   rtail = npi2 * piby2_3;
309    ;   rhead  = t - rtail;
310    ;   rtail  = npi2 * piby2_3tail - ((t - rhead) - rtail);
311
312    movaps    xmm1, xmm6
313    movaps    xmm0, xmm3
314    movaps    xmm2, xmm3
315    mulsd     xmm0, QWORD PTR L_piby2_3
316    mulsd     xmm2, QWORD PTR L_piby2_3tail
317    subsd     xmm6, xmm0                    ; xmm6 <-- rhead = t - rtail
318    subsd     xmm1, xmm6                    ; xmm1 <-- t - rhead
319    subsd     xmm1, xmm0                    ; xmm1 <-- ((t - rhead) - rtail)
320    subsd     xmm2, xmm1                    ; xmm2 <-- final rtail
321
322Ltan_have_rhead_rtail:
323
324    ; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail
325    movaps  xmm0, xmm6                      ; xmm0 <-- copy of rhead
326
327    ;   r = rhead - rtail
328    ;   rr = (rhead - r) - rtail;
329    ;   region = npi2 & 3;
330
331    and       r8d, 3                        ; r8d <-- region
332    subsd     xmm0, xmm2                    ; xmm0 <-- r = rhead - rtail
333    subsd     xmm6, xmm0                    ; xmm6 <-- rhead - r
334    subsd     xmm6, xmm2                    ; xmm6 <-- rr = (rhead - r) - rtail
335
336Ltan_do_tan_computation:
337    and       r8d, 1                        ; r8d <-- region & 1
338    movaps    xmm1, xmm6
339    call      _tan_piby4
340    test      r10d, r10d
341    je        Ltan_pos_return
342    xorpd     xmm0, QWORD PTR L_signbit
343Ltan_pos_return:
344    RestoreXmm   xmm7, save_xmm7
345    RestoreXmm   xmm6, save_xmm6
346    StackDeallocate stack_size
347    ret       0
348
349ALIGN 16
350Ltan_x_is_very_large:
351    ; Reduce x into range [-pi/4,pi/4] (general case)
352    movaps    xmm0, xmm6
353    mov       QWORD PTR [rsp+save_r10], r10
354    call      __remainder_piby2_forAsm      ; this call clobbers r10
355    mov       r10, QWORD PTR [rsp+save_r10]
356    movapd    xmm6,xmm1                     ; xmm6 <-- rr
357    mov       r8d,eax                       ; r8d <-- region
358    jmp       Ltan_do_tan_computation
359
360;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
361;
362; From here on, it is assumed that the hardware supports FMA3 (and AVX).
363
364ALIGN 16
365Ltan_fma3:
366    vmovq        r9,xmm0
367    mov          rdx,r9               ; rdx <-- x
368    btr          r9,63                ; r9 <-- |x|
369    cmp          r9,L_piby4_lead
370    jae          Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch
371
372Ltan_fma3_absx_le_pio4:
373    ; no argument reduction is needed, so recip is 0, xx is 0.
374    ; Note that this routine is not special-casing very small |x|
375    vmovsd       xmm5,L_piby4_lead
376    vmovsd       xmm6,L_piby4_tail
377    vxorpd       xmm1,xmm1,xmm1        ; xx <-- 0.
378    vxorpd       xmm7,xmm7,xmm7        ; transform <-- 0
379    comisd       xmm0,L_point_68
380    jbe          Ltan_fma3_small_x_le_point_68
381Ltan_fma3_x_small_gt_point_68:
382    vmovsd       xmm7,L_one            ; xmm7 <-- transform = 1.0
383    vsubsd       xmm0,xmm5,xmm0        ; x = piby4_lead - x
384    vaddsd       xmm0,xmm0,xmm6        ; xmm0 <-- x = x + xl = x + piby4_tail
385    jmp          Ltan_fma3_compute_Remez_for_small_x
386ALIGN 16
387Ltan_fma3_small_x_le_point_68:
388    comisd       xmm0,L_n_point_68
389    jae          Ltan_fma3_compute_Remez_for_small_x
390Ltan_fma3_small_x_lt_neg_point_68:
391    vmovsd       xmm7,L_n_one          ; xmm7 <-- transform = -1.0
392    vaddsd       xmm0,xmm5,xmm0        ; x = piby4_lead + x
393    vaddsd       xmm0,xmm0,xmm6        ; xmm0 <-- x = x + xl = x + piby4_tail
394Ltan_fma3_compute_Remez_for_small_x:
395    ; At this point xmm0 holds x, possibly transformed
396
397    ; now do core Remez rational approximation for x in [0,0.68]
398    vmovsd       xmm4,L_tan_q6
399    vmovsd       xmm3,L_tan_p4
400    vmulsd       xmm2,xmm0,xmm0        ; xx is 0, so xmm2 <-- r = x*x
401    vfmadd213sd  xmm4,xmm2,L_tan_q4
402    vfmadd213sd  xmm3,xmm2,L_tan_p2
403    vfmadd213sd  xmm4,xmm2,L_tan_q2
404    vfmadd213sd  xmm3,xmm2,L_tan_p0    ; xmm3 <-- p2 (polynomial)
405    vfmadd213sd  xmm4,xmm2,L_tan_q0    ; xmm4 <-- q3 (polynomial)
406    vdivsd       xmm3,xmm3,xmm4        ; xmm3 <-- r3 = p2/q3
407    vmulsd       xmm3,xmm3,xmm2        ; xmm3 <-- r * r3
408    vfmadd132sd  xmm0,xmm0,xmm3        ; xx = 0, so xmm0 <-- t = x + x*(r*r3)
409    comisd       xmm7,L_zero           ; did we transform x?
410    ; if x was transformed, we need to transform t to get answer;
411    ; if not, the answer is just t.
412    je           Ltan_fma3_ext_piby4_zero
413
414    ; x was transformed, so answer is +- (1. - 2.*t/(1.+t))
415    ; (remember recip is 0 here)
416    vmovsd       xmm3,L_one
417    vaddsd       xmm4,xmm0,L_one       ; xmm4 <-- 1. + t
418    vdivsd       xmm6,xmm0,xmm4        ; xmm6 <-- t / (1.+t)
419    vfnmadd231sd xmm3,xmm6,L_two       ; xmm3 <-- 1. - 2.*t/(1.+t)
420    vmulsd       xmm0,xmm3,xmm7        ; multiply by +- 1.
421
422Ltan_fma3_ext_piby4_zero:
423    ; restore volatile registers
424    AVXRestoreXmm xmm7, save_xmm7
425    AVXRestoreXmm xmm6, save_xmm6
426    StackDeallocate stack_size
427    ret          0
428
429ALIGN 16
430Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4
431    cmp          r9, L_inf_mask_64
432    jae          Ltan_fma3_naninf
433
434;Ltan_fma3_range_reduce:
435    vmovapd      [store_input + rsp],xmm0 ; save copy of x
436    vmovq        xmm0,r9               ; xmm0l <-- |x|
437    cmp          r9,L_moderate_arg_bdl
438    jge          Ltan_fma3_remainder_piby2  ; go elsewhere if |x| > 500000.
439
440    ; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
441    ; have calling conventions that differ from the C routine
442    ; on input
443    ;   |x| is in xmm0
444    ; on output
445    ;   z is in xmm0
446    ;   zz is in xmm1
447    ;   where z + zz = arg reduced |x| and zz is small compared to z
448    ;   region of |x| is in rax
449
450 Ltan_fma3_remainder_piby2_small:
451    ; Boldo-Daumas-Li reduction for reasonably small |x|
452    call         __remainder_piby2_fma3_bdl
453
454
455Ltan_fma3_full_computation:
456    ; we have done argument reduction; recip and xx may be nonzero
457    ; x is in xmm0, xx is in xmm1
458    ; recip is region & 1, and region is in rax.
459
460    vmovsd       xmm5,L_piby4_lead
461    vmovsd       xmm6,L_piby4_tail
462
463    vxorpd       xmm7,xmm7,xmm7        ; transform <-- 0
464    vcomisd      xmm0,L_point_68
465    jbe          Ltan_fma3_full_x_le_point_68
466Ltan_fma3_full_x_gt_point_68:
467    vmovsd       xmm7,L_one            ; xmm7 <-- transform = 1.0
468    vsubsd       xmm0,xmm5,xmm0        ; xmm0 <-- x = piby4_lead - x
469    vsubsd       xmm2,xmm6,xmm1        ; xmm2 <-- xl = pibi4_tail - xx
470    vaddsd       xmm0,xmm0,xmm2        ; xmm0 <-- x = x + xl
471    vxorps       xmm1,xmm1,xmm1        ; xmm1 <-- xx  = 0
472    jmp          Ltan_fma3_compute_Remez
473ALIGN 16
474Ltan_fma3_full_x_le_point_68:
475    vcomisd      xmm0,L_n_point_68
476    jae          Ltan_fma3_compute_Remez
477Ltan_fma3_full_x_lt_neg_point_68:
478    vmovsd       xmm7,L_n_one          ; xmm7 <-- transform = -1.0
479    vaddsd       xmm0,xmm5,xmm0        ; x = piby4_lead + x
480    vaddsd       xmm2,xmm6,xmm1        ; xmm2 <-- xl = piby4_tail + xx
481    vaddsd       xmm0,xmm0,xmm2        ; xmm0 <-- x = x + xl
482    vxorps       xmm1,xmm1,xmm1        ; xmm1 <-- xx  = 0
483
484Ltan_fma3_compute_Remez:
485    vmulsd       xmm2,xmm0,xmm0           ; xmm2 <-- x*x
486    vmulsd       xmm5,xmm1,xmm0           ; xmm5 <-- x*xx
487    vfmadd132sd  xmm5,xmm2,L_two          ; xmm5 <-- r = x*x + 2.*x*xx
488    vmovsd       xmm2,L_tan_p4
489    vfmadd213sd  xmm2,xmm5,L_tan_p2       ; xmm2 <-- p4*r+p2
490    vfmadd213sd  xmm2,xmm5,L_tan_p0       ; xmm2 <-- p = (p4*r+p2)*r+p0
491    vmovsd       xmm4,L_tan_q6
492    vfmadd213sd  xmm4,xmm5,L_tan_q4       ; xmm4 <-- q6*r+q4
493    vfmadd213sd  xmm4,xmm5,L_tan_q2       ; xmm4 <-- (q6*r+q4)*r+q2
494    vfmadd213sd  xmm4,xmm5,L_tan_q0       ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0
495    vdivsd       xmm2,xmm2,xmm4           ; xmm2 <-- p/q
496    vmulsd       xmm2,xmm2,xmm5           ; xmm2 <-- r*p/q
497    vfmadd213sd  xmm2,xmm0,xmm1           ; xmm2 <-- t2 = xx + x*r*(p/q)
498    vaddsd       xmm1,xmm0,xmm2           ; xmm1 <-- t = (t1=x) + t2
499
500    ; If |x| > .68 we transformed, and t is an approximation of
501    ; tan(pi/4 +- (x+xx))
502    ; otherwise, t is just tan(x+xx)
503    vxorpd       xmm6,xmm6,xmm6
504    vcomisd      xmm7,xmm6                ; did we transform? (|x| > .68) ?
505    jz           Ltan_fma3_if_recip_set   ; if not, go check recip
506
507Ltan_fma3_if_transfor_set:
508    ; Because we transformed x+xx, we have to transform t before returning
509    ; let transform be 1 for x > .68, -1 for x < -.68, then we return
510    ; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t)))
511    vaddsd       xmm6,xmm1,xmm1           ; xmm6 <-- 2.*t
512    vmovsd       xmm4,L_one
513    vaddsd       xmm2,xmm1,xmm4           ; xmm2 <-- t+1
514    vsubsd       xmm5,xmm1,xmm4           ; xmm5 <-- t-1
515    bt           rax,0
516    jc           Ltan_fma3_transform_and_recip_set
517    ; here recip is not set
518    vaddsd       xmm2,xmm1,xmm4           ; xmm2 <-- t+1
519    vdivsd       xmm2,xmm1,xmm2           ; xmm2 <-- t/(t+1)
520    vfnmadd132sd xmm2,xmm4,L_two          ; xmm2 <-- 1 - 2*t/(t+1)
521    vmulsd       xmm1,xmm2,xmm7           ; xmm1 <-- transform*(1 - 2*t/(t+1))
522    jmp          Ltan_fma3_exit_piby4
523ALIGN 16
524Ltan_fma3_transform_and_recip_set:
525    ; here recip is set
526    vsubsd       xmm2,xmm1,xmm4           ; xmm2 <-- t-1
527    vdivsd       xmm2,xmm1,xmm2           ; xmm2 <-- t/(t-1)
528    vfmsub132sd  xmm2,xmm4,L_two          ; xmm2 <-- 2*t/(t-1) - 1
529    vmulsd       xmm1,xmm2,xmm7           ; xmm1 <-- transform*(2*t/(t-1) - 1)
530    jmp          Ltan_fma3_exit_piby4
531
532ALIGN 16
533Ltan_fma3_if_recip_set:
534    ; Here we did not transform x and xx, but if we are in an odd quadrant
535    ; we will need to return -1./(t1+t2), computed accurately
536    ; (t=t1 is in xmm1, t2 is in xmm2)
537    bt           rax,0
538    jnc          Ltan_fma3_exit_piby4
539
540    vandpd       xmm7,xmm1,L_half_mask    ; xmm7 <-- z1 = high bits of t
541    vsubsd       xmm4,xmm7,xmm0           ; xmm4 <-- z1 - t1
542    vsubsd       xmm4,xmm2,xmm4           ; xmm4 <-- z2 = t2 - (z1-t1)
543    vmovsd       xmm2,L_n_one
544    vdivsd       xmm2,xmm2,xmm1           ; xmm2 <-- trec = -1./t
545    vandpd       xmm5,xmm2,L_half_mask    ; xmm5 <-- trec_top=high bits of trec
546    vfmadd213sd  xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1.
547    vfmadd231sd  xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.)
548    vfmadd213sd  xmm7,xmm2,xmm5  ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.))
549    vmovapd      xmm1,xmm7       ; xmm1 <-- u
550
551Ltan_fma3_exit_piby4:
552    vmovapd      xmm0,xmm1        ; xmm0 <-- t, u, or v, as needed
553
554    vmovapd      xmm1,[store_input + rsp]
555    vandpd       xmm1,xmm1,L_signbit
556    vxorpd       xmm0,xmm0,xmm1           ; tan(-x) = -tan(x)
557
558    ; restore volatile registers
559    AVXRestoreXmm   xmm7, save_xmm7
560    AVXRestoreXmm   xmm6, save_xmm6
561    StackDeallocate stack_size
562    ret
563
564ALIGN 16
565Ltan_fma3_remainder_piby2:
566    ; argument reduction for general x
567
568    call         __remainder_piby2_fma3
569    jmp          Ltan_fma3_full_computation
570
571
572Ltan_fma3_naninf: ; here argument is +-Inf or NaN.  Special case.
573    call         fname_special
574    AVXRestoreXmm   xmm7, save_xmm7
575    AVXRestoreXmm   xmm6, save_xmm6
576    StackDeallocate stack_size
577    ret
578
579fname endp
580
581;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
582
583.const
584tan_piby4_save_xmm6  EQU 030h
585tan_piby4_stack_size EQU 048h
586.code
587ALIGN 16
588_tan_piby4 PROC PRIVATE FRAME
589    StackAllocate tan_piby4_stack_size
590    SaveXmm      xmm6, tan_piby4_save_xmm6
591    .ENDPROLOG
592
593    ; Compute tangent for x+xx in [-pi/4,pi/4].
594    ; xmm0 has x
595    ; xmm1 has xx
596    ; r8d has recip.  If recip is true, return -1/tan(x+xx) else tan(x+xx)
597
598    xor       eax, eax
599
600    comisd    xmm0, QWORD PTR L_point_68
601    movaps    xmm3, xmm1
602    movaps    xmm6, xmm0
603    jbe       Ltan_piby4_x_le_point_68
604
605    ; Here x > .68, so we transform x using the identity
606    ;   tan(pi/4-x) = (1-tan(x))/(1+tan(x))
607
608    movsd     xmm2, QWORD PTR L_piby4_lead
609    mov       eax, 1                        ; eax <-- transform = 1
610    subsd     xmm2, xmm0                    ; xmm2 <-- x = piby4_lead - x
611    movsd     xmm0, QWORD PTR L_piby4_tail
612    subsd     xmm0, xmm1                    ; xmm0 <-- xl = piby4_tail - xx
613    movaps    xmm6, xmm2
614    addsd     xmm6, xmm0                    ; xmm6 <-- x = x + xl
615    xorps     xmm3,xmm3                     ; xmm3 <-- xx = 0.
616    jmp       Ltan_piby4_do_remez
617
618Ltan_piby4_x_le_point_68:
619; 43   :   else if (x < -0.68)
620
621    movsd     xmm0, QWORD PTR L_n_point_68
622    comisd    xmm0, xmm6
623    jbe       Ltan_piby4_do_remez           ; jump if x >= -.68
624
625    ; Here x < -.68, so we transform x using the identity
626    ; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1)
627
628    addsd     xmm6, QWORD PTR L_piby4_lead  ; xmm6 <-- x = piby4_lead + x
629    addsd     xmm3, QWORD PTR L_piby4_tail  ; xmm3 <-- xl = piby4_tail + xx
630    or        eax, -1                       ; eax <-- transform = -1
631    addsd     xmm6, xmm3                    ; xmm6 <-- x = x + xl
632    xorps     xmm3, xmm3                    ; xmm3 <-- xx = 0
633
634Ltan_piby4_do_remez:
635
636    ; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
637    movaps    xmm0, xmm6
638    movaps    xmm2, xmm6;
639; An implementation of the tan function.
640;
641; Prototype:
642;
643;     double tan(double x);
644;
645;   Computes tan(x).
646;   It will provide proper C99 return values,
647;   but may not raise floating point status bits properly.
648;   Based on the NAG C implementation.
649;
650;
651
652    mulsd     xmm0, xmm6                    ; xmm0 <-- x*x
653    addsd     xmm2, xmm2                    ; xmm2 <-- 2*x
654    mulsd     xmm2, xmm3                    ; xmm2 <-- 2*x*xx
655    addsd     xmm2, xmm0                    ; xmm2 <-- r = x*x + 2*x*xx
656
657    ; Magic Remez approximation
658    movaps    xmm0, xmm2
659    movaps    xmm5, xmm2
660    movaps    xmm1, xmm2
661    mulsd     xmm5, QWORD PTR L_tan_p4
662    mulsd     xmm1, QWORD PTR L_tan_q6
663    mulsd     xmm0, xmm6
664    addsd     xmm5, QWORD PTR L_tan_p2
665    mulsd     xmm5, xmm2
666    addsd     xmm5, QWORD PTR L_tan_p0
667    mulsd     xmm5, xmm0
668    movsd     xmm0, QWORD PTR L_tan_q4
669    addsd     xmm0, xmm1
670    mulsd     xmm0, xmm2
671    addsd     xmm0, QWORD PTR L_tan_q2
672    mulsd     xmm0, xmm2
673    addsd     xmm0, QWORD PTR L_tan_q0
674    divsd     xmm5, xmm0
675    addsd     xmm5, xmm3                    ; xmm5 <-- t2
676
677    test      eax, eax
678    je        Ltan_piby4_transform_false
679
680    addsd     xmm5, xmm6                    ; xmm5 <-- t = t1 + t2 = x + t2
681
682    test      r8d, r8d
683    je        Ltan_piby4_transform_true_recip_false
684
685    ; Here transform and recip are both true.
686    ;   return transform*(2*t/(t-1) - 1.0);
687
688    movaps    xmm0, xmm5
689    subsd     xmm5, QWORD PTR L_one
690    movd      xmm1, eax
691    addsd     xmm0, xmm0
692    divsd     xmm0, xmm5
693    cvtdq2pd  xmm1, xmm1
694    subsd     xmm0, QWORD PTR L_one
695    mulsd     xmm0, xmm1
696    RestoreXmm      xmm6, tan_piby4_save_xmm6
697    StackDeallocate tan_piby4_stack_size
698    ret       0
699
700Ltan_piby4_transform_true_recip_false:
701    ; Here return transform*(1.0 - 2*t/(1+t));
702    movsd     xmm0, QWORD PTR L_one
703    movaps    xmm1, xmm5
704    addsd     xmm5, xmm0
705    addsd     xmm1, xmm1
706    divsd     xmm1, xmm5
707    subsd     xmm0, xmm1
708    movd      xmm1, eax
709    cvtdq2pd  xmm1, xmm1
710    mulsd     xmm0, xmm1
711    RestoreXmm      xmm6, tan_piby4_save_xmm6
712    StackDeallocate tan_piby4_stack_size
713    ret       0
714
715Ltan_piby4_transform_false:
716    test      r8d, r8d
717    je        Ltan_piby4_atransform_false_recip_false
718
719    ; Here transform is false but recip is true
720    ; We return an accurate computation of -1.0/(t1 + t2).
721
722    movsd     xmm4, QWORD PTR L_n_one
723    movaps    xmm0, xmm5
724    mov       rcx, -4294967296              ; ffffffff00000000H
725    addsd     xmm0, xmm6
726    movd      rax, xmm0                     ; really movq
727    divsd     xmm4, xmm0
728    and       rax, rcx
729    movd      xmm3, rax                     ; really movq
730    movaps    xmm1, xmm3
731    subsd     xmm1, xmm6
732
733    movd      rax, xmm4                     ; really movq
734    subsd     xmm5, xmm1
735
736    and       rax, rcx
737    movd      xmm2, rax                     ; really movq
738
739    ;   return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);
740
741    movaps    xmm0, xmm2
742    mulsd     xmm5, xmm2
743    mulsd     xmm0, xmm3
744    addsd     xmm0, QWORD PTR L_one
745    addsd     xmm0, xmm5
746    mulsd     xmm0, xmm4
747    addsd     xmm0, xmm2
748
749    RestoreXmm      xmm6, tan_piby4_save_xmm6
750    StackDeallocate tan_piby4_stack_size
751    ret       0
752
753Ltan_piby4_atransform_false_recip_false:
754    ; Here both transform and recip are false; we just return t1 + t2
755    addsd     xmm5, xmm6
756    movaps    xmm0, xmm5
757    RestoreXmm      xmm6, tan_piby4_save_xmm6
758    StackDeallocate tan_piby4_stack_size
759    ret       0
760
761_tan_piby4 endp
762END
763