xref: /reactos/sdk/lib/crt/math/libm_sse2/tanf.asm (revision 4afb647c)
1;
2;
3; MIT License
4; -----------
5;
6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
7;
8; Permission is hereby granted, free of charge, to any person obtaining a copy
9; of this Software and associated documentaon files (the "Software"), to deal
10; in the Software without restriction, including without limitation the rights
11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12; copies of the Software, and to permit persons to whom the Software is
13; furnished to do so, subject to the following conditions:
14;
15; The above copyright notice and this permission notice shall be included in
16; all copies or substantial portions of the Software.
17;
18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24; THE SOFTWARE.
25;
26; An implementation of the tanf function using the fma3 instruction.
27;
28; Prototype:
29;
30;     float tanf(float x);
31;
32;   Computes tanf(x).
33;   It will provide proper C99 return values,
34;   but may not raise floating point status bits properly.
35;   Based on the NAG C implementation.
36;
37.const
38ALIGN 16
39L_sign_mask    DQ 07FFFFFFFFFFFFFFFh
40               DQ 07FFFFFFFFFFFFFFFh
41L_twobypi      DQ 03FE45F306DC9C883h
42               DQ 03FE45F306DC9C883h
43L_int_three    DQ 00000000000000003h
44               DQ 00000000000000003h
45L_int_one      DQ 00000000000000001h
46               DQ 00000000000000001h
47L_signbit      DQ 08000000000000000h
48               DQ 08000000000000000h
49
50L_tanf         DQ 03FD8A8B0DA56CB17h    ; c0
51               DQ 0BF919DBA6EFD6AADh    ; c1
52               DQ 03FF27E84A3E73A2Eh    ; d0
53               DQ 0BFE07266D7B3511Bh    ; d1
54               DQ 03F92E29003C692D9h    ; d2
55
56L_large_x_sse2 DQ 04160000000000000h    ; 8388608.
57L_large_x_fma3 DQ 041E921FB40000000h    ; 3.373259264e9
58L_point_333    DQ 03FD5555555555555h
59L_mask_3e4     DQ 03e40000000000000h
60L_mask_3f2     DQ 03f20000000000000h
61L_point_five   DQ 03FE0000000000000h
62L_piby2_1      DQ 03FF921FB54400000h
63L_piby2_1tail  DQ 03DD0B4611A626331h
64L_piby2_lead   DQ 03ff921fb54442d18h
65L_n_one        DQ 0BFF0000000000000h
66L_piby4        DQ 03fe921fb54442d18h
67L_min_norm     DQ 00010000000000000h
68
69
70L_inf_mask_32  DD 07F800000h
71               DD 07F800000h
72
73EXTRN __use_fma3_lib:DWORD
74EXTRN __L_2_by_pi_bits:BYTE
75
76fname           TEXTEQU <tanf>
77fname_special   TEXTEQU <_tanf_special>
78
79; define local variable storage offsets
80; actually there aren't any, but we need to leave room for _tanf_special.
81dummy_space     EQU 20h
82stack_size      EQU 38h
83
84include fm.inc
85
86;Define name and any external functions being called
87EXTERN           fname_special : PROC
88
89.code
90PUBLIC fname
91fname PROC FRAME
92    StackAllocate stack_size
93    .ENDPROLOG
94    cmp          DWORD PTR __use_fma3_lib, 0
95    jne          Ltanf_fma3
96
97Ltanf_sse2:
98    movd         eax,xmm0
99    mov          r8d,L_inf_mask_32
100    and          eax,r8d
101    cmp          eax, r8d
102    jz           Ltanf_sse2_naninf
103
104    cvtss2sd     xmm5,xmm0
105    movd         r9,xmm5
106    btr          r9,63                    ; r9 <-- |x|
107
108    cmp          r9,L_piby4
109    jg           Ltanf_sse2_range_reduce
110    cmp          r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125
111    jge          Ltanf_sse2_compute_tanf_piby_4
112    cmp          r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009
113    jge          Ltanf_sse2_compute_x_xxx_0_333
114    ; At this point tan(x) ~= x; if it's not exact, set the inexact flag.
115
116    test         r9, r9
117    je           Ltanf_sse2_exact_return
118    movsd        xmm1, L_n_one
119    addsd        xmm1, L_min_norm          ; set inexact
120
121Ltanf_sse2_exact_return:
122    StackDeallocate stack_size
123    ret
124
125ALIGN 16
126Ltanf_sse2_compute_x_xxx_0_333:
127    movapd       xmm2,xmm5
128    mulsd        xmm2,xmm2                ; xmm2 <-- x^2
129    movapd       xmm0,xmm2
130    mulsd        xmm0,xmm5                ; xmm0 <-- x^3
131    mulsd        xmm0,L_point_333
132    addsd        xmm0,xmm5    ;  x + x*x*x*0.3333333333333333;
133    jmp          Ltanf_sse2_return_s
134
135ALIGN 16
136Ltanf_sse2_compute_tanf_piby_4:
137    movapd       xmm0,xmm5                ; xmm0 <-- x (as double)
138
139    movapd       xmm1,xmm0
140    mulsd        xmm1,xmm0                ; xmm1 <-- x*x
141
142    movsd        xmm3,L_tanf+008h         ; xmm3 <-- c1
143    mulsd        xmm3,xmm1                ; xmm3 <-- c1*x^2
144    addsd        xmm3,L_tanf              ; xmm3 <-- c = c1*x^2 + c0
145
146    movsd        xmm2,L_tanf+020h         ; xmm2 <-- d2
147    mulsd        xmm2,xmm1                ; xmm2 <-- d2*x^2
148    addsd        xmm2,L_tanf+018h         ; xmm2 <-- d2*x^2 + d1
149    mulsd        xmm2,xmm1                ; xmm2 <-- (d2*x^2 + d1)*x^2
150    addsd        xmm2,L_tanf+010h         ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
151    divsd        xmm3,xmm2                ; xmm3 <-- c/d
152    mulsd        xmm1,xmm0                ; xmm1 <-- x^3
153    mulsd        xmm1,xmm3                ; xmm1 <-- x^3 * c/d
154    addsd        xmm0,xmm1                ; xmm0 <-- x + x^3 * c/d
155    jmp          Ltanf_sse2_return_s
156
157Ltanf_sse2_range_reduce:
158    movd         xmm0,r9
159    cmp          r9,L_large_x_sse2
160    jge          Ltanf_sse2_tanf_reduce_large
161
162Ltanf_sse2_tanf_reduce_moderate:
163    movapd       xmm1,xmm0
164    andpd        xmm1,L_sign_mask
165    movapd       xmm2,L_twobypi
166    mulsd        xmm2,xmm1
167    addsd        xmm2,L_point_five
168    cvttpd2dq    xmm4,xmm2
169    cvtdq2pd     xmm1,xmm4
170    andpd        xmm4,L_int_three    ; xmm4 <-- region
171    movapd       xmm2,xmm0
172
173    movapd       xmm3,xmm1
174    mulsd        xmm1,L_piby2_1
175    subsd        xmm2,xmm1
176    mulsd        xmm3,L_piby2_1tail  ; xmm3 rtail
177    movapd       xmm0,xmm2
178    subsd        xmm0,xmm3
179    subsd        xmm2,xmm0
180    movapd       xmm1,xmm2
181    subsd        xmm1,xmm3
182    jmp          Ltanf_sse2_exit_s
183
184Ltanf_sse2_tanf_reduce_large:
185    lea          r9,__L_2_by_pi_bits
186    ;xexp = (x >> 52) 1023
187    movd         r11,xmm0
188    mov          rcx,r11
189    shr          r11,52
190    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
191    ;calculate the last byte from which to start multiplication
192    ;last = 134 (xexp >> 3)
193    mov          r10,r11
194    shr          r10,3
195    sub          r10,134                  ; r10 <-- -last
196    neg          r10                      ; r10 <-- last
197    ;load 64 bits of 2_by_pi
198    mov          rax,[r9+r10]
199    ;mantissa of x = ((x << 12) >> 12) | implied bit
200    shl          rcx,12
201    shr          rcx,12                   ; rcx <-- mantissa part of input x
202    bts          rcx,52                   ; add the implied bit as well
203    ;load next 128 bits of 2_by_pi
204    add          r10,8                    ; increment to next 8 bytes of 2_by_pi
205    movdqu       xmm0,[r9+r10]
206    ;do three 64bit multiplications with mant of x
207    mul          rcx
208    mov          r8,rax                   ; r8 = last 64 bits of mul = res1[2]
209    mov          r10,rdx                  ; r10 = carry
210    vmovq        rax,xmm0
211    mul          rcx
212    ;resexp = xexp & 7
213    and          r11,7                    ; r11 <-- resexp = last 3 bits of xexp
214    psrldq       xmm0,8
215    add          rax,r10                  ; add the previous carry
216    adc          rdx,0
217    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
218    mov          r10,rdx                  ; r10 <-- carry
219    movd         rax,xmm0
220    mul          rcx
221    add          r10,rax                 ;r10 = most sig 64 bits = res1[0]
222    ;find the region
223    ;last three bits ltb = most sig bits >> (54 resexp))
224    ;  decimal point in last 18 bits == 8 lsb's in first 64 bits
225    ;  and 8 msb's in next 64 bits
226    ;point_five = ltb & 01h;
227    ;region = ((ltb >> 1) + point_five) & 3;
228    mov          rcx,54
229    mov          rax,r10
230    sub          rcx,r11
231    xor          rdx,rdx                  ;rdx = sign of x
232    shr          rax,cl
233    jnc          Ltanf_sse2_no_point_five_f
234    ;;if there is carry.. then negate the result of multiplication
235    not          r10
236    not          r9
237    not          r8
238    mov          rdx,08000000000000000h
239ALIGN 16
240Ltanf_sse2_no_point_five_f:
241    adc          rax,0
242    and          rax,3
243    movd         xmm4,eax                 ; xmm4 <-- region
244    ;calculate the number of integer bits and zero them out
245    mov          rcx,r11
246    add          rcx,10                   ; rcx = no. of integer bits
247    shl          r10,cl
248    shr          r10,cl                   ; r10 contains only mant bits
249    sub          rcx,64                   ; form the exponent
250    mov          r11,rcx
251    ;find the highest set bit
252    bsr          rcx,r10
253    jnz          Ltanf_sse2_form_mantissa_f
254    mov          r10,r9
255    mov          r9,r8
256    mov          r8,0
257    bsr          rcx,r10 ;rcx = hsb
258    sub          r11,64
259ALIGN 16
260Ltanf_sse2_form_mantissa_f:
261    add          r11,rcx                  ; for exp of x
262    sub          rcx,52                   ; rcx = no. of bits to shift in r10
263    cmp          rcx,0
264    jl           Ltanf_sse2_hsb_below_52_f
265    je           Ltanf_sse2_form_numbers_f
266    ;hsb above 52
267    mov          r8,r10
268    shr          r10,cl                   ; r10 = mantissa of x with hsb at 52
269    shr          r9,cl                    ; make space for bits from r10
270    sub          rcx,64
271    neg          rcx                      ; rcx = no of bits to shift r10
272    shl          r8,cl
273    or           r9,r8                    ; r9 = mantissa bits of xx
274    jmp          Ltanf_sse2_form_numbers_f
275
276ALIGN 16
277Ltanf_sse2_hsb_below_52_f:
278    neg          rcx
279    mov          rax,r9
280    shl          r10,cl
281    shl          r9,cl
282    sub          rcx,64
283    neg          rcx
284    shr          rax,cl
285    or           r10,rax
286    shr          r8,cl
287    or           r9,r8
288ALIGN 16
289Ltanf_sse2_form_numbers_f:
290    add          r11,1023
291    btr          r10,52                   ; remove the implied bit
292    mov          rcx,r11
293    or           r10,rdx                  ; put the sign
294    shl          rcx,52
295    or           r10,rcx                  ; x is in r10
296    movd         xmm0,r10                 ; xmm0 <-- x
297    mulsd        xmm0,L_piby2_lead
298
299Ltanf_sse2_exit_s:
300    movd         eax,xmm4
301    and          eax,1                    ; eax <-- region & 1
302    movapd       xmm1,xmm0
303    mulsd        xmm1,xmm0                ; xmm1 <-- x*x
304
305    movsd        xmm3,L_tanf+008h         ; xmm3 <-- c1
306    mulsd        xmm3,xmm1                ; xmm3 <-- c1*x^2
307    addsd        xmm3,L_tanf              ; xmm3 <-- c = c1*x^2 + c0
308
309    movsd        xmm2,L_tanf+020h         ; xmm2 <-- d2
310    mulsd        xmm2,xmm1                ; xmm2 <-- d2*x^2
311    addsd        xmm2,L_tanf+018h         ; xmm2 <-- d2*x^2 + d1
312    mulsd        xmm2,xmm1                ; xmm2 <-- (d2*x^2 + d1)*x^2
313    addsd        xmm2,L_tanf+010h         ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
314    divsd        xmm3,xmm2                ; xmm3 <-- c/d
315    mulsd        xmm1,xmm0                ; xmm1 <-- x^3
316    mulsd        xmm1,xmm3                ; xmm1 <-- x^3 * c/d
317    addsd        xmm0,xmm1                ; xmm0 <-- x + x^3 * c/d
318    cmp          eax,01h
319    jne          Ltanf_sse2_exit_tanpiby4
320Ltanf_sse2_recip :
321    movd         xmm3,L_n_one
322    divsd        xmm3,xmm0
323    movsd        xmm0,xmm3
324Ltanf_sse2_exit_tanpiby4 :
325    andpd        xmm5,L_signbit
326    xorpd        xmm0,xmm5
327
328Ltanf_sse2_return_s:
329    cvtsd2ss     xmm0,xmm0
330Ltanf_sse2_return_c:
331    StackDeallocate stack_size
332    ret
333
334Ltanf_sse2_naninf:
335    call    fname_special
336     StackDeallocate stack_size
337    ret
338
339ALIGN 16
340Ltanf_fma3:
341    vmovd        eax,xmm0
342    mov          r8d,L_inf_mask_32
343    and          eax,r8d
344    cmp          eax, r8d
345    jz           Ltanf_fma3_naninf
346
347    vcvtss2sd    xmm5,xmm0,xmm0
348    vmovq        r9,xmm5
349    btr          r9,63                    ; r9 <-- |x|
350
351    cmp          r9,L_piby4
352    jg           Ltanf_fma3_range_reduce
353    cmp          r9,L_mask_3f2
354    jge          Ltanf_fma3_compute_tanf_piby_4
355    cmp          r9,L_mask_3e4
356    jge          Ltanf_fma3_compute_x_xxx_0_333
357    jmp          Ltanf_fma3_return_c
358
359Ltanf_fma3_compute_x_xxx_0_333:
360    vmulsd       xmm2,xmm5,xmm5
361    vmulsd       xmm0,xmm2,xmm5
362    vfmadd132sd  xmm0,xmm5,L_point_333    ;  x + x*x*x*0.3333333333333333;
363    jmp          Ltanf_fma3_return_s
364
365Ltanf_fma3_compute_tanf_piby_4:
366    vmovsd       xmm0,xmm5,xmm5
367    vmulsd       xmm1,xmm0,xmm0
368    vmovsd       xmm3,L_tanf+008h
369    vfmadd213sd  xmm3,xmm1,L_tanf
370    vmovsd       xmm2,L_tanf+020h
371    vfmadd213sd  xmm2,xmm1,L_tanf+018h
372    vfmadd213sd  xmm2,xmm1,L_tanf+010h
373    vdivsd       xmm3,xmm3,xmm2
374    vmulsd       xmm1,xmm1,xmm0
375    vfmadd231sd  xmm0,xmm1,xmm3
376    jmp          Ltanf_fma3_return_s
377
378Ltanf_fma3_range_reduce:
379    vmovq        xmm0,r9
380    cmp          r9,L_large_x_fma3
381    jge          Ltanf_fma3_tanf_reduce_large
382
383Ltanf_fma3_tanf_reduce_moderate:
384    vandpd       xmm1,xmm0,L_sign_mask
385    vmovapd      xmm2,L_twobypi
386    vfmadd213sd  xmm2,xmm1,L_point_five
387    vcvttpd2dq   xmm2,xmm2
388    vpmovsxdq    xmm1,xmm2
389    vandpd       xmm4,xmm1,L_int_three    ; xmm4 <-- region
390    vshufps      xmm1 ,xmm1,xmm1,8
391    vcvtdq2pd    xmm1,xmm1
392    vmovdqa      xmm2,xmm0
393    vfnmadd231sd xmm2,xmm1,L_piby2_1      ; xmm2 rhead
394    vmulsd       xmm3,xmm1,L_piby2_1tail  ; xmm3 rtail
395    vsubsd       xmm0,xmm2,xmm3
396    vsubsd       xmm2,xmm2,xmm0
397    vsubsd       xmm1,xmm2,xmm3
398    jmp          Ltanf_fma3_exit_s
399
400Ltanf_fma3_tanf_reduce_large:
401    lea          r9,__L_2_by_pi_bits
402    ;xexp = (x >> 52) 1023
403    vmovq        r11,xmm0
404    mov          rcx,r11
405    shr          r11,52
406    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
407    ;calculate the last byte from which to start multiplication
408    ;last = 134 (xexp >> 3)
409    mov          r10,r11
410    shr          r10,3
411    sub          r10,134                  ; r10 <-- -last
412    neg          r10                      ; r10 <-- last
413    ;load 64 bits of 2_by_pi
414    mov          rax,[r9+r10]
415    ;mantissa of x = ((x << 12) >> 12) | implied bit
416    shl          rcx,12
417    shr          rcx,12                   ; rcx <-- mantissa part of input x
418    bts          rcx,52                   ; add the implied bit as well
419    ;load next 128 bits of 2_by_pi
420    add          r10,8                    ; increment to next 8 bytes of 2_by_pi
421    vmovdqu      xmm0,XMMWORD PTR[r9+r10]
422    ;do three 64bit multiplications with mant of x
423    mul          rcx
424    mov          r8,rax                   ; r8 = last 64 bits of mul = res1[2]
425    mov          r10,rdx                  ; r10 = carry
426    vmovq        rax,xmm0
427    mul          rcx
428    ;resexp = xexp & 7
429    and          r11,7                    ; r11 <-- resexp = last 3 bits of xexp
430    vpsrldq      xmm0,xmm0,8
431    add          rax,r10                  ; add the previous carry
432    adc          rdx,0
433    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
434    mov          r10,rdx                  ; r10 <-- carry
435    vmovq        rax,xmm0
436    mul          rcx
437    add          r10,rax                 ;r10 = most sig 64 bits = res1[0]
438    ;find the region
439    ;last three bits ltb = most sig bits >> (54 resexp))
440    ;  decimal point in last 18 bits == 8 lsb's in first 64 bits
441    ;  and 8 msb's in next 64 bits
442    ;point_five = ltb & 01h;
443    ;region = ((ltb >> 1) + point_five) & 3;
444    mov          rcx,54
445    mov          rax,r10
446    sub          rcx,r11
447    xor          rdx,rdx                  ;rdx = sign of x
448    shr          rax,cl
449    jnc          Ltanf_fma3_no_point_five_f
450    ;;if there is carry.. then negate the result of multiplication
451    not          r10
452    not          r9
453    not          r8
454    mov          rdx,08000000000000000h
455ALIGN 16
456Ltanf_fma3_no_point_five_f:
457    adc          rax,0
458    and          rax,3
459    vmovd        xmm4,eax                 ; xmm4 <-- region
460    ;calculate the number of integer bits and zero them out
461    mov          rcx,r11
462    add          rcx,10                   ; rcx = no. of integer bits
463    shl          r10,cl
464    shr          r10,cl                   ; r10 contains only mant bits
465    sub          rcx,64                   ; form the exponent
466    mov          r11,rcx
467    ;find the highest set bit
468    bsr          rcx,r10
469    jnz          Ltanf_fma3_form_mantissa_f
470    mov          r10,r9
471    mov          r9,r8
472    mov          r8,0
473    bsr          rcx,r10 ;rcx = hsb
474    sub          r11,64
475ALIGN 16
476Ltanf_fma3_form_mantissa_f:
477    add          r11,rcx                  ; for exp of x
478    sub          rcx,52                   ; rcx = no. of bits to shift in r10
479    cmp          rcx,0
480    jl           Ltanf_fma3_hsb_below_52_f
481    je           Ltanf_fma3_form_numbers_f
482    ;hsb above 52
483    mov          r8,r10
484    shr          r10,cl                   ; r10 = mantissa of x with hsb at 52
485    shr          r9,cl                    ; make space for bits from r10
486    sub          rcx,64
487    neg          rcx                      ; rcx = no of bits to shift r10
488    shl          r8,cl
489    or           r9,r8                    ; r9 = mantissa bits of xx
490    jmp          Ltanf_fma3_form_numbers_f
491
492ALIGN 16
493Ltanf_fma3_hsb_below_52_f:
494    neg          rcx
495    mov          rax,r9
496    shl          r10,cl
497    shl          r9,cl
498    sub          rcx,64
499    neg          rcx
500    shr          rax,cl
501    or           r10,rax
502    shr          r8,cl
503    or           r9,r8
504ALIGN 16
505Ltanf_fma3_form_numbers_f:
506    add          r11,1023
507    btr          r10,52                   ; remove the implied bit
508    mov          rcx,r11
509    or           r10,rdx                  ; put the sign
510    shl          rcx,52
511    or           r10,rcx                  ; x is in r10
512    vmovq        xmm0,r10                 ; xmm0 <-- x
513    vmulsd       xmm0,xmm0,L_piby2_lead
514
515Ltanf_fma3_exit_s:
516    vandpd       xmm2,xmm4,XMMWORD PTR L_int_one
517    vmovd        eax,xmm2
518    vmulsd       xmm1,xmm0,xmm0
519    vmovsd       xmm3,L_tanf+008h
520    vfmadd213sd  xmm3,xmm1,L_tanf
521    vmovsd       xmm2,L_tanf+020h
522    vfmadd213sd  xmm2,xmm1,L_tanf+018h
523    vfmadd213sd  xmm2,xmm1,L_tanf+010h
524    vdivsd       xmm3,xmm3,xmm2
525    vmulsd       xmm1,xmm1,xmm0
526    vfmadd231sd  xmm0,xmm1,xmm3
527    cmp          eax,01h
528    je           Ltanf_fma3_recip
529    jmp          Ltanf_fma3_exit_tanpiby4
530
531Ltanf_fma3_recip :
532    vmovq        xmm3,L_n_one
533    vdivsd       xmm0,xmm3,xmm0
534
535Ltanf_fma3_exit_tanpiby4 :
536    vandpd       xmm5,xmm5,L_signbit
537    vxorpd       xmm0,xmm0,xmm5
538
539Ltanf_fma3_return_s:
540    vcvtsd2ss    xmm0,xmm0,xmm0
541Ltanf_fma3_return_c:
542     StackDeallocate stack_size
543    ret
544
545Ltanf_fma3_naninf:
546    call    fname_special
547     StackDeallocate stack_size
548    ret
549
550fname endp
551END
552