; ; ; MIT License ; ----------- ; ; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. ; ; Permission is hereby granted, free of charge, to any person obtaining a copy ; of this Software and associated documentaon files (the "Software"), to deal ; in the Software without restriction, including without limitation the rights ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ; copies of the Software, and to permit persons to whom the Software is ; furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ; THE SOFTWARE. ; ; An implementation of the tanf function using the fma3 instruction. ; ; Prototype: ; ; float tanf(float x); ; ; Computes tanf(x). ; It will provide proper C99 return values, ; but may not raise floating point status bits properly. ; Based on the NAG C implementation. ; .const ALIGN 16 L_sign_mask DQ 07FFFFFFFFFFFFFFFh DQ 07FFFFFFFFFFFFFFFh L_twobypi DQ 03FE45F306DC9C883h DQ 03FE45F306DC9C883h L_int_three DQ 00000000000000003h DQ 00000000000000003h L_int_one DQ 00000000000000001h DQ 00000000000000001h L_signbit DQ 08000000000000000h DQ 08000000000000000h L_tanf DQ 03FD8A8B0DA56CB17h ; c0 DQ 0BF919DBA6EFD6AADh ; c1 DQ 03FF27E84A3E73A2Eh ; d0 DQ 0BFE07266D7B3511Bh ; d1 DQ 03F92E29003C692D9h ; d2 L_large_x_sse2 DQ 04160000000000000h ; 8388608. L_large_x_fma3 DQ 041E921FB40000000h ; 3.373259264e9 L_point_333 DQ 03FD5555555555555h L_mask_3e4 DQ 03e40000000000000h L_mask_3f2 DQ 03f20000000000000h L_point_five DQ 03FE0000000000000h L_piby2_1 DQ 03FF921FB54400000h L_piby2_1tail DQ 03DD0B4611A626331h L_piby2_lead DQ 03ff921fb54442d18h L_n_one DQ 0BFF0000000000000h L_piby4 DQ 03fe921fb54442d18h L_min_norm DQ 00010000000000000h L_inf_mask_32 DD 07F800000h DD 07F800000h EXTRN __use_fma3_lib:DWORD EXTRN __L_2_by_pi_bits:BYTE fname TEXTEQU fname_special TEXTEQU <_tanf_special> ; define local variable storage offsets ; actually there aren't any, but we need to leave room for _tanf_special. dummy_space EQU 20h stack_size EQU 38h include fm.inc ;Define name and any external functions being called EXTERN fname_special : PROC .code PUBLIC fname fname PROC FRAME StackAllocate stack_size .ENDPROLOG cmp DWORD PTR __use_fma3_lib, 0 jne Ltanf_fma3 Ltanf_sse2: movd eax,xmm0 mov r8d,L_inf_mask_32 and eax,r8d cmp eax, r8d jz Ltanf_sse2_naninf cvtss2sd xmm5,xmm0 movd r9,xmm5 btr r9,63 ; r9 <-- |x| cmp r9,L_piby4 jg Ltanf_sse2_range_reduce cmp r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125 jge Ltanf_sse2_compute_tanf_piby_4 cmp r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009 jge Ltanf_sse2_compute_x_xxx_0_333 ; At this point tan(x) ~= x; if it's not exact, set the inexact flag. test r9, r9 je Ltanf_sse2_exact_return movsd xmm1, L_n_one addsd xmm1, L_min_norm ; set inexact Ltanf_sse2_exact_return: StackDeallocate stack_size ret ALIGN 16 Ltanf_sse2_compute_x_xxx_0_333: movapd xmm2,xmm5 mulsd xmm2,xmm2 ; xmm2 <-- x^2 movapd xmm0,xmm2 mulsd xmm0,xmm5 ; xmm0 <-- x^3 mulsd xmm0,L_point_333 addsd xmm0,xmm5 ; x + x*x*x*0.3333333333333333; jmp Ltanf_sse2_return_s ALIGN 16 Ltanf_sse2_compute_tanf_piby_4: movapd xmm0,xmm5 ; xmm0 <-- x (as double) movapd xmm1,xmm0 mulsd xmm1,xmm0 ; xmm1 <-- x*x movsd xmm3,L_tanf+008h ; xmm3 <-- c1 mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 movsd xmm2,L_tanf+020h ; xmm2 <-- d2 mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 divsd xmm3,xmm2 ; xmm3 <-- c/d mulsd xmm1,xmm0 ; xmm1 <-- x^3 mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d jmp Ltanf_sse2_return_s Ltanf_sse2_range_reduce: movd xmm0,r9 cmp r9,L_large_x_sse2 jge Ltanf_sse2_tanf_reduce_large Ltanf_sse2_tanf_reduce_moderate: movapd xmm1,xmm0 andpd xmm1,L_sign_mask movapd xmm2,L_twobypi mulsd xmm2,xmm1 addsd xmm2,L_point_five cvttpd2dq xmm4,xmm2 cvtdq2pd xmm1,xmm4 andpd xmm4,L_int_three ; xmm4 <-- region movapd xmm2,xmm0 movapd xmm3,xmm1 mulsd xmm1,L_piby2_1 subsd xmm2,xmm1 mulsd xmm3,L_piby2_1tail ; xmm3 rtail movapd xmm0,xmm2 subsd xmm0,xmm3 subsd xmm2,xmm0 movapd xmm1,xmm2 subsd xmm1,xmm3 jmp Ltanf_sse2_exit_s Ltanf_sse2_tanf_reduce_large: lea r9,__L_2_by_pi_bits ;xexp = (x >> 52) 1023 movd r11,xmm0 mov rcx,r11 shr r11,52 sub r11,1023 ; r11 <-- xexp = exponent of input x ;calculate the last byte from which to start multiplication ;last = 134 (xexp >> 3) mov r10,r11 shr r10,3 sub r10,134 ; r10 <-- -last neg r10 ; r10 <-- last ;load 64 bits of 2_by_pi mov rax,[r9+r10] ;mantissa of x = ((x << 12) >> 12) | implied bit shl rcx,12 shr rcx,12 ; rcx <-- mantissa part of input x bts rcx,52 ; add the implied bit as well ;load next 128 bits of 2_by_pi add r10,8 ; increment to next 8 bytes of 2_by_pi movdqu xmm0,[r9+r10] ;do three 64bit multiplications with mant of x mul rcx mov r8,rax ; r8 = last 64 bits of mul = res1[2] mov r10,rdx ; r10 = carry vmovq rax,xmm0 mul rcx ;resexp = xexp & 7 and r11,7 ; r11 <-- resexp = last 3 bits of xexp psrldq xmm0,8 add rax,r10 ; add the previous carry adc rdx,0 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] mov r10,rdx ; r10 <-- carry movd rax,xmm0 mul rcx add r10,rax ;r10 = most sig 64 bits = res1[0] ;find the region ;last three bits ltb = most sig bits >> (54 resexp)) ; decimal point in last 18 bits == 8 lsb's in first 64 bits ; and 8 msb's in next 64 bits ;point_five = ltb & 01h; ;region = ((ltb >> 1) + point_five) & 3; mov rcx,54 mov rax,r10 sub rcx,r11 xor rdx,rdx ;rdx = sign of x shr rax,cl jnc Ltanf_sse2_no_point_five_f ;;if there is carry.. then negate the result of multiplication not r10 not r9 not r8 mov rdx,08000000000000000h ALIGN 16 Ltanf_sse2_no_point_five_f: adc rax,0 and rax,3 movd xmm4,eax ; xmm4 <-- region ;calculate the number of integer bits and zero them out mov rcx,r11 add rcx,10 ; rcx = no. of integer bits shl r10,cl shr r10,cl ; r10 contains only mant bits sub rcx,64 ; form the exponent mov r11,rcx ;find the highest set bit bsr rcx,r10 jnz Ltanf_sse2_form_mantissa_f mov r10,r9 mov r9,r8 mov r8,0 bsr rcx,r10 ;rcx = hsb sub r11,64 ALIGN 16 Ltanf_sse2_form_mantissa_f: add r11,rcx ; for exp of x sub rcx,52 ; rcx = no. of bits to shift in r10 cmp rcx,0 jl Ltanf_sse2_hsb_below_52_f je Ltanf_sse2_form_numbers_f ;hsb above 52 mov r8,r10 shr r10,cl ; r10 = mantissa of x with hsb at 52 shr r9,cl ; make space for bits from r10 sub rcx,64 neg rcx ; rcx = no of bits to shift r10 shl r8,cl or r9,r8 ; r9 = mantissa bits of xx jmp Ltanf_sse2_form_numbers_f ALIGN 16 Ltanf_sse2_hsb_below_52_f: neg rcx mov rax,r9 shl r10,cl shl r9,cl sub rcx,64 neg rcx shr rax,cl or r10,rax shr r8,cl or r9,r8 ALIGN 16 Ltanf_sse2_form_numbers_f: add r11,1023 btr r10,52 ; remove the implied bit mov rcx,r11 or r10,rdx ; put the sign shl rcx,52 or r10,rcx ; x is in r10 movd xmm0,r10 ; xmm0 <-- x mulsd xmm0,L_piby2_lead Ltanf_sse2_exit_s: movd eax,xmm4 and eax,1 ; eax <-- region & 1 movapd xmm1,xmm0 mulsd xmm1,xmm0 ; xmm1 <-- x*x movsd xmm3,L_tanf+008h ; xmm3 <-- c1 mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 movsd xmm2,L_tanf+020h ; xmm2 <-- d2 mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 divsd xmm3,xmm2 ; xmm3 <-- c/d mulsd xmm1,xmm0 ; xmm1 <-- x^3 mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d cmp eax,01h jne Ltanf_sse2_exit_tanpiby4 Ltanf_sse2_recip : movd xmm3,L_n_one divsd xmm3,xmm0 movsd xmm0,xmm3 Ltanf_sse2_exit_tanpiby4 : andpd xmm5,L_signbit xorpd xmm0,xmm5 Ltanf_sse2_return_s: cvtsd2ss xmm0,xmm0 Ltanf_sse2_return_c: StackDeallocate stack_size ret Ltanf_sse2_naninf: call fname_special StackDeallocate stack_size ret ALIGN 16 Ltanf_fma3: vmovd eax,xmm0 mov r8d,L_inf_mask_32 and eax,r8d cmp eax, r8d jz Ltanf_fma3_naninf vcvtss2sd xmm5,xmm0,xmm0 vmovq r9,xmm5 btr r9,63 ; r9 <-- |x| cmp r9,L_piby4 jg Ltanf_fma3_range_reduce cmp r9,L_mask_3f2 jge Ltanf_fma3_compute_tanf_piby_4 cmp r9,L_mask_3e4 jge Ltanf_fma3_compute_x_xxx_0_333 jmp Ltanf_fma3_return_c Ltanf_fma3_compute_x_xxx_0_333: vmulsd xmm2,xmm5,xmm5 vmulsd xmm0,xmm2,xmm5 vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333; jmp Ltanf_fma3_return_s Ltanf_fma3_compute_tanf_piby_4: vmovsd xmm0,xmm5,xmm5 vmulsd xmm1,xmm0,xmm0 vmovsd xmm3,L_tanf+008h vfmadd213sd xmm3,xmm1,L_tanf vmovsd xmm2,L_tanf+020h vfmadd213sd xmm2,xmm1,L_tanf+018h vfmadd213sd xmm2,xmm1,L_tanf+010h vdivsd xmm3,xmm3,xmm2 vmulsd xmm1,xmm1,xmm0 vfmadd231sd xmm0,xmm1,xmm3 jmp Ltanf_fma3_return_s Ltanf_fma3_range_reduce: vmovq xmm0,r9 cmp r9,L_large_x_fma3 jge Ltanf_fma3_tanf_reduce_large Ltanf_fma3_tanf_reduce_moderate: vandpd xmm1,xmm0,L_sign_mask vmovapd xmm2,L_twobypi vfmadd213sd xmm2,xmm1,L_point_five vcvttpd2dq xmm2,xmm2 vpmovsxdq xmm1,xmm2 vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region vshufps xmm1 ,xmm1,xmm1,8 vcvtdq2pd xmm1,xmm1 vmovdqa xmm2,xmm0 vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail vsubsd xmm0,xmm2,xmm3 vsubsd xmm2,xmm2,xmm0 vsubsd xmm1,xmm2,xmm3 jmp Ltanf_fma3_exit_s Ltanf_fma3_tanf_reduce_large: lea r9,__L_2_by_pi_bits ;xexp = (x >> 52) 1023 vmovq r11,xmm0 mov rcx,r11 shr r11,52 sub r11,1023 ; r11 <-- xexp = exponent of input x ;calculate the last byte from which to start multiplication ;last = 134 (xexp >> 3) mov r10,r11 shr r10,3 sub r10,134 ; r10 <-- -last neg r10 ; r10 <-- last ;load 64 bits of 2_by_pi mov rax,[r9+r10] ;mantissa of x = ((x << 12) >> 12) | implied bit shl rcx,12 shr rcx,12 ; rcx <-- mantissa part of input x bts rcx,52 ; add the implied bit as well ;load next 128 bits of 2_by_pi add r10,8 ; increment to next 8 bytes of 2_by_pi vmovdqu xmm0,XMMWORD PTR[r9+r10] ;do three 64bit multiplications with mant of x mul rcx mov r8,rax ; r8 = last 64 bits of mul = res1[2] mov r10,rdx ; r10 = carry vmovq rax,xmm0 mul rcx ;resexp = xexp & 7 and r11,7 ; r11 <-- resexp = last 3 bits of xexp vpsrldq xmm0,xmm0,8 add rax,r10 ; add the previous carry adc rdx,0 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] mov r10,rdx ; r10 <-- carry vmovq rax,xmm0 mul rcx add r10,rax ;r10 = most sig 64 bits = res1[0] ;find the region ;last three bits ltb = most sig bits >> (54 resexp)) ; decimal point in last 18 bits == 8 lsb's in first 64 bits ; and 8 msb's in next 64 bits ;point_five = ltb & 01h; ;region = ((ltb >> 1) + point_five) & 3; mov rcx,54 mov rax,r10 sub rcx,r11 xor rdx,rdx ;rdx = sign of x shr rax,cl jnc Ltanf_fma3_no_point_five_f ;;if there is carry.. then negate the result of multiplication not r10 not r9 not r8 mov rdx,08000000000000000h ALIGN 16 Ltanf_fma3_no_point_five_f: adc rax,0 and rax,3 vmovd xmm4,eax ; xmm4 <-- region ;calculate the number of integer bits and zero them out mov rcx,r11 add rcx,10 ; rcx = no. of integer bits shl r10,cl shr r10,cl ; r10 contains only mant bits sub rcx,64 ; form the exponent mov r11,rcx ;find the highest set bit bsr rcx,r10 jnz Ltanf_fma3_form_mantissa_f mov r10,r9 mov r9,r8 mov r8,0 bsr rcx,r10 ;rcx = hsb sub r11,64 ALIGN 16 Ltanf_fma3_form_mantissa_f: add r11,rcx ; for exp of x sub rcx,52 ; rcx = no. of bits to shift in r10 cmp rcx,0 jl Ltanf_fma3_hsb_below_52_f je Ltanf_fma3_form_numbers_f ;hsb above 52 mov r8,r10 shr r10,cl ; r10 = mantissa of x with hsb at 52 shr r9,cl ; make space for bits from r10 sub rcx,64 neg rcx ; rcx = no of bits to shift r10 shl r8,cl or r9,r8 ; r9 = mantissa bits of xx jmp Ltanf_fma3_form_numbers_f ALIGN 16 Ltanf_fma3_hsb_below_52_f: neg rcx mov rax,r9 shl r10,cl shl r9,cl sub rcx,64 neg rcx shr rax,cl or r10,rax shr r8,cl or r9,r8 ALIGN 16 Ltanf_fma3_form_numbers_f: add r11,1023 btr r10,52 ; remove the implied bit mov rcx,r11 or r10,rdx ; put the sign shl rcx,52 or r10,rcx ; x is in r10 vmovq xmm0,r10 ; xmm0 <-- x vmulsd xmm0,xmm0,L_piby2_lead Ltanf_fma3_exit_s: vandpd xmm2,xmm4,XMMWORD PTR L_int_one vmovd eax,xmm2 vmulsd xmm1,xmm0,xmm0 vmovsd xmm3,L_tanf+008h vfmadd213sd xmm3,xmm1,L_tanf vmovsd xmm2,L_tanf+020h vfmadd213sd xmm2,xmm1,L_tanf+018h vfmadd213sd xmm2,xmm1,L_tanf+010h vdivsd xmm3,xmm3,xmm2 vmulsd xmm1,xmm1,xmm0 vfmadd231sd xmm0,xmm1,xmm3 cmp eax,01h je Ltanf_fma3_recip jmp Ltanf_fma3_exit_tanpiby4 Ltanf_fma3_recip : vmovq xmm3,L_n_one vdivsd xmm0,xmm3,xmm0 Ltanf_fma3_exit_tanpiby4 : vandpd xmm5,xmm5,L_signbit vxorpd xmm0,xmm0,xmm5 Ltanf_fma3_return_s: vcvtsd2ss xmm0,xmm0,xmm0 Ltanf_fma3_return_c: StackDeallocate stack_size ret Ltanf_fma3_naninf: call fname_special StackDeallocate stack_size ret fname endp END