
/*
 * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


#include <math.h>
#include <common_tanf.h>

//static vmask i2opi_vec[] = {
//    vcast_vm_i_i(0, i2opi_f[0]),
//    vcast_vm_i_i(0, i2opi_f[1]),
//    vcast_vm_i_i(0, i2opi_f[2]),
//    vcast_vm_i_i(0, i2opi_f[3]),
//    vcast_vm_i_i(0, i2opi_f[4]),
//    vcast_vm_i_i(0, i2opi_f[5]),
//};

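/*
 * Slow-path argument reduction for large |x|, Payne-Hanek style.
 * The mantissa of each lane is multiplied by 2/pi, held as 32-bit chunks
 * in i2opi_f (from common_tanf.h); the 96 product bits around the binary
 * point are captured, and the fractional part of x*2/pi is scaled back by
 * pi/2 (PI_2_M64) to give the reduced argument.  The quadrant parity is
 * written through h as a per-lane sign word (0 or 0x80000000).
 */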
vfloat static INLINE
__reduction_slowpath(vfloat const a, vmask *h)
{
    vint2 ia, e, idx, q, p, s;
    vint2 ia_a, ia_b, p_a, p_b, hi_a, hi_b;
    vint2 hi, lo, ll, prev, prev2;

    vmask i2opi_vec[] = {
        vcast_vm_i_i(0, i2opi_f[0]),
        vcast_vm_i_i(0, i2opi_f[1]),
        vcast_vm_i_i(0, i2opi_f[2]),
        vcast_vm_i_i(0, i2opi_f[3]),
        vcast_vm_i_i(0, i2opi_f[4]),
        vcast_vm_i_i(0, i2opi_f[5]),
    };

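    /* split x into sign (s), unbiased exponent (e), and the mantissa (ia) as
       a 32-bit fixed-point value with the implicit leading 1 restored */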
    ia = (vint2)a;
    s = vand_vi2_vi2_vi2(ia, vcast_vi2_i(0x80000000));
    /* e = ((ia >> 23) & 0xff) - 127; */
    e = vsrl_vi2_vi2_i(ia, 23);
    e = vand_vi2_vi2_vi2(e, vcast_vi2_i(0xff));
    e = vsub_vi2_vi2_vi2(e, vcast_vi2_i(127));
    /* ia = (ia << 8) | 0x80000000; */
    ia = vsll_vi2_vi2_i(ia, 8);
    ia = vor_vi2_vi2_vi2(ia, vcast_vi2_i(0x80000000));

    /* compute x * 2/pi */
    /* idx = 6 - ((e >> 5) & 3);
       picks the product word holding the bits of interest for this exponent;
       (e >> 5) & 3 is at most 3, so idx is never below 3 */
    idx = vsrl_vi2_vi2_i(e, 5);
    idx = vand_vi2_vi2_vi2(idx, vcast_vi2_i(3));
    idx = vsub_vi2_vi2_vi2(vcast_vi2_i(6), idx);

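    /* the vector is handled as 64-bit lanes: ia_a carries the upper 32-bit
       element of each lane and ia_b the lower one, presumably so that vmulu
       can form full 32x32->64-bit partial products per lane */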
    ia_a = vsrl64_vi2_vi2_i(ia, 32);
    ia_b = ia;
    hi_a = vcast_vi2_i(0);
    hi_b = vcast_vi2_i(0);

    q = vcast_vi2_i(0);
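    /* long multiplication of the mantissa by the six 2/pi chunks, low word
       first, carrying hi_a/hi_b between steps; when the word counter q
       matches idx, keep that product word (hi) and the two before it (lo, ll) */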
    for (int i = 0; i < 6; i++) {
        p_a = vmulu_vi2_vi2_vi2((vint2)i2opi_vec[i], ia_a);
        p_b = vmulu_vi2_vi2_vi2((vint2)i2opi_vec[i], ia_b);
        p_a = vadd64_vi2_vi2_vi2(p_a, hi_a);
        p_b = vadd64_vi2_vi2_vi2(p_b, hi_b);

        hi_a = vsrl64_vi2_vi2_i(p_a, 32);
        hi_b = vsrl64_vi2_vi2_i(p_b, 32);

        p_a = vsll64_vi2_vi2_i(p_a, 32);
        p_b = vand_vi2_vi2_vi2(p_b, vcast_vm_i_i(0, 0xffffffff));

        p = vor_vi2_vi2_vi2(p_a, p_b);

        vopmask m = veq_vo_vi2_vi2(idx, q);
        hi = vsel_vi2_vo_vi2_vi2(m, p, hi);
        lo = vsel_vi2_vo_vi2_vi2(m, prev, lo);
        ll = vsel_vi2_vo_vi2_vi2(m, prev2, ll);

        prev2 = prev;
        prev = p;

        q = vadd_vi2_vi2_vi2(q, vcast_vi2_i(1));
    }
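    /* the final carries form the seventh (most significant) product word,
       selected for lanes whose idx is 6 */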
    p = vor_vi2_vi2_vi2(vsll64_vi2_vi2_i(hi_a, 32), hi_b);

    vopmask m = veq_vo_vi2_vi2(idx, q);
    hi = vsel_vi2_vo_vi2_vi2(m, p, hi);
    lo = vsel_vi2_vo_vi2_vi2(m, prev, lo);
    ll = vsel_vi2_vo_vi2_vi2(m, prev2, ll);

    e = vand_vi2_vi2_vi2(e, vcast_vi2_i(31));

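    /* finish each lane in scalar code: shift the captured 96 bits by the
       remaining exponent e, split off the quadrant-parity bit, round to the
       nearest multiple of pi/2, and scale the signed fraction by pi/2 to get
       a reduced argument in roughly [-pi/4, pi/4] */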
    union {
        vint2 v;
        uint32_t t[sizeof(vint2) / sizeof(uint32_t)];
    } ue, uhi, ulo, ull, uh, ur;
    ue.v = e; uhi.v = hi; ulo.v = lo; ull.v = ll;
    for (unsigned i = 0; i < sizeof(vint2) / sizeof(uint32_t); i++) {
        uint32_t e = ue.t[i], q;
        uint64_t p;
        p = (uint64_t)uhi.t[i] << 32;
        p |= ulo.t[i];

        if (e) {
            /* shift in bits of the third word to keep a full 64-bit window */
            q = 32 - e;
            p = (p << e) | (ull.t[i] >> q);
        }

        /* quadrant parity bit, placed in the float sign position */
        q = (uhi.t[i] << e) & 0x80000000;
        p &= 0x7fffffffffffffffULL;

        /* fraction >= 1/2: step to the next quadrant and use the (negative)
           remainder instead */
        if (p & 0x4000000000000000ULL) {
            p |= 0x8000000000000000ULL;
            q ^= 0x80000000;
        }
        uh.t[i] = q;

        /* reduced argument = signed fraction * pi/2 */
        double d = (double)(int64_t)p;
        d *= PI_2_M64;
        float r = (float)d;
        ur.t[i] = float_as_int(r);
    }
    vstore_v_p_vf((float*)h, (vfloat)uh.v);
    /* apply the sign of x to the reduced argument (tan is odd) */
    return (vfloat)vxor_vi2_vi2_vi2(ur.v, s);
}

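/*
 * Core approximation: an odd minimax-style polynomial in the reduced
 * argument a (coefficients A_F..F_F from common_tanf.h) gives tan(a);
 * lanes whose quadrant word h is nonzero use tan(a + pi/2) = -1/tan(a)
 * instead.
 */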
vfloat static INLINE
__tan_kernel(vfloat const a, vint2 const h)
{
    vfloat s, r, rd, t;
    vopmask cmp;

    s = vmul_vf_vf_vf(a, a);
    r = vcast_vf_f(A_F);
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(B_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(C_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(D_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(E_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(F_F));
    t = vmul_vf_vf_vf(s, a);
    r = vfma_vf_vf_vf_vf(r, t, a);

    /* odd quadrant: tan(x) = -1/tan(a) */
    rd = vdiv_vf_vf_vf(vcast_vf_f(-1.0f), r);
    cmp = veq_vo_vi2_vi2((vint2)h, vcast_vi2_i(0));
    r = vsel_vf_vo_vf_vf(cmp, r, rd);

    return r;
}

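/*
 * Vector tangent: fast-path Cody-Waite style reduction modulo pi/2 plus the
 * polynomial kernel.  Lanes with |x| above THRESHOLD_F are redone through
 * the slow-path reduction; infinities and NaNs come out as NaN.
 */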
vfloat static INLINE
__tan_f_vec(vfloat const x)
{

    vfloat a, k, r;
    vopmask m;
    vint2 p, h;

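    /* k = round(x * 2/pi) via the 1.5*2^23 rounding constant; the low bit of
       k (the quadrant parity) is shifted into the sign position of h, then
       k*pi/2 is subtracted from x in three pieces (PI_2_HI/MI/LO_F) */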
    k = vfma_vf_vf_vf_vf(x, vcast_vf_f(_2_OVER_PI_F), vcast_vf_f(12582912.0f));
    h = vsll_vi2_vi2_i((vint2)k, 31);
    k = vsub_vf_vf_vf(k, vcast_vf_f(12582912.0f));

    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_HI_F), x);
    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_MI_F), a);
    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_LO_F), a);

    r = __tan_kernel(a, h);

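    /* lanes with |x| above THRESHOLD_F (compared as integer bit patterns)
       take the slow path; non-finite inputs are mapped to NaN via x * 0.0f */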
    p = vand_vi2_vi2_vi2((vint2)x, vcast_vi2_i(0x7fffffff));
    m = vgt_vo_vi2_vi2(p, (vint2)vcast_vf_f(THRESHOLD_F));
    if (__builtin_expect(!vtestz_i_vo(m), 0)) {
        vfloat res;
        vopmask ninf;
        vmask half;

        res = __reduction_slowpath(x, &half);
        res = __tan_kernel(res, half);
        ninf = vgt_vo_vi2_vi2(vcast_vi2_i(0x7f800000), p);
        res = vsel_vf_vo_vf_vf(ninf, res, vmul_vf_vf_vf(x, vcast_vf_f(0.0f)));

        r = vsel_vf_vo_vf_vf(m, res, r);
    }

    return r;
}