/*
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


#include <math.h>
#include <common_tanf.h>

//static vmask i2opi_vec[] = {
//    vcast_vm_i_i(0, i2opi_f[0]),
//    vcast_vm_i_i(0, i2opi_f[1]),
//    vcast_vm_i_i(0, i2opi_f[2]),
//    vcast_vm_i_i(0, i2opi_f[3]),
//    vcast_vm_i_i(0, i2opi_f[4]),
//    vcast_vm_i_i(0, i2opi_f[5]),
//};

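/*
 * Large-argument reduction (Payne-Hanek style): multiply the input's
 * mantissa by 32-bit chunks of 2/pi (i2opi_f, from common_tanf.h) to
 * recover the fractional part of x * 2/pi, then rescale by PI_2_M64
 * (presumably pi/2 * 2^-64).  The sign bit written through *h records
 * whether x lands in an odd multiple of pi/2; the tan kernel uses it
 * to select the -1/tan branch.
 */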
vfloat static INLINE
__reduction_slowpath(vfloat const a, vmask *h)
{
    vint2 ia, e, idx, q, p, s;
    vint2 ia_a, ia_b, p_a, p_b, hi_a, hi_b;
    vint2 hi, lo, ll, prev, prev2;

    vmask i2opi_vec[] = {
        vcast_vm_i_i(0, i2opi_f[0]),
        vcast_vm_i_i(0, i2opi_f[1]),
        vcast_vm_i_i(0, i2opi_f[2]),
        vcast_vm_i_i(0, i2opi_f[3]),
        vcast_vm_i_i(0, i2opi_f[4]),
        vcast_vm_i_i(0, i2opi_f[5]),
    };

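    /* Split the input into sign bit, unbiased exponent, and mantissa
       (with the implicit leading 1 made explicit in the top bit). */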
    ia = (vint2)a;
    s = vand_vi2_vi2_vi2(ia, vcast_vi2_i(0x80000000));
    /* e = ((ia >> 23) & 0xff) - 127; */
    e = vsrl_vi2_vi2_i(ia, 23);
    e = vand_vi2_vi2_vi2(e, vcast_vi2_i(0xff));
    e = vsub_vi2_vi2_vi2(e, vcast_vi2_i(127));
    /* ia = (ia << 8) | 0x80000000; */
    ia = vsll_vi2_vi2_i(ia, 8);
    ia = vor_vi2_vi2_vi2(ia, vcast_vi2_i(0x80000000));

    /* compute x * 2/pi */
    /* idx = 6 - ((e >> 5) & 3); */
    idx = vsrl_vi2_vi2_i(e, 5);
    idx = vand_vi2_vi2_vi2(idx, vcast_vi2_i(3));
    idx = vsub_vi2_vi2_vi2(vcast_vi2_i(6), idx);

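    /* The vector is handled as 64-bit elements: ia_a carries the value from
       the odd 32-bit lane of each element, ia_b the even lane, so the
       unsigned multiplies below (presumably 32x32 -> 64-bit on the low half
       of each element, as with PMULUDQ) cover all lanes. */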
    ia_a = vsrl64_vi2_vi2_i(ia, 32);
    ia_b = ia;
    hi_a = vcast_vi2_i(0);
    hi_b = vcast_vi2_i(0);

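    /* Long multiplication of the mantissa by the 192-bit 2/pi table, one
       32-bit chunk per iteration, propagating carries through hi_a/hi_b.
       The three consecutive 32-bit product words around the binary point
       of x * 2/pi (selected by idx) are latched into hi:lo:ll. */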
    q = vcast_vi2_i(0);
    for (int i = 0; i < 6; i++) {
        p_a = vmulu_vi2_vi2_vi2((vint2)i2opi_vec[i], ia_a);
        p_b = vmulu_vi2_vi2_vi2((vint2)i2opi_vec[i], ia_b);
        p_a = vadd64_vi2_vi2_vi2(p_a, hi_a);
        p_b = vadd64_vi2_vi2_vi2(p_b, hi_b);

        hi_a = vsrl64_vi2_vi2_i(p_a, 32);
        hi_b = vsrl64_vi2_vi2_i(p_b, 32);

        p_a = vsll64_vi2_vi2_i(p_a, 32);
        p_b = vand_vi2_vi2_vi2(p_b, vcast_vm_i_i(0, 0xffffffff));

        p = vor_vi2_vi2_vi2(p_a, p_b);

        vopmask m = veq_vo_vi2_vi2(idx, q);
        hi = vsel_vi2_vo_vi2_vi2(m, p, hi);
        lo = vsel_vi2_vo_vi2_vi2(m, prev, lo);
        ll = vsel_vi2_vo_vi2_vi2(m, prev2, ll);

        prev2 = prev;
        prev = p;

        q = vadd_vi2_vi2_vi2(q, vcast_vi2_i(1));
    }
    p = vor_vi2_vi2_vi2(vsll64_vi2_vi2_i(hi_a, 32), hi_b);

    vopmask m = veq_vo_vi2_vi2(idx, q);
    hi = vsel_vi2_vo_vi2_vi2(m, p, hi);
    lo = vsel_vi2_vo_vi2_vi2(m, prev, lo);
    ll = vsel_vi2_vo_vi2_vi2(m, prev2, ll);

    e = vand_vi2_vi2_vi2(e, vcast_vi2_i(31));

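    /* Finish per lane in scalar code: assemble hi:lo into a 64-bit value,
       shift in bits from ll by the remaining exponent (e & 31), peel off
       the quadrant parity bit into uh, round the fraction toward the
       nearest multiple of pi/2, and convert it back to a float scaled by
       PI_2_M64. */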
    union {
        vint2 v;
        uint32_t t[sizeof(vint2) / sizeof(uint32_t)];
    } ue, uhi, ulo, ull, uh, ur;
    ue.v = e; uhi.v = hi; ulo.v = lo; ull.v = ll;
    for (unsigned i = 0; i < sizeof(vint2) / sizeof(uint32_t); i++) {
        uint32_t e = ue.t[i], q;
        uint64_t p;
        p = (uint64_t)uhi.t[i] << 32;
        p |= ulo.t[i];

        if (e) {
            q = 32 - e;
            p = (p << e) | (ull.t[i] >> q);
        }

        q = (uhi.t[i] << e) & 0x80000000;
        p &= 0x7fffffffffffffffULL;

        if (p & 0x4000000000000000ULL) {
            p |= 0x8000000000000000ULL;
            q ^= 0x80000000;
        }
        uh.t[i] = q;

        double d = (double)(int64_t)p;
        d *= PI_2_M64;
        float r = (float)d;
        ur.t[i] = float_as_int(r);
    }
    vstore_v_p_vf((float*)h, (vfloat)uh.v);
    return (vfloat)vxor_vi2_vi2_vi2(ur.v, s);
}

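/*
 * Polynomial core for tan on the reduced interval around zero.  A_F..F_F
 * (defined in common_tanf.h) are the polynomial coefficients in s = a*a.
 * When a lane of h has its sign bit set, the argument came from an odd
 * multiple of pi/2, so the result is replaced by -1/tan(a).
 */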
vfloat static INLINE
__tan_kernel(vfloat const a, vint2 const h)
{
    vfloat s, r, rd, t;
    vopmask cmp;

    s = vmul_vf_vf_vf(a, a);
    r = vcast_vf_f(A_F);
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(B_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(C_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(D_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(E_F));
    r = vfma_vf_vf_vf_vf(r, s, vcast_vf_f(F_F));
    t = vmul_vf_vf_vf(s, a);
    r = vfma_vf_vf_vf_vf(r, t, a);

    rd = vdiv_vf_vf_vf(vcast_vf_f(-1.0f), r);
    cmp = veq_vo_vi2_vi2((vint2)h, vcast_vi2_i(0));
    r = vsel_vf_vo_vf_vf(cmp, r, rd);

    return r;
}

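/*
 * Vector tan entry point.  The fast path reduces x modulo pi/2 with a
 * three-term Cody-Waite subtraction; inputs larger than THRESHOLD_F fall
 * back to __reduction_slowpath, and infinities/NaNs are mapped to NaN.
 */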
vfloat static INLINE
__tan_f_vec(vfloat const x)
{

    vfloat a, k, r;
    vopmask m;
    vint2 p, h;

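    /* k = round(x * 2/pi) via the round-to-nearest shifter 12582912 = 1.5 * 2^23;
       the low bit of that integer (the quadrant parity) is moved into the
       sign position of h before the shifter is subtracted back out. */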
    k = vfma_vf_vf_vf_vf(x, vcast_vf_f(_2_OVER_PI_F), vcast_vf_f(12582912.0f));
    h = vsll_vi2_vi2_i((vint2)k, 31);
    k = vsub_vf_vf_vf(k, vcast_vf_f(12582912.0f));

    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_HI_F), x);
    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_MI_F), a);
    a = vfma_vf_vf_vf_vf(k, vcast_vf_f(-PI_2_LO_F), a);

    r = __tan_kernel(a, h);

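    /* The slow path is only entered if some lane's |x| exceeds THRESHOLD_F;
       __builtin_expect marks the branch as unlikely.  Infinities and NaNs
       yield NaN via x * 0.0f. */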
    p = vand_vi2_vi2_vi2((vint2)x, vcast_vi2_i(0x7fffffff));
    m = vgt_vo_vi2_vi2(p, (vint2)vcast_vf_f(THRESHOLD_F));
    if (__builtin_expect(!vtestz_i_vo(m), 0)) {
        vfloat res;
        vopmask ninf;
        vmask half;

        res = __reduction_slowpath(x, &half);
        res = __tan_kernel(res, half);
        ninf = vgt_vo_vi2_vi2(vcast_vi2_i(0x7f800000), p);
        res = vsel_vf_vo_vf_vf(ninf, res, vmul_vf_vf_vf(x, vcast_vf_f(0.0f)));

        r = vsel_vf_vo_vf_vf(m, res, r);
    }

    return r;
}