1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
6 // Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
12 /* The exp and log functions of this file initially come from
13 * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
14 */
15
16 #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
17 #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
18
19 namespace Eigen {
20 namespace internal {
21
22 template<typename Packet> EIGEN_STRONG_INLINE Packet
pfrexp_float(const Packet & a,Packet & exponent)23 pfrexp_float(const Packet& a, Packet& exponent) {
24 typedef typename unpacket_traits<Packet>::integer_packet PacketI;
25 const Packet cst_126f = pset1<Packet>(126.0f);
26 const Packet cst_half = pset1<Packet>(0.5f);
27 const Packet cst_inv_mant_mask = pset1frombits<Packet>(~0x7f800000u);
28 exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(a))), cst_126f);
29 return por(pand(a, cst_inv_mant_mask), cst_half);
30 }
31
32 template<typename Packet> EIGEN_STRONG_INLINE Packet
pfrexp_double(const Packet & a,Packet & exponent)33 pfrexp_double(const Packet& a, Packet& exponent) {
34 typedef typename unpacket_traits<Packet>::integer_packet PacketI;
35 const Packet cst_1022d = pset1<Packet>(1022.0);
36 const Packet cst_half = pset1<Packet>(0.5);
37 const Packet cst_inv_mant_mask = pset1frombits<Packet>(static_cast<uint64_t>(~0x7ff0000000000000ull));
38 exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(a))), cst_1022d);
39 return por(pand(a, cst_inv_mant_mask), cst_half);
40 }
41
42 template<typename Packet> EIGEN_STRONG_INLINE Packet
pldexp_float(Packet a,Packet exponent)43 pldexp_float(Packet a, Packet exponent)
44 {
45 typedef typename unpacket_traits<Packet>::integer_packet PacketI;
46 const Packet cst_127 = pset1<Packet>(127.f);
47 // return a * 2^exponent
48 PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
49 return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
50 }
51
52 template<typename Packet> EIGEN_STRONG_INLINE Packet
pldexp_double(Packet a,Packet exponent)53 pldexp_double(Packet a, Packet exponent)
54 {
55 typedef typename unpacket_traits<Packet>::integer_packet PacketI;
56 const Packet cst_1023 = pset1<Packet>(1023.0);
57 // return a * 2^exponent
58 PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_1023));
59 return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
60 }
61
62 // Natural or base 2 logarithm.
63 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
64 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
65 // be easily approximated by a polynomial centered on m=1 for stability.
66 // TODO(gonnet): Further reduce the interval allowing for lower-degree
67 // polynomial interpolants -> ... -> profit!
68 template <typename Packet, bool base2>
69 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
70 EIGEN_UNUSED
plog_impl_float(const Packet _x)71 Packet plog_impl_float(const Packet _x)
72 {
73 Packet x = _x;
74
75 const Packet cst_1 = pset1<Packet>(1.0f);
76 const Packet cst_neg_half = pset1<Packet>(-0.5f);
77 // The smallest non denormalized float number.
78 const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
79 const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
80 const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
81
82 // Polynomial coefficients.
83 const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
84 const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
85 const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
86 const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
87 const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
88 const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
89 const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
90 const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
91 const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
92 const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
93
94 // Truncate input values to the minimum positive normal.
95 x = pmax(x, cst_min_norm_pos);
96
97 Packet e;
98 // extract significant in the range [0.5,1) and exponent
99 x = pfrexp(x,e);
100
101 // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
102 // and shift by -1. The values are then centered around 0, which improves
103 // the stability of the polynomial evaluation.
104 // if( x < SQRTHF ) {
105 // e -= 1;
106 // x = x + x - 1.0;
107 // } else { x = x - 1.0; }
108 Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
109 Packet tmp = pand(x, mask);
110 x = psub(x, cst_1);
111 e = psub(e, pand(cst_1, mask));
112 x = padd(x, tmp);
113
114 Packet x2 = pmul(x, x);
115 Packet x3 = pmul(x2, x);
116
117 // Evaluate the polynomial approximant of degree 8 in three parts, probably
118 // to improve instruction-level parallelism.
119 Packet y, y1, y2;
120 y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
121 y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
122 y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
123 y = pmadd(y, x, cst_cephes_log_p2);
124 y1 = pmadd(y1, x, cst_cephes_log_p5);
125 y2 = pmadd(y2, x, cst_cephes_log_p8);
126 y = pmadd(y, x3, y1);
127 y = pmadd(y, x3, y2);
128 y = pmul(y, x3);
129
130 y = pmadd(cst_neg_half, x2, y);
131 x = padd(x, y);
132
133 // Add the logarithm of the exponent back to the result of the interpolation.
134 if (base2) {
135 const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
136 x = pmadd(x, cst_log2e, e);
137 } else {
138 const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
139 x = pmadd(e, cst_ln2, x);
140 }
141
142 Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
143 Packet iszero_mask = pcmp_eq(_x,pzero(_x));
144 Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
145 // Filter out invalid inputs, i.e.:
146 // - negative arg will be NAN
147 // - 0 will be -INF
148 // - +INF will be +INF
149 return pselect(iszero_mask, cst_minus_inf,
150 por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
151 }
152
153 template <typename Packet>
154 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
155 EIGEN_UNUSED
plog_float(const Packet _x)156 Packet plog_float(const Packet _x)
157 {
158 return plog_impl_float<Packet, /* base2 */ false>(_x);
159 }
160
161 template <typename Packet>
162 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
163 EIGEN_UNUSED
plog2_float(const Packet _x)164 Packet plog2_float(const Packet _x)
165 {
166 return plog_impl_float<Packet, /* base2 */ true>(_x);
167 }
168
169 /* Returns the base e (2.718...) or base 2 logarithm of x.
170 * The argument is separated into its exponent and fractional parts.
171 * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
172 * is approximated by
173 *
174 * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
175 *
176 * for more detail see: http://www.netlib.org/cephes/
177 */
178 template <typename Packet, bool base2>
179 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
180 EIGEN_UNUSED
plog_impl_double(const Packet _x)181 Packet plog_impl_double(const Packet _x)
182 {
183 Packet x = _x;
184
185 const Packet cst_1 = pset1<Packet>(1.0);
186 const Packet cst_neg_half = pset1<Packet>(-0.5);
187 // The smallest non denormalized double.
188 const Packet cst_min_norm_pos = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
189 const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
190 const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
191
192
193 // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
194 // 1/sqrt(2) <= x < sqrt(2)
195 const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
196 const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
197 const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
198 const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
199 const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
200 const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
201 const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
202
203 const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
204 const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
205 const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
206 const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
207 const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
208 const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
209
210 // Truncate input values to the minimum positive normal.
211 x = pmax(x, cst_min_norm_pos);
212
213 Packet e;
214 // extract significant in the range [0.5,1) and exponent
215 x = pfrexp(x,e);
216
217 // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
218 // and shift by -1. The values are then centered around 0, which improves
219 // the stability of the polynomial evaluation.
220 // if( x < SQRTHF ) {
221 // e -= 1;
222 // x = x + x - 1.0;
223 // } else { x = x - 1.0; }
224 Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
225 Packet tmp = pand(x, mask);
226 x = psub(x, cst_1);
227 e = psub(e, pand(cst_1, mask));
228 x = padd(x, tmp);
229
230 Packet x2 = pmul(x, x);
231 Packet x3 = pmul(x2, x);
232
233 // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
234 // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
235 Packet y, y1, y_;
236 y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
237 y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
238 y = pmadd(y, x, cst_cephes_log_p2);
239 y1 = pmadd(y1, x, cst_cephes_log_p5);
240 y_ = pmadd(y, x3, y1);
241
242 y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
243 y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
244 y = pmadd(y, x, cst_cephes_log_q2);
245 y1 = pmadd(y1, x, cst_cephes_log_q5);
246 y = pmadd(y, x3, y1);
247
248 y_ = pmul(y_, x3);
249 y = pdiv(y_, y);
250
251 y = pmadd(cst_neg_half, x2, y);
252 x = padd(x, y);
253
254 // Add the logarithm of the exponent back to the result of the interpolation.
255 if (base2) {
256 const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
257 x = pmadd(x, cst_log2e, e);
258 } else {
259 const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
260 x = pmadd(e, cst_ln2, x);
261 }
262
263 Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
264 Packet iszero_mask = pcmp_eq(_x,pzero(_x));
265 Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
266 // Filter out invalid inputs, i.e.:
267 // - negative arg will be NAN
268 // - 0 will be -INF
269 // - +INF will be +INF
270 return pselect(iszero_mask, cst_minus_inf,
271 por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
272 }
273
274 template <typename Packet>
275 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
276 EIGEN_UNUSED
plog_double(const Packet _x)277 Packet plog_double(const Packet _x)
278 {
279 return plog_impl_double<Packet, /* base2 */ false>(_x);
280 }
281
282 template <typename Packet>
283 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
284 EIGEN_UNUSED
plog2_double(const Packet _x)285 Packet plog2_double(const Packet _x)
286 {
287 return plog_impl_double<Packet, /* base2 */ true>(_x);
288 }
289
290 /** \internal \returns log(1 + x) computed using W. Kahan's formula.
291 See: http://www.plunk.org/~hatch/rightway.php
292 */
293 template<typename Packet>
generic_plog1p(const Packet & x)294 Packet generic_plog1p(const Packet& x)
295 {
296 typedef typename unpacket_traits<Packet>::type ScalarType;
297 const Packet one = pset1<Packet>(ScalarType(1));
298 Packet xp1 = padd(x, one);
299 Packet small_mask = pcmp_eq(xp1, one);
300 Packet log1 = plog(xp1);
301 Packet inf_mask = pcmp_eq(xp1, log1);
302 Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
303 return pselect(por(small_mask, inf_mask), x, log_large);
304 }
305
306 /** \internal \returns exp(x)-1 computed using W. Kahan's formula.
307 See: http://www.plunk.org/~hatch/rightway.php
308 */
309 template<typename Packet>
generic_expm1(const Packet & x)310 Packet generic_expm1(const Packet& x)
311 {
312 typedef typename unpacket_traits<Packet>::type ScalarType;
313 const Packet one = pset1<Packet>(ScalarType(1));
314 const Packet neg_one = pset1<Packet>(ScalarType(-1));
315 Packet u = pexp(x);
316 Packet one_mask = pcmp_eq(u, one);
317 Packet u_minus_one = psub(u, one);
318 Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
319 Packet logu = plog(u);
320 // The following comparison is to catch the case where
321 // exp(x) = +inf. It is written in this way to avoid having
322 // to form the constant +inf, which depends on the packet
323 // type.
324 Packet pos_inf_mask = pcmp_eq(logu, u);
325 Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
326 expm1 = pselect(pos_inf_mask, u, expm1);
327 return pselect(one_mask,
328 x,
329 pselect(neg_one_mask,
330 neg_one,
331 expm1));
332 }
333
334
335 // Exponential function. Works by writing "x = m*log(2) + r" where
336 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
337 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
338 template <typename Packet>
339 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
340 EIGEN_UNUSED
pexp_float(const Packet _x)341 Packet pexp_float(const Packet _x)
342 {
343 const Packet cst_1 = pset1<Packet>(1.0f);
344 const Packet cst_half = pset1<Packet>(0.5f);
345 const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
346 const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
347
348 const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
349 const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
350 const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
351 const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
352 const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
353 const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
354 const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
355
356 // Clamp x.
357 Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
358
359 // Express exp(x) as exp(m*ln(2) + r), start by extracting
360 // m = floor(x/ln(2) + 0.5).
361 Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
362
363 // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
364 // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
365 // truncation errors.
366 Packet r;
367 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
368 const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
369 r = pmadd(m, cst_nln2, x);
370 #else
371 const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
372 const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
373 r = psub(x, pmul(m, cst_cephes_exp_C1));
374 r = psub(r, pmul(m, cst_cephes_exp_C2));
375 #endif
376
377 Packet r2 = pmul(r, r);
378 Packet r3 = pmul(r2, r);
379
380 // Evaluate the polynomial approximant,improved by instruction-level parallelism.
381 Packet y, y1, y2;
382 y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
383 y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
384 y2 = padd(r, cst_1);
385 y = pmadd(y, r, cst_cephes_exp_p2);
386 y1 = pmadd(y1, r, cst_cephes_exp_p5);
387 y = pmadd(y, r3, y1);
388 y = pmadd(y, r2, y2);
389
390 // Return 2^m * exp(r).
391 return pmax(pldexp(y,m), _x);
392 }
393
394 template <typename Packet>
395 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
396 EIGEN_UNUSED
pexp_double(const Packet _x)397 Packet pexp_double(const Packet _x)
398 {
399 Packet x = _x;
400
401 const Packet cst_1 = pset1<Packet>(1.0);
402 const Packet cst_2 = pset1<Packet>(2.0);
403 const Packet cst_half = pset1<Packet>(0.5);
404
405 const Packet cst_exp_hi = pset1<Packet>(709.437);
406 const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
407
408 const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
409 const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
410 const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
411 const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
412 const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
413 const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
414 const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
415 const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
416 const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
417 const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
418
419 Packet tmp, fx;
420
421 // clamp x
422 x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
423 // Express exp(x) as exp(g + n*log(2)).
424 fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
425
426 // Get the integer modulus of log(2), i.e. the "n" described above.
427 fx = pfloor(fx);
428
429 // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
430 // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
431 // digits right.
432 tmp = pmul(fx, cst_cephes_exp_C1);
433 Packet z = pmul(fx, cst_cephes_exp_C2);
434 x = psub(x, tmp);
435 x = psub(x, z);
436
437 Packet x2 = pmul(x, x);
438
439 // Evaluate the numerator polynomial of the rational interpolant.
440 Packet px = cst_cephes_exp_p0;
441 px = pmadd(px, x2, cst_cephes_exp_p1);
442 px = pmadd(px, x2, cst_cephes_exp_p2);
443 px = pmul(px, x);
444
445 // Evaluate the denominator polynomial of the rational interpolant.
446 Packet qx = cst_cephes_exp_q0;
447 qx = pmadd(qx, x2, cst_cephes_exp_q1);
448 qx = pmadd(qx, x2, cst_cephes_exp_q2);
449 qx = pmadd(qx, x2, cst_cephes_exp_q3);
450
451 // I don't really get this bit, copied from the SSE2 routines, so...
452 // TODO(gonnet): Figure out what is going on here, perhaps find a better
453 // rational interpolant?
454 x = pdiv(px, psub(qx, px));
455 x = pmadd(cst_2, x, cst_1);
456
457 // Construct the result 2^n * exp(g) = e * x. The max is used to catch
458 // non-finite values in the input.
459 return pmax(pldexp(x,fx), _x);
460 }
461
462 // The following code is inspired by the following stack-overflow answer:
463 // https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
464 // It has been largely optimized:
465 // - By-pass calls to frexp.
466 // - Aligned loads of required 96 bits of 2/pi. This is accomplished by
467 // (1) balancing the mantissa and exponent to the required bits of 2/pi are
468 // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
469 // - Avoid a branch in rounding and extraction of the remaining fractional part.
470 // Overall, I measured a speed up higher than x2 on x86-64.
trig_reduce_huge(float xf,int * quadrant)471 inline float trig_reduce_huge (float xf, int *quadrant)
472 {
473 using Eigen::numext::int32_t;
474 using Eigen::numext::uint32_t;
475 using Eigen::numext::int64_t;
476 using Eigen::numext::uint64_t;
477
478 const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
479 const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
480
481 // 192 bits of 2/pi for Payne-Hanek reduction
482 // Bits are introduced by packet of 8 to enable aligned reads.
483 static const uint32_t two_over_pi [] =
484 {
485 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
486 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
487 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
488 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
489 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
490 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
491 0x10e41000, 0xe4100000
492 };
493
494 uint32_t xi = numext::bit_cast<uint32_t>(xf);
495 // Below, -118 = -126 + 8.
496 // -126 is to get the exponent,
497 // +8 is to enable alignment of 2/pi's bits on 8 bits.
498 // This is possible because the fractional part of x as only 24 meaningful bits.
499 uint32_t e = (xi >> 23) - 118;
500 // Extract the mantissa and shift it to align it wrt the exponent
501 xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
502
503 uint32_t i = e >> 3;
504 uint32_t twoopi_1 = two_over_pi[i-1];
505 uint32_t twoopi_2 = two_over_pi[i+3];
506 uint32_t twoopi_3 = two_over_pi[i+7];
507
508 // Compute x * 2/pi in 2.62-bit fixed-point format.
509 uint64_t p;
510 p = uint64_t(xi) * twoopi_3;
511 p = uint64_t(xi) * twoopi_2 + (p >> 32);
512 p = (uint64_t(xi * twoopi_1) << 32) + p;
513
514 // Round to nearest: add 0.5 and extract integral part.
515 uint64_t q = (p + zero_dot_five) >> 62;
516 *quadrant = int(q);
517 // Now it remains to compute "r = x - q*pi/2" with high accuracy,
518 // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
519 // r = (p-q)*pi/2,
520 // where the product can be be carried out with sufficient accuracy using double precision.
521 p -= q<<62;
522 return float(double(int64_t(p)) * pio2_62);
523 }
524
525 template<bool ComputeSine,typename Packet>
526 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
527 EIGEN_UNUSED
528 #if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
529 __attribute__((optimize("-fno-unsafe-math-optimizations")))
530 #endif
psincos_float(const Packet & _x)531 Packet psincos_float(const Packet& _x)
532 {
533 // Workaround -ffast-math aggressive optimizations
534 // See bug 1674
535 #if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
536 #define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X));
537 #else
538 #define EIGEN_SINCOS_DONT_OPT(X)
539 #endif
540
541 typedef typename unpacket_traits<Packet>::integer_packet PacketI;
542
543 const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
544 const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
545 const PacketI csti_1 = pset1<PacketI>(1);
546 const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
547
548 Packet x = pabs(_x);
549
550 // Scale x by 2/Pi to find x's octant.
551 Packet y = pmul(x, cst_2oPI);
552
553 // Rounding trick:
554 Packet y_round = padd(y, cst_rounding_magic);
555 EIGEN_SINCOS_DONT_OPT(y_round)
556 PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
557 y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
558
559 // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
560 // using "Extended precision modular arithmetic"
561 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
562 // This version requires true FMA for high accuracy
563 // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
564 const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
565 x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
566 x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
567 x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
568 #else
569 // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
570 // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
571 // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
572
573 // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
574 // and 2 ULP up to:
575 const float huge_th = ComputeSine ? 25966.f : 18838.f;
576 x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
577 EIGEN_SINCOS_DONT_OPT(x)
578 x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
579 EIGEN_SINCOS_DONT_OPT(x)
580 x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
581 x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
582
583 // For the record, the following set of coefficients maintain 2ULP up
584 // to a slightly larger range:
585 // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
586 // but it slightly fails to maintain 1ULP for two values of sin below pi.
587 // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
588 // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
589 // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
590 // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
591
592 // For the record, with only 3 iterations it is possible to maintain
593 // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
594 // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
595 #endif
596
597 if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
598 {
599 const int PacketSize = unpacket_traits<Packet>::size;
600 EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
601 EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
602 EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
603 pstoreu(vals, pabs(_x));
604 pstoreu(x_cpy, x);
605 pstoreu(y_int2, y_int);
606 for(int k=0; k<PacketSize;++k)
607 {
608 float val = vals[k];
609 if(val>=huge_th && (numext::isfinite)(val))
610 x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
611 }
612 x = ploadu<Packet>(x_cpy);
613 y_int = ploadu<PacketI>(y_int2);
614 }
615
616 // Compute the sign to apply to the polynomial.
617 // sin: sign = second_bit(y_int) xor signbit(_x)
618 // cos: sign = second_bit(y_int+1)
619 Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
620 : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
621 sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
622
623 // Get the polynomial selection mask from the second bit of y_int
624 // We'll calculate both (sin and cos) polynomials and then select from the two.
625 Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
626
627 Packet x2 = pmul(x,x);
628
629 // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
630 Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
631 y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
632 y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
633 y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
634 y1 = pmadd(y1, x2, pset1<Packet>(1.f));
635
636 // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
637 // octave/matlab code to compute those coefficients:
638 // x = (0:0.0001:pi/4)';
639 // A = [x.^3 x.^5 x.^7];
640 // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
641 // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
642 // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
643 //
644 Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
645 y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
646 y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
647 y2 = pmul(y2, x2);
648 y2 = pmadd(y2, x, x);
649
650 // Select the correct result from the two polynomials.
651 y = ComputeSine ? pselect(poly_mask,y2,y1)
652 : pselect(poly_mask,y1,y2);
653
654 // Update the sign and filter huge inputs
655 return pxor(y, sign_bit);
656
657 #undef EIGEN_SINCOS_DONT_OPT
658 }
659
660 template<typename Packet>
661 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
662 EIGEN_UNUSED
psin_float(const Packet & x)663 Packet psin_float(const Packet& x)
664 {
665 return psincos_float<true>(x);
666 }
667
668 template<typename Packet>
669 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
670 EIGEN_UNUSED
pcos_float(const Packet & x)671 Packet pcos_float(const Packet& x)
672 {
673 return psincos_float<false>(x);
674 }
675
676
677 template<typename Packet>
678 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
679 EIGEN_UNUSED
psqrt_complex(const Packet & a)680 Packet psqrt_complex(const Packet& a) {
681 typedef typename unpacket_traits<Packet>::type Scalar;
682 typedef typename Scalar::value_type RealScalar;
683 typedef typename unpacket_traits<Packet>::as_real RealPacket;
684
685 // Computes the principal sqrt of the complex numbers in the input.
686 //
687 // For example, for packets containing 2 complex numbers stored in interleaved format
688 // a = [a0, a1] = [x0, y0, x1, y1],
689 // where x0 = real(a0), y0 = imag(a0) etc., this function returns
690 // b = [b0, b1] = [u0, v0, u1, v1],
691 // such that b0^2 = a0, b1^2 = a1.
692 //
693 // To derive the formula for the complex square roots, let's consider the equation for
694 // a single complex square root of the number x + i*y. We want to find real numbers
695 // u and v such that
696 // (u + i*v)^2 = x + i*y <=>
697 // u^2 - v^2 + i*2*u*v = x + i*v.
698 // By equating the real and imaginary parts we get:
699 // u^2 - v^2 = x
700 // 2*u*v = y.
701 //
702 // For x >= 0, this has the numerically stable solution
703 // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
704 // v = 0.5 * (y / u)
705 // and for x < 0,
706 // v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
707 // u = |0.5 * (y / v)|
708 //
709 // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
710 // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
711
712 // In the following, without lack of generality, we have annotated the code, assuming
713 // that the input is a packet of 2 complex numbers.
714 //
715 // Step 1. Compute l = [l0, l0, l1, l1], where
716 // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2)
717 // To avoid over- and underflow, we use the stable formula for each hypotenuse
718 // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
719 // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
720
721 Packet a_flip = pcplxflip(a);
722 RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
723 RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|]
724 RealPacket a_max = pmax(a_abs, a_abs_flip);
725 RealPacket a_min = pmin(a_abs, a_abs_flip);
726 RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
727 RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
728 RealPacket r = pdiv(a_min, a_max);
729 const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
730 RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
731 // Set l to a_max if a_min is zero.
732 l = pselect(a_min_zero_mask, a_max, l);
733
734 // Step 2. Compute [rho0, *, rho1, *], where
735 // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|))
736 // We don't care about the imaginary parts computed here. They will be overwritten later.
737 const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
738 Packet rho;
739 rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
740
741 // Step 3. Compute [rho0, eta0, rho1, eta1], where
742 // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
743 // set eta = 0 of input is 0 + i0.
744 RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
745 RealPacket real_mask = peven_mask(a.v);
746 Packet positive_real_result;
747 // Compute result for inputs with positive real part.
748 positive_real_result.v = pselect(real_mask, rho.v, eta);
749
750 // Step 4. Compute solution for inputs with negative real part:
751 // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
752 const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
753 RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
754 Packet negative_real_result;
755 // Notice that rho is positive, so taking it's absolute value is a noop.
756 negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
757
758 // Step 5. Select solution branch based on the sign of the real parts.
759 Packet negative_real_mask;
760 negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
761 negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
762 Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
763
764 // Step 6. Handle special cases for infinities:
765 // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
766 // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
767 // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
768 // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
769 const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
770 Packet is_inf;
771 is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
772 Packet is_real_inf;
773 is_real_inf.v = pand(is_inf.v, real_mask);
774 is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
775 // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
776 Packet real_inf_result;
777 real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
778 real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
779 // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
780 Packet is_imag_inf;
781 is_imag_inf.v = pandnot(is_inf.v, real_mask);
782 is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
783 Packet imag_inf_result;
784 imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
785
786 return pselect(is_imag_inf, imag_inf_result,
787 pselect(is_real_inf, real_inf_result,result));
788 }
789
790 /* polevl (modified for Eigen)
791 *
792 * Evaluate polynomial
793 *
794 *
795 *
796 * SYNOPSIS:
797 *
798 * int N;
799 * Scalar x, y, coef[N+1];
800 *
801 * y = polevl<decltype(x), N>( x, coef);
802 *
803 *
804 *
805 * DESCRIPTION:
806 *
807 * Evaluates polynomial of degree N:
808 *
809 * 2 N
810 * y = C + C x + C x +...+ C x
811 * 0 1 2 N
812 *
813 * Coefficients are stored in reverse order:
814 *
815 * coef[0] = C , ..., coef[N] = C .
816 * N 0
817 *
818 * The function p1evl() assumes that coef[N] = 1.0 and is
819 * omitted from the array. Its calling arguments are
820 * otherwise the same as polevl().
821 *
822 *
823 * The Eigen implementation is templatized. For best speed, store
824 * coef as a const array (constexpr), e.g.
825 *
826 * const double coef[] = {1.0, 2.0, 3.0, ...};
827 *
828 */
829 template <typename Packet, int N>
830 struct ppolevl {
runppolevl831 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
832 EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
833 return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
834 }
835 };
836
837 template <typename Packet>
838 struct ppolevl<Packet, 0> {
839 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
840 EIGEN_UNUSED_VARIABLE(x);
841 return pset1<Packet>(coeff[0]);
842 }
843 };
844
845 /* chbevl (modified for Eigen)
846 *
847 * Evaluate Chebyshev series
848 *
849 *
850 *
851 * SYNOPSIS:
852 *
853 * int N;
854 * Scalar x, y, coef[N], chebevl();
855 *
856 * y = chbevl( x, coef, N );
857 *
858 *
859 *
860 * DESCRIPTION:
861 *
862 * Evaluates the series
863 *
864 * N-1
865 * - '
866 * y = > coef[i] T (x/2)
867 * - i
868 * i=0
869 *
870 * of Chebyshev polynomials Ti at argument x/2.
871 *
872 * Coefficients are stored in reverse order, i.e. the zero
873 * order term is last in the array. Note N is the number of
874 * coefficients, not the order.
875 *
876 * If coefficients are for the interval a to b, x must
877 * have been transformed to x -> 2(2x - b - a)/(b-a) before
878 * entering the routine. This maps x from (a, b) to (-1, 1),
879 * over which the Chebyshev polynomials are defined.
880 *
881 * If the coefficients are for the inverted interval, in
882 * which (a, b) is mapped to (1/b, 1/a), the transformation
883 * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity,
884 * this becomes x -> 4a/x - 1.
885 *
886 *
887 *
888 * SPEED:
889 *
890 * Taking advantage of the recurrence properties of the
891 * Chebyshev polynomials, the routine requires one more
892 * addition per loop than evaluating a nested polynomial of
893 * the same degree.
894 *
895 */
896
897 template <typename Packet, int N>
898 struct pchebevl {
899 EIGEN_DEVICE_FUNC
900 static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
901 typedef typename unpacket_traits<Packet>::type Scalar;
902 Packet b0 = pset1<Packet>(coef[0]);
903 Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
904 Packet b2;
905
906 for (int i = 1; i < N; i++) {
907 b2 = b1;
908 b1 = b0;
909 b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
910 }
911
912 return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
913 }
914 };
915
916 } // end namespace internal
917 } // end namespace Eigen
918
919 #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
920