1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
6 // Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 /* The exp and log functions of this file initially come from
13  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
14  */
15 
16 #ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
17 #define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
18 
19 namespace Eigen {
20 namespace internal {
21 
22 template<typename Packet> EIGEN_STRONG_INLINE Packet
pfrexp_float(const Packet & a,Packet & exponent)23 pfrexp_float(const Packet& a, Packet& exponent) {
24   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
25   const Packet cst_126f = pset1<Packet>(126.0f);
26   const Packet cst_half = pset1<Packet>(0.5f);
27   const Packet cst_inv_mant_mask  = pset1frombits<Packet>(~0x7f800000u);
28   exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(a))), cst_126f);
29   return por(pand(a, cst_inv_mant_mask), cst_half);
30 }
31 
32 template<typename Packet> EIGEN_STRONG_INLINE Packet
pfrexp_double(const Packet & a,Packet & exponent)33 pfrexp_double(const Packet& a, Packet& exponent) {
34   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
35   const Packet cst_1022d = pset1<Packet>(1022.0);
36   const Packet cst_half = pset1<Packet>(0.5);
37   const Packet cst_inv_mant_mask  = pset1frombits<Packet>(static_cast<uint64_t>(~0x7ff0000000000000ull));
38   exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(a))), cst_1022d);
39   return por(pand(a, cst_inv_mant_mask), cst_half);
40 }
41 
42 template<typename Packet> EIGEN_STRONG_INLINE Packet
pldexp_float(Packet a,Packet exponent)43 pldexp_float(Packet a, Packet exponent)
44 {
45   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
46   const Packet cst_127 = pset1<Packet>(127.f);
47   // return a * 2^exponent
48   PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
49   return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
50 }
51 
52 template<typename Packet> EIGEN_STRONG_INLINE Packet
pldexp_double(Packet a,Packet exponent)53 pldexp_double(Packet a, Packet exponent)
54 {
55   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
56   const Packet cst_1023 = pset1<Packet>(1023.0);
57   // return a * 2^exponent
58   PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_1023));
59   return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
60 }
61 
62 // Natural or base 2 logarithm.
63 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
64 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
65 // be easily approximated by a polynomial centered on m=1 for stability.
66 // TODO(gonnet): Further reduce the interval allowing for lower-degree
67 //               polynomial interpolants -> ... -> profit!
68 template <typename Packet, bool base2>
69 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
70 EIGEN_UNUSED
plog_impl_float(const Packet _x)71 Packet plog_impl_float(const Packet _x)
72 {
73   Packet x = _x;
74 
75   const Packet cst_1              = pset1<Packet>(1.0f);
76   const Packet cst_neg_half       = pset1<Packet>(-0.5f);
77   // The smallest non denormalized float number.
78   const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x00800000u);
79   const Packet cst_minus_inf      = pset1frombits<Packet>( 0xff800000u);
80   const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7f800000u);
81 
82   // Polynomial coefficients.
83   const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
84   const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
85   const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
86   const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
87   const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
88   const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
89   const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
90   const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
91   const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
92   const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
93 
94   // Truncate input values to the minimum positive normal.
95   x = pmax(x, cst_min_norm_pos);
96 
97   Packet e;
98   // extract significant in the range [0.5,1) and exponent
99   x = pfrexp(x,e);
100 
101   // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
102   // and shift by -1. The values are then centered around 0, which improves
103   // the stability of the polynomial evaluation.
104   //   if( x < SQRTHF ) {
105   //     e -= 1;
106   //     x = x + x - 1.0;
107   //   } else { x = x - 1.0; }
108   Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
109   Packet tmp = pand(x, mask);
110   x = psub(x, cst_1);
111   e = psub(e, pand(cst_1, mask));
112   x = padd(x, tmp);
113 
114   Packet x2 = pmul(x, x);
115   Packet x3 = pmul(x2, x);
116 
117   // Evaluate the polynomial approximant of degree 8 in three parts, probably
118   // to improve instruction-level parallelism.
119   Packet y, y1, y2;
120   y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
121   y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
122   y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
123   y  = pmadd(y, x, cst_cephes_log_p2);
124   y1 = pmadd(y1, x, cst_cephes_log_p5);
125   y2 = pmadd(y2, x, cst_cephes_log_p8);
126   y  = pmadd(y, x3, y1);
127   y  = pmadd(y, x3, y2);
128   y  = pmul(y, x3);
129 
130   y = pmadd(cst_neg_half, x2, y);
131   x = padd(x, y);
132 
133   // Add the logarithm of the exponent back to the result of the interpolation.
134   if (base2) {
135     const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
136     x = pmadd(x, cst_log2e, e);
137   } else {
138     const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
139     x = pmadd(e, cst_ln2, x);
140   }
141 
142   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
143   Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
144   Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
145   // Filter out invalid inputs, i.e.:
146   //  - negative arg will be NAN
147   //  - 0 will be -INF
148   //  - +INF will be +INF
149   return pselect(iszero_mask, cst_minus_inf,
150                               por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
151 }
152 
153 template <typename Packet>
154 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
155 EIGEN_UNUSED
plog_float(const Packet _x)156 Packet plog_float(const Packet _x)
157 {
158   return plog_impl_float<Packet, /* base2 */ false>(_x);
159 }
160 
161 template <typename Packet>
162 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
163 EIGEN_UNUSED
plog2_float(const Packet _x)164 Packet plog2_float(const Packet _x)
165 {
166   return plog_impl_float<Packet, /* base2 */ true>(_x);
167 }
168 
169 /* Returns the base e (2.718...) or base 2 logarithm of x.
170  * The argument is separated into its exponent and fractional parts.
171  * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
172  * is approximated by
173  *
174  *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
175  *
176  * for more detail see: http://www.netlib.org/cephes/
177  */
178 template <typename Packet, bool base2>
179 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
180 EIGEN_UNUSED
plog_impl_double(const Packet _x)181 Packet plog_impl_double(const Packet _x)
182 {
183   Packet x = _x;
184 
185   const Packet cst_1              = pset1<Packet>(1.0);
186   const Packet cst_neg_half       = pset1<Packet>(-0.5);
187   // The smallest non denormalized double.
188   const Packet cst_min_norm_pos   = pset1frombits<Packet>( static_cast<uint64_t>(0x0010000000000000ull));
189   const Packet cst_minus_inf      = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
190   const Packet cst_pos_inf        = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
191 
192 
193  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
194  //                             1/sqrt(2) <= x < sqrt(2)
195   const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
196   const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
197   const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
198   const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
199   const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
200   const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
201   const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
202 
203   const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
204   const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
205   const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
206   const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
207   const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
208   const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
209 
210   // Truncate input values to the minimum positive normal.
211   x = pmax(x, cst_min_norm_pos);
212 
213   Packet e;
214   // extract significant in the range [0.5,1) and exponent
215   x = pfrexp(x,e);
216 
217   // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
218   // and shift by -1. The values are then centered around 0, which improves
219   // the stability of the polynomial evaluation.
220   //   if( x < SQRTHF ) {
221   //     e -= 1;
222   //     x = x + x - 1.0;
223   //   } else { x = x - 1.0; }
224   Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
225   Packet tmp = pand(x, mask);
226   x = psub(x, cst_1);
227   e = psub(e, pand(cst_1, mask));
228   x = padd(x, tmp);
229 
230   Packet x2 = pmul(x, x);
231   Packet x3 = pmul(x2, x);
232 
233   // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
234   // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
235   Packet y, y1, y_;
236   y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
237   y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
238   y  = pmadd(y, x, cst_cephes_log_p2);
239   y1 = pmadd(y1, x, cst_cephes_log_p5);
240   y_ = pmadd(y, x3, y1);
241 
242   y  = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
243   y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
244   y  = pmadd(y, x, cst_cephes_log_q2);
245   y1 = pmadd(y1, x, cst_cephes_log_q5);
246   y  = pmadd(y, x3, y1);
247 
248   y_ = pmul(y_, x3);
249   y  = pdiv(y_, y);
250 
251   y = pmadd(cst_neg_half, x2, y);
252   x = padd(x, y);
253 
254   // Add the logarithm of the exponent back to the result of the interpolation.
255   if (base2) {
256     const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
257     x = pmadd(x, cst_log2e, e);
258   } else {
259     const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
260     x = pmadd(e, cst_ln2, x);
261   }
262 
263   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
264   Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
265   Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
266   // Filter out invalid inputs, i.e.:
267   //  - negative arg will be NAN
268   //  - 0 will be -INF
269   //  - +INF will be +INF
270   return pselect(iszero_mask, cst_minus_inf,
271                               por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
272 }
273 
274 template <typename Packet>
275 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
276 EIGEN_UNUSED
plog_double(const Packet _x)277 Packet plog_double(const Packet _x)
278 {
279   return plog_impl_double<Packet, /* base2 */ false>(_x);
280 }
281 
282 template <typename Packet>
283 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
284 EIGEN_UNUSED
plog2_double(const Packet _x)285 Packet plog2_double(const Packet _x)
286 {
287   return plog_impl_double<Packet, /* base2 */ true>(_x);
288 }
289 
290 /** \internal \returns log(1 + x) computed using W. Kahan's formula.
291     See: http://www.plunk.org/~hatch/rightway.php
292  */
293 template<typename Packet>
generic_plog1p(const Packet & x)294 Packet generic_plog1p(const Packet& x)
295 {
296   typedef typename unpacket_traits<Packet>::type ScalarType;
297   const Packet one = pset1<Packet>(ScalarType(1));
298   Packet xp1 = padd(x, one);
299   Packet small_mask = pcmp_eq(xp1, one);
300   Packet log1 = plog(xp1);
301   Packet inf_mask = pcmp_eq(xp1, log1);
302   Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
303   return pselect(por(small_mask, inf_mask), x, log_large);
304 }
305 
306 /** \internal \returns exp(x)-1 computed using W. Kahan's formula.
307     See: http://www.plunk.org/~hatch/rightway.php
308  */
309 template<typename Packet>
generic_expm1(const Packet & x)310 Packet generic_expm1(const Packet& x)
311 {
312   typedef typename unpacket_traits<Packet>::type ScalarType;
313   const Packet one = pset1<Packet>(ScalarType(1));
314   const Packet neg_one = pset1<Packet>(ScalarType(-1));
315   Packet u = pexp(x);
316   Packet one_mask = pcmp_eq(u, one);
317   Packet u_minus_one = psub(u, one);
318   Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
319   Packet logu = plog(u);
320   // The following comparison is to catch the case where
321   // exp(x) = +inf. It is written in this way to avoid having
322   // to form the constant +inf, which depends on the packet
323   // type.
324   Packet pos_inf_mask = pcmp_eq(logu, u);
325   Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
326   expm1 = pselect(pos_inf_mask, u, expm1);
327   return pselect(one_mask,
328                  x,
329                  pselect(neg_one_mask,
330                          neg_one,
331                          expm1));
332 }
333 
334 
335 // Exponential function. Works by writing "x = m*log(2) + r" where
336 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
337 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
338 template <typename Packet>
339 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
340 EIGEN_UNUSED
pexp_float(const Packet _x)341 Packet pexp_float(const Packet _x)
342 {
343   const Packet cst_1      = pset1<Packet>(1.0f);
344   const Packet cst_half   = pset1<Packet>(0.5f);
345   const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
346   const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
347 
348   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
349   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
350   const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
351   const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
352   const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
353   const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
354   const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
355 
356   // Clamp x.
357   Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
358 
359   // Express exp(x) as exp(m*ln(2) + r), start by extracting
360   // m = floor(x/ln(2) + 0.5).
361   Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
362 
363   // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
364   // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
365   // truncation errors.
366   Packet r;
367 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
368   const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
369   r = pmadd(m, cst_nln2, x);
370 #else
371   const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
372   const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
373   r = psub(x, pmul(m, cst_cephes_exp_C1));
374   r = psub(r, pmul(m, cst_cephes_exp_C2));
375 #endif
376 
377   Packet r2 = pmul(r, r);
378   Packet r3 = pmul(r2, r);
379 
380   // Evaluate the polynomial approximant,improved by instruction-level parallelism.
381   Packet y, y1, y2;
382   y  = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1);
383   y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4);
384   y2 = padd(r, cst_1);
385   y  = pmadd(y, r, cst_cephes_exp_p2);
386   y1 = pmadd(y1, r, cst_cephes_exp_p5);
387   y  = pmadd(y, r3, y1);
388   y  = pmadd(y, r2, y2);
389 
390   // Return 2^m * exp(r).
391   return pmax(pldexp(y,m), _x);
392 }
393 
394 template <typename Packet>
395 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
396 EIGEN_UNUSED
pexp_double(const Packet _x)397 Packet pexp_double(const Packet _x)
398 {
399   Packet x = _x;
400 
401   const Packet cst_1 = pset1<Packet>(1.0);
402   const Packet cst_2 = pset1<Packet>(2.0);
403   const Packet cst_half = pset1<Packet>(0.5);
404 
405   const Packet cst_exp_hi = pset1<Packet>(709.437);
406   const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
407 
408   const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
409   const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
410   const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
411   const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
412   const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
413   const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
414   const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
415   const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
416   const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
417   const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
418 
419   Packet tmp, fx;
420 
421   // clamp x
422   x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
423   // Express exp(x) as exp(g + n*log(2)).
424   fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
425 
426   // Get the integer modulus of log(2), i.e. the "n" described above.
427   fx = pfloor(fx);
428 
429   // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
430   // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
431   // digits right.
432   tmp = pmul(fx, cst_cephes_exp_C1);
433   Packet z = pmul(fx, cst_cephes_exp_C2);
434   x = psub(x, tmp);
435   x = psub(x, z);
436 
437   Packet x2 = pmul(x, x);
438 
439   // Evaluate the numerator polynomial of the rational interpolant.
440   Packet px = cst_cephes_exp_p0;
441   px = pmadd(px, x2, cst_cephes_exp_p1);
442   px = pmadd(px, x2, cst_cephes_exp_p2);
443   px = pmul(px, x);
444 
445   // Evaluate the denominator polynomial of the rational interpolant.
446   Packet qx = cst_cephes_exp_q0;
447   qx = pmadd(qx, x2, cst_cephes_exp_q1);
448   qx = pmadd(qx, x2, cst_cephes_exp_q2);
449   qx = pmadd(qx, x2, cst_cephes_exp_q3);
450 
451   // I don't really get this bit, copied from the SSE2 routines, so...
452   // TODO(gonnet): Figure out what is going on here, perhaps find a better
453   // rational interpolant?
454   x = pdiv(px, psub(qx, px));
455   x = pmadd(cst_2, x, cst_1);
456 
457   // Construct the result 2^n * exp(g) = e * x. The max is used to catch
458   // non-finite values in the input.
459   return pmax(pldexp(x,fx), _x);
460 }
461 
462 // The following code is inspired by the following stack-overflow answer:
463 //   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
464 // It has been largely optimized:
465 //  - By-pass calls to frexp.
466 //  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
467 //    (1) balancing the mantissa and exponent to the required bits of 2/pi are
468 //    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
469 //  - Avoid a branch in rounding and extraction of the remaining fractional part.
470 // Overall, I measured a speed up higher than x2 on x86-64.
trig_reduce_huge(float xf,int * quadrant)471 inline float trig_reduce_huge (float xf, int *quadrant)
472 {
473   using Eigen::numext::int32_t;
474   using Eigen::numext::uint32_t;
475   using Eigen::numext::int64_t;
476   using Eigen::numext::uint64_t;
477 
478   const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
479   const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
480 
481   // 192 bits of 2/pi for Payne-Hanek reduction
482   // Bits are introduced by packet of 8 to enable aligned reads.
483   static const uint32_t two_over_pi [] =
484   {
485     0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
486     0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
487     0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
488     0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
489     0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
490     0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
491     0x10e41000, 0xe4100000
492   };
493 
494   uint32_t xi = numext::bit_cast<uint32_t>(xf);
495   // Below, -118 = -126 + 8.
496   //   -126 is to get the exponent,
497   //   +8 is to enable alignment of 2/pi's bits on 8 bits.
498   // This is possible because the fractional part of x as only 24 meaningful bits.
499   uint32_t e = (xi >> 23) - 118;
500   // Extract the mantissa and shift it to align it wrt the exponent
501   xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
502 
503   uint32_t i = e >> 3;
504   uint32_t twoopi_1  = two_over_pi[i-1];
505   uint32_t twoopi_2  = two_over_pi[i+3];
506   uint32_t twoopi_3  = two_over_pi[i+7];
507 
508   // Compute x * 2/pi in 2.62-bit fixed-point format.
509   uint64_t p;
510   p = uint64_t(xi) * twoopi_3;
511   p = uint64_t(xi) * twoopi_2 + (p >> 32);
512   p = (uint64_t(xi * twoopi_1) << 32) + p;
513 
514   // Round to nearest: add 0.5 and extract integral part.
515   uint64_t q = (p + zero_dot_five) >> 62;
516   *quadrant = int(q);
517   // Now it remains to compute "r = x - q*pi/2" with high accuracy,
518   // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
519   //   r = (p-q)*pi/2,
520   // where the product can be be carried out with sufficient accuracy using double precision.
521   p -= q<<62;
522   return float(double(int64_t(p)) * pio2_62);
523 }
524 
525 template<bool ComputeSine,typename Packet>
526 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
527 EIGEN_UNUSED
528 #if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
529 __attribute__((optimize("-fno-unsafe-math-optimizations")))
530 #endif
psincos_float(const Packet & _x)531 Packet psincos_float(const Packet& _x)
532 {
533 // Workaround -ffast-math aggressive optimizations
534 // See bug 1674
535 #if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
536 #define EIGEN_SINCOS_DONT_OPT(X) __asm__  ("" : "+x" (X));
537 #else
538 #define EIGEN_SINCOS_DONT_OPT(X)
539 #endif
540 
541   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
542 
543   const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
544   const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
545   const PacketI csti_1              = pset1<PacketI>(1);
546   const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);
547 
548   Packet x = pabs(_x);
549 
550   // Scale x by 2/Pi to find x's octant.
551   Packet y = pmul(x, cst_2oPI);
552 
553   // Rounding trick:
554   Packet y_round = padd(y, cst_rounding_magic);
555   EIGEN_SINCOS_DONT_OPT(y_round)
556   PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
557   y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
558 
559   // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
560   // using "Extended precision modular arithmetic"
561   #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
562   // This version requires true FMA for high accuracy
563   // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
564   const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
565   x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
566   x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
567   x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
568   #else
569   // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
570   // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
571   // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
572 
573   // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
574   // and 2 ULP up to:
575   const float huge_th = ComputeSine ? 25966.f : 18838.f;
576   x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
577   EIGEN_SINCOS_DONT_OPT(x)
578   x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
579   EIGEN_SINCOS_DONT_OPT(x)
580   x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
581   x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
582 
583   // For the record, the following set of coefficients maintain 2ULP up
584   // to a slightly larger range:
585   // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
586   // but it slightly fails to maintain 1ULP for two values of sin below pi.
587   // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
588   // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
589   // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
590   // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
591 
592   // For the record, with only 3 iterations it is possible to maintain
593   // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
594   // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
595   #endif
596 
597   if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
598   {
599     const int PacketSize = unpacket_traits<Packet>::size;
600     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
601     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
602     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
603     pstoreu(vals, pabs(_x));
604     pstoreu(x_cpy, x);
605     pstoreu(y_int2, y_int);
606     for(int k=0; k<PacketSize;++k)
607     {
608       float val = vals[k];
609       if(val>=huge_th && (numext::isfinite)(val))
610         x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
611     }
612     x = ploadu<Packet>(x_cpy);
613     y_int = ploadu<PacketI>(y_int2);
614   }
615 
616   // Compute the sign to apply to the polynomial.
617   // sin: sign = second_bit(y_int) xor signbit(_x)
618   // cos: sign = second_bit(y_int+1)
619   Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
620                                 : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
621   sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
622 
623   // Get the polynomial selection mask from the second bit of y_int
624   // We'll calculate both (sin and cos) polynomials and then select from the two.
625   Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
626 
627   Packet x2 = pmul(x,x);
628 
629   // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
630   Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
631   y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
632   y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
633   y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
634   y1 = pmadd(y1, x2, pset1<Packet>(1.f));
635 
636   // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
637   // octave/matlab code to compute those coefficients:
638   //    x = (0:0.0001:pi/4)';
639   //    A = [x.^3 x.^5 x.^7];
640   //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
641   //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
642   //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
643   //
644   Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
645   y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
646   y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
647   y2 = pmul(y2, x2);
648   y2 = pmadd(y2, x, x);
649 
650   // Select the correct result from the two polynomials.
651   y = ComputeSine ? pselect(poly_mask,y2,y1)
652                   : pselect(poly_mask,y1,y2);
653 
654   // Update the sign and filter huge inputs
655   return pxor(y, sign_bit);
656 
657 #undef EIGEN_SINCOS_DONT_OPT
658 }
659 
660 template<typename Packet>
661 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
662 EIGEN_UNUSED
psin_float(const Packet & x)663 Packet psin_float(const Packet& x)
664 {
665   return psincos_float<true>(x);
666 }
667 
668 template<typename Packet>
669 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
670 EIGEN_UNUSED
pcos_float(const Packet & x)671 Packet pcos_float(const Packet& x)
672 {
673   return psincos_float<false>(x);
674 }
675 
676 
677 template<typename Packet>
678 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
679 EIGEN_UNUSED
psqrt_complex(const Packet & a)680 Packet psqrt_complex(const Packet& a) {
681   typedef typename unpacket_traits<Packet>::type Scalar;
682   typedef typename Scalar::value_type RealScalar;
683   typedef typename unpacket_traits<Packet>::as_real RealPacket;
684 
685   // Computes the principal sqrt of the complex numbers in the input.
686   //
687   // For example, for packets containing 2 complex numbers stored in interleaved format
688   //    a = [a0, a1] = [x0, y0, x1, y1],
689   // where x0 = real(a0), y0 = imag(a0) etc., this function returns
690   //    b = [b0, b1] = [u0, v0, u1, v1],
691   // such that b0^2 = a0, b1^2 = a1.
692   //
693   // To derive the formula for the complex square roots, let's consider the equation for
694   // a single complex square root of the number x + i*y. We want to find real numbers
695   // u and v such that
696   //    (u + i*v)^2 = x + i*y  <=>
697   //    u^2 - v^2 + i*2*u*v = x + i*v.
698   // By equating the real and imaginary parts we get:
699   //    u^2 - v^2 = x
700   //    2*u*v = y.
701   //
702   // For x >= 0, this has the numerically stable solution
703   //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
704   //    v = 0.5 * (y / u)
705   // and for x < 0,
706   //    v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
707   //    u = |0.5 * (y / v)|
708   //
709   //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
710   //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
711 
712   // In the following, without lack of generality, we have annotated the code, assuming
713   // that the input is a packet of 2 complex numbers.
714   //
715   // Step 1. Compute l = [l0, l0, l1, l1], where
716   //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
717   // To avoid over- and underflow, we use the stable formula for each hypotenuse
718   //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
719   // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
720 
721   Packet a_flip = pcplxflip(a);
722   RealPacket a_abs = pabs(a.v);           // [|x0|, |y0|, |x1|, |y1|]
723   RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|]
724   RealPacket a_max = pmax(a_abs, a_abs_flip);
725   RealPacket a_min = pmin(a_abs, a_abs_flip);
726   RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
727   RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
728   RealPacket r = pdiv(a_min, a_max);
729   const RealPacket cst_one  = pset1<RealPacket>(RealScalar(1));
730   RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
731   // Set l to a_max if a_min is zero.
732   l = pselect(a_min_zero_mask, a_max, l);
733 
734   // Step 2. Compute [rho0, *, rho1, *], where
735   // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
736   // We don't care about the imaginary parts computed here. They will be overwritten later.
737   const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
738   Packet rho;
739   rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
740 
741   // Step 3. Compute [rho0, eta0, rho1, eta1], where
742   // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
743   // set eta = 0 of input is 0 + i0.
744   RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
745   RealPacket real_mask = peven_mask(a.v);
746   Packet positive_real_result;
747   // Compute result for inputs with positive real part.
748   positive_real_result.v = pselect(real_mask, rho.v, eta);
749 
750   // Step 4. Compute solution for inputs with negative real part:
751   //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
752   const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
753   RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
754   Packet negative_real_result;
755   // Notice that rho is positive, so taking it's absolute value is a noop.
756   negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
757 
758   // Step 5. Select solution branch based on the sign of the real parts.
759   Packet negative_real_mask;
760   negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
761   negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
762   Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
763 
764   // Step 6. Handle special cases for infinities:
765   // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
766   // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
767   // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
768   // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
769   const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
770   Packet is_inf;
771   is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
772   Packet is_real_inf;
773   is_real_inf.v = pand(is_inf.v, real_mask);
774   is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
775   // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
776   Packet real_inf_result;
777   real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
778   real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
779   // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
780   Packet is_imag_inf;
781   is_imag_inf.v = pandnot(is_inf.v, real_mask);
782   is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
783   Packet imag_inf_result;
784   imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
785 
786   return  pselect(is_imag_inf, imag_inf_result,
787                   pselect(is_real_inf, real_inf_result,result));
788 }
789 
790 /* polevl (modified for Eigen)
791  *
792  *      Evaluate polynomial
793  *
794  *
795  *
796  * SYNOPSIS:
797  *
798  * int N;
799  * Scalar x, y, coef[N+1];
800  *
801  * y = polevl<decltype(x), N>( x, coef);
802  *
803  *
804  *
805  * DESCRIPTION:
806  *
807  * Evaluates polynomial of degree N:
808  *
809  *                     2          N
810  * y  =  C  + C x + C x  +...+ C x
811  *        0    1     2          N
812  *
813  * Coefficients are stored in reverse order:
814  *
815  * coef[0] = C  , ..., coef[N] = C  .
816  *            N                   0
817  *
818  *  The function p1evl() assumes that coef[N] = 1.0 and is
819  * omitted from the array.  Its calling arguments are
820  * otherwise the same as polevl().
821  *
822  *
823  * The Eigen implementation is templatized.  For best speed, store
824  * coef as a const array (constexpr), e.g.
825  *
826  * const double coef[] = {1.0, 2.0, 3.0, ...};
827  *
828  */
829 template <typename Packet, int N>
830 struct ppolevl {
runppolevl831   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
832     EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
833     return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
834   }
835 };
836 
837 template <typename Packet>
838 struct ppolevl<Packet, 0> {
839   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
840     EIGEN_UNUSED_VARIABLE(x);
841     return pset1<Packet>(coeff[0]);
842   }
843 };
844 
845 /* chbevl (modified for Eigen)
846  *
847  *     Evaluate Chebyshev series
848  *
849  *
850  *
851  * SYNOPSIS:
852  *
853  * int N;
854  * Scalar x, y, coef[N], chebevl();
855  *
856  * y = chbevl( x, coef, N );
857  *
858  *
859  *
860  * DESCRIPTION:
861  *
862  * Evaluates the series
863  *
864  *        N-1
865  *         - '
866  *  y  =   >   coef[i] T (x/2)
867  *         -            i
868  *        i=0
869  *
870  * of Chebyshev polynomials Ti at argument x/2.
871  *
872  * Coefficients are stored in reverse order, i.e. the zero
873  * order term is last in the array.  Note N is the number of
874  * coefficients, not the order.
875  *
876  * If coefficients are for the interval a to b, x must
877  * have been transformed to x -> 2(2x - b - a)/(b-a) before
878  * entering the routine.  This maps x from (a, b) to (-1, 1),
879  * over which the Chebyshev polynomials are defined.
880  *
881  * If the coefficients are for the inverted interval, in
882  * which (a, b) is mapped to (1/b, 1/a), the transformation
883  * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
884  * this becomes x -> 4a/x - 1.
885  *
886  *
887  *
888  * SPEED:
889  *
890  * Taking advantage of the recurrence properties of the
891  * Chebyshev polynomials, the routine requires one more
892  * addition per loop than evaluating a nested polynomial of
893  * the same degree.
894  *
895  */
896 
897 template <typename Packet, int N>
898 struct pchebevl {
899   EIGEN_DEVICE_FUNC
900   static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
901     typedef typename unpacket_traits<Packet>::type Scalar;
902     Packet b0 = pset1<Packet>(coef[0]);
903     Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
904     Packet b2;
905 
906     for (int i = 1; i < N; i++) {
907       b2 = b1;
908       b1 = b0;
909       b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
910     }
911 
912     return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
913   }
914 };
915 
916 } // end namespace internal
917 } // end namespace Eigen
918 
919 #endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
920