1 /****************************  vectormath_exp.h   ******************************
2 * Author:        Agner Fog
3 * Date created:  2014-04-18
4 * Last modified: 2016-12-26
5 * Version:       1.26
6 * Project:       vector classes
7 * Description:
8 * Header file containing inline vector functions of logarithms, exponential
9 * and power functions:
10 * exp         exponential function
11 * exp2        exponential function base 2
12 * exp10       exponential function base 10
13 * exmp1       exponential function minus 1
14 * log         natural logarithm
15 * log2        logarithm base 2
16 * log10       logarithm base 10
17 * log1p       natural logarithm of 1+x
18 * cbrt        cube root
19 * pow         raise vector elements to power
20 * pow_ratio   raise vector elements to rational power
21 *
22 * Theory, methods and inspiration based partially on these sources:
23 * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
24 *   Ellis Horwood, 1989.
25 * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
26 *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
27 * > Cephes math library by Stephen L. Moshier 1992,
28 *   http://www.netlib.org/cephes/
29 *
30 * For detailed instructions, see vectormath_common.h and VectorClass.pdf
31 *
32 * (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses
33 ******************************************************************************/
34 
35 #ifndef VECTORMATH_EXP_H
36 #define VECTORMATH_EXP_H  1
37 
38 #include "vectormath_common.h"
39 
40 #ifdef VCL_NAMESPACE
41 namespace VCL_NAMESPACE {
42 #endif
43 
44 /******************************************************************************
45 *                 Exponential functions
46 ******************************************************************************/
47 
48 // Helper functions, used internally:
49 
50 // This function calculates pow(2,n) where n must be an integer. Does not check for overflow or underflow
vm_pow2n(Vec2d const & n)51 static inline Vec2d vm_pow2n (Vec2d const & n) {
52     const double pow2_52 = 4503599627370496.0;   // 2^52
53     const double bias = 1023.0;                  // bias in exponent
54     Vec2d a = n + (bias + pow2_52);              // put n + bias in least significant bits
55     Vec2q b = reinterpret_i(a);                  // bit-cast to integer
56     Vec2q c = b << 52;                           // shift left 52 places to get into exponent field
57     Vec2d d = reinterpret_d(c);                  // bit-cast back to double
58     return d;
59 }
60 
vm_pow2n(Vec4f const & n)61 static inline Vec4f vm_pow2n (Vec4f const & n) {
62     const float pow2_23 =  8388608.0;            // 2^23
63     const float bias = 127.0;                    // bias in exponent
64     Vec4f a = n + (bias + pow2_23);              // put n + bias in least significant bits
65     Vec4i b = reinterpret_i(a);                  // bit-cast to integer
66     Vec4i c = b << 23;                           // shift left 23 places to get into exponent field
67     Vec4f d = reinterpret_f(c);                  // bit-cast back to float
68     return d;
69 }
70 
71 #if MAX_VECTOR_SIZE >= 256
72 
vm_pow2n(Vec4d const & n)73 static inline Vec4d vm_pow2n (Vec4d const & n) {
74     const double pow2_52 = 4503599627370496.0;   // 2^52
75     const double bias = 1023.0;                  // bias in exponent
76     Vec4d a = n + (bias + pow2_52);              // put n + bias in least significant bits
77     Vec4q b = reinterpret_i(a);                  // bit-cast to integer
78     Vec4q c = b << 52;                           // shift left 52 places to get value into exponent field
79     Vec4d d = reinterpret_d(c);                  // bit-cast back to double
80     return d;
81 }
82 
vm_pow2n(Vec8f const & n)83 static inline Vec8f vm_pow2n (Vec8f const & n) {
84     const float pow2_23 =  8388608.0;            // 2^23
85     const float bias = 127.0;                    // bias in exponent
86     Vec8f a = n + (bias + pow2_23);              // put n + bias in least significant bits
87     Vec8i b = reinterpret_i(a);                  // bit-cast to integer
88     Vec8i c = b << 23;                           // shift left 23 places to get into exponent field
89     Vec8f d = reinterpret_f(c);                  // bit-cast back to float
90     return d;
91 }
92 
93 #endif // MAX_VECTOR_SIZE >= 256
94 
95 #if MAX_VECTOR_SIZE >= 512
96 
vm_pow2n(Vec8d const & n)97 static inline Vec8d vm_pow2n (Vec8d const & n) {
98 #ifdef __AVX512ER__
99     return _mm512_exp2a23_round_pd(n, _MM_FROUND_NO_EXC); // this is exact only for integral n
100 #else
101     const double pow2_52 = 4503599627370496.0;   // 2^52
102     const double bias = 1023.0;                  // bias in exponent
103     Vec8d a = n + (bias + pow2_52);              // put n + bias in least significant bits
104     Vec8q b = Vec8q(reinterpret_i(a));           // bit-cast to integer
105     Vec8q c = b << 52;                           // shift left 52 places to get value into exponent field
106     Vec8d d = Vec8d(reinterpret_d(c));           // bit-cast back to double
107     return d;
108 #endif
109 }
110 
vm_pow2n(Vec16f const & n)111 static inline Vec16f vm_pow2n (Vec16f const & n) {
112 #ifdef __AVX512ER__
113     return _mm512_exp2a23_round_ps(n, _MM_FROUND_NO_EXC);
114 #else
115     const float pow2_23 =  8388608.0;            // 2^23
116     const float bias = 127.0;                    // bias in exponent
117     Vec16f a = n + (bias + pow2_23);             // put n + bias in least significant bits
118     Vec16i b = Vec16i(reinterpret_i(a));         // bit-cast to integer
119     Vec16i c = b << 23;                          // shift left 23 places to get into exponent field
120     Vec16f d = Vec16f(reinterpret_f(c));         // bit-cast back to float
121     return d;
122 #endif
123 }
124 
125 #endif // MAX_VECTOR_SIZE >= 512
126 
127 
128 // Template for exp function, double precision
129 // The limit of abs(x) is defined by max_x below
130 // This function does not produce denormals
131 // Template parameters:
132 // VTYPE:  double vector type
133 // BVTYPE: boolean vector type
134 // M1: 0 for exp, 1 for expm1
135 // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
136 
137 #if 1  // choose method
138 
139 // Taylor expansion
140 template<class VTYPE, class BVTYPE, int M1, int BA>
exp_d(VTYPE const & initial_x)141 static inline VTYPE exp_d(VTYPE const & initial_x) {
142 
143     // Taylor coefficients, 1/n!
144     // Not using minimax approximation because we prioritize precision close to x = 0
145     const double p2  = 1./2.;
146     const double p3  = 1./6.;
147     const double p4  = 1./24.;
148     const double p5  = 1./120.;
149     const double p6  = 1./720.;
150     const double p7  = 1./5040.;
151     const double p8  = 1./40320.;
152     const double p9  = 1./362880.;
153     const double p10 = 1./3628800.;
154     const double p11 = 1./39916800.;
155     const double p12 = 1./479001600.;
156     const double p13 = 1./6227020800.;
157 
158     // maximum abs(x), value depends on BA, defined below
159     // The lower limit of x is slightly more restrictive than the upper limit.
160     // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
161     double max_x;
162 
163     // data vectors
164     VTYPE  x, r, z, n2;
165     BVTYPE inrange;                              // boolean vector
166 
167     if (BA <= 1) { // exp(x)
168         max_x = BA == 0 ? 708.39 : 709.7; // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in hyperbolic functions
169         const double ln2d_hi = 0.693145751953125;
170         const double ln2d_lo = 1.42860682030941723212E-6;
171         x  = initial_x;
172         r  = round(initial_x*VM_LOG2E);
173         // subtraction in two steps for higher precision
174         x = nmul_add(r, ln2d_hi, x);             //  x -= r * ln2d_hi;
175         x = nmul_add(r, ln2d_lo, x);             //  x -= r * ln2d_lo;
176     }
177     else if (BA == 2) { // pow(2,x)
178         max_x = 1022.0;
179         r  = round(initial_x);
180         x  = initial_x - r;
181         x *= VM_LN2;
182     }
183     else if (BA == 10) { // pow(10,x)
184         max_x = 307.65;
185         const double log10_2_hi = 0.30102999554947019;     // log10(2) in two parts
186         const double log10_2_lo = 1.1451100899212592E-10;
187         x  = initial_x;
188         r  = round(initial_x*(VM_LOG2E*VM_LN10));
189         // subtraction in two steps for higher precision
190         x  = nmul_add(r, log10_2_hi, x);         //  x -= r * log10_2_hi;
191         x  = nmul_add(r, log10_2_lo, x);         //  x -= r * log10_2_lo;
192         x *= VM_LN10;
193     }
194     else  {  // undefined value of BA
195         return 0.;
196     }
197 
198     z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
199 
200     if (BA == 1) r--;  // 0.5 * exp(x)
201 
202     // multiply by power of 2
203     n2 = vm_pow2n(r);
204 
205     if (M1 == 0) {
206         // exp
207         z = (z + 1.0) * n2;
208     }
209     else {
210         // expm1
211         z = mul_add(z, n2, n2 - 1.0);            // z = z * n2 + (n2 - 1.0);
212     }
213 
214     // check for overflow
215     inrange  = abs(initial_x) < max_x;
216     // check for INF and NAN
217     inrange &= is_finite(initial_x);
218 
219     if (horizontal_and(inrange)) {
220         // fast normal path
221         return z;
222     }
223     else {
224         // overflow, underflow and NAN
225         r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // value in case of +/- overflow or INF
226         z = select(inrange, z, r);                                     // +/- underflow
227         z = select(is_nan(initial_x), initial_x, z);                   // NAN goes through
228         return z;
229     }
230 }
231 
232 #else
233 
234 // Pade expansion uses less code and fewer registers, but is slower
235 template<class VTYPE, class BVTYPE, int M1, int BA>
exp_d(VTYPE const & initial_x)236 static inline VTYPE exp_d(VTYPE const & initial_x) {
237 
238     // define constants
239     const double ln2p1   = 0.693145751953125;
240     const double ln2p2   = 1.42860682030941723212E-6;
241     const double log2e   = VM_LOG2E;
242     const double max_exp = 708.39;
243     // coefficients of pade polynomials
244     const double P0exp = 9.99999999999999999910E-1;
245     const double P1exp = 3.02994407707441961300E-2;
246     const double P2exp = 1.26177193074810590878E-4;
247     const double Q0exp = 2.00000000000000000009E0;
248     const double Q1exp = 2.27265548208155028766E-1;
249     const double Q2exp = 2.52448340349684104192E-3;
250     const double Q3exp = 3.00198505138664455042E-6;
251 
252     VTYPE  x, r, xx, px, qx, y, n2;              // data vectors
253     BVTYPE inrange;                              // boolean vector
254 
255     x = initial_x;
256     r = round(initial_x*log2e);
257 
258     // subtraction in one step would gives loss of precision
259     x -= r * ln2p1;
260     x -= r * ln2p2;
261 
262     xx = x * x;
263 
264     // px = x * P(x^2).
265     px = polynomial_2(xx, P0exp, P1exp, P2exp) * x;
266 
267     // Evaluate Q(x^2).
268     qx = polynomial_3(xx, Q0exp, Q1exp, Q2exp, Q3exp);
269 
270     // e^x = 1 + 2*P(x^2)/( Q(x^2) - P(x^2) )
271     y = (2.0 * px) / (qx - px);
272 
273     // Get 2^n in double.
274     // n  = round_to_int64_limited(r);
275     // n2 = exp2(n);
276     n2 = vm_pow2n(r);  // this is faster
277 
278     if (M1 == 0) {
279         // exp
280         y = (y + 1.0) * n2;
281     }
282     else {
283         // expm1
284         y = y * n2 + (n2 - 1.0);
285     }
286 
287     // overflow
288     inrange  = abs(initial_x) < max_exp;
289     // check for INF and NAN
290     inrange &= is_finite(initial_x);
291 
292     if (horizontal_and(inrange)) {
293         // fast normal path
294         return y;
295     }
296     else {
297         // overflow, underflow and NAN
298         r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // value in case of overflow or INF
299         y = select(inrange, y, r);                                     // +/- overflow
300         y = select(is_nan(initial_x), initial_x, y);                   // NAN goes through
301         return y;
302     }
303 }
304 #endif
305 
306 // instances of exp_d template
exp(Vec2d const & x)307 static inline Vec2d exp(Vec2d const & x) {
308     return exp_d<Vec2d, Vec2db, 0, 0>(x);
309 }
310 
expm1(Vec2d const & x)311 static inline Vec2d expm1(Vec2d const & x) {
312     return exp_d<Vec2d, Vec2db, 1, 0>(x);
313 }
314 
exp2(Vec2d const & x)315 static inline Vec2d exp2(Vec2d const & x) {
316     return exp_d<Vec2d, Vec2db, 0, 2>(x);
317 }
318 
exp10(Vec2d const & x)319 static inline Vec2d exp10(Vec2d const & x) {
320     return exp_d<Vec2d, Vec2db, 0, 10>(x);
321 }
322 
323 #if MAX_VECTOR_SIZE >= 256
324 
exp(Vec4d const & x)325 static inline Vec4d exp(Vec4d const & x) {
326     return exp_d<Vec4d, Vec4db, 0, 0>(x);
327 }
328 
expm1(Vec4d const & x)329 static inline Vec4d expm1(Vec4d const & x) {
330     return exp_d<Vec4d, Vec4db, 1, 0>(x);
331 }
332 
exp2(Vec4d const & x)333 static inline Vec4d exp2(Vec4d const & x) {
334     return exp_d<Vec4d, Vec4db, 0, 2>(x);
335 }
336 
exp10(Vec4d const & x)337 static inline Vec4d exp10(Vec4d const & x) {
338     return exp_d<Vec4d, Vec4db, 0, 10>(x);
339 }
340 
341 #endif // MAX_VECTOR_SIZE >= 256
342 
343 #if MAX_VECTOR_SIZE >= 512
344 
exp(Vec8d const & x)345 static inline Vec8d exp(Vec8d const & x) {
346     return exp_d<Vec8d, Vec8db, 0, 0>(x);
347 }
348 
expm1(Vec8d const & x)349 static inline Vec8d expm1(Vec8d const & x) {
350     return exp_d<Vec8d, Vec8db, 1, 0>(x);
351 }
352 
exp2(Vec8d const & x)353 static inline Vec8d exp2(Vec8d const & x) {
354     return exp_d<Vec8d, Vec8db, 0, 2>(x);
355 }
356 
exp10(Vec8d const & x)357 static inline Vec8d exp10(Vec8d const & x) {
358     return exp_d<Vec8d, Vec8db, 0, 10>(x);
359 }
360 
361 #endif // MAX_VECTOR_SIZE >= 512
362 
363 // Template for exp function, single precision
364 // The limit of abs(x) is defined by max_x below
365 // This function does not produce denormals
366 // Template parameters:
367 // VTYPE:  float vector type
368 // BVTYPE: boolean vector type
369 // M1: 0 for exp, 1 for expm1
370 // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
371 
372 template<class VTYPE, class BVTYPE, int M1, int BA>
exp_f(VTYPE const & initial_x)373 static inline VTYPE exp_f(VTYPE const & initial_x) {
374 
375     // Taylor coefficients
376     const float P0expf   =  1.f/2.f;
377     const float P1expf   =  1.f/6.f;
378     const float P2expf   =  1.f/24.f;
379     const float P3expf   =  1.f/120.f;
380     const float P4expf   =  1.f/720.f;
381     const float P5expf   =  1.f/5040.f;
382 
383     VTYPE  x, r, x2, z, n2;                      // data vectors
384     BVTYPE inrange;                              // boolean vector
385 
386     // maximum abs(x), value depends on BA, defined below
387     // The lower limit of x is slightly more restrictive than the upper limit.
388     // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
389     float max_x;
390 
391     if (BA <= 1) { // exp(x)
392         const float ln2f_hi  =  0.693359375f;
393         const float ln2f_lo  = -2.12194440e-4f;
394         max_x = (BA == 0) ? 87.3f : 89.0f;
395 
396         x = initial_x;
397         r = round(initial_x*float(VM_LOG2E));
398         x = nmul_add(r, VTYPE(ln2f_hi), x);      //  x -= r * ln2f_hi;
399         x = nmul_add(r, VTYPE(ln2f_lo), x);      //  x -= r * ln2f_lo;
400     }
401     else if (BA == 2) {                          // pow(2,x)
402         max_x = 126.f;
403         r = round(initial_x);
404         x = initial_x - r;
405         x = x * (float)VM_LN2;
406     }
407     else if (BA == 10) {                         // pow(10,x)
408         max_x = 37.9f;
409         const float log10_2_hi = 0.301025391f;   // log10(2) in two parts
410         const float log10_2_lo = 4.60503907E-6f;
411         x = initial_x;
412         r = round(initial_x*float(VM_LOG2E*VM_LN10));
413         x = nmul_add(r, VTYPE(log10_2_hi), x);   //  x -= r * log10_2_hi;
414         x = nmul_add(r, VTYPE(log10_2_lo), x);   //  x -= r * log10_2_lo;
415         x = x * (float)VM_LN10;
416     }
417     else  {  // undefined value of BA
418         return 0.;
419     }
420 
421     x2 = x * x;
422     z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf);
423     z = mul_add(z, x2, x);                       // z *= x2;  z += x;
424 
425     if (BA == 1) r--;                            // 0.5 * exp(x)
426 
427     // multiply by power of 2
428     n2 = vm_pow2n(r);
429 
430     if (M1 == 0) {
431         // exp
432         z = (z + 1.0f) * n2;
433     }
434     else {
435         // expm1
436         z = mul_add(z, n2, n2 - 1.0f);           //  z = z * n2 + (n2 - 1.0f);
437     }
438 
439     // check for overflow
440     inrange  = abs(initial_x) < max_x;
441     // check for INF and NAN
442     inrange &= is_finite(initial_x);
443 
444     if (horizontal_and(inrange)) {
445         // fast normal path
446         return z;
447     }
448     else {
449         // overflow, underflow and NAN
450         r = select(sign_bit(initial_x), 0.f-M1, infinite_vec<VTYPE>()); // value in case of +/- overflow or INF
451         z = select(inrange, z, r);                                      // +/- underflow
452         z = select(is_nan(initial_x), initial_x, z);                    // NAN goes through
453         return z;
454     }
455 }
456 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512
457 // forward declarations of fast 512 bit versions
458 static Vec16f exp(Vec16f const & x);
459 static Vec16f exp2(Vec16f const & x);
460 static Vec16f exp10(Vec16f const & x);
461 #endif
462 
463 // instances of exp_f template
exp(Vec4f const & x)464 static inline Vec4f exp(Vec4f const & x) {
465 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
466     return _mm512_castps512_ps128(exp(Vec16f(_mm512_castps128_ps512(x))));
467 #else
468     return exp_f<Vec4f, Vec4fb, 0, 0>(x);
469 #endif
470 }
471 
expm1(Vec4f const & x)472 static inline Vec4f expm1(Vec4f const & x) {
473     return exp_f<Vec4f, Vec4fb, 1, 0>(x);
474 }
475 
exp2(Vec4f const & x)476 static inline Vec4f exp2(Vec4f const & x) {
477 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
478     return _mm512_castps512_ps128(exp2(Vec16f(_mm512_castps128_ps512(x))));
479 #else
480     return exp_f<Vec4f, Vec4fb, 0, 2>(x);
481 #endif
482 }
483 
exp10(Vec4f const & x)484 static inline Vec4f exp10(Vec4f const & x) {
485 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
486     return _mm512_castps512_ps128(exp10(Vec16f(_mm512_castps128_ps512(x))));
487 #else
488     return exp_f<Vec4f, Vec4fb, 0, 10>(x);
489 #endif
490 }
491 
492 #if MAX_VECTOR_SIZE >= 256
493 
exp(Vec8f const & x)494 static inline Vec8f exp(Vec8f const & x) {
495 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
496     return _mm512_castps512_ps256(exp(Vec16f(_mm512_castps256_ps512(x))));
497 #else
498     return exp_f<Vec8f, Vec8fb, 0, 0>(x);
499 #endif
500 }
501 
expm1(Vec8f const & x)502 static inline Vec8f expm1(Vec8f const & x) {
503     return exp_f<Vec8f, Vec8fb, 1, 0>(x);
504 }
505 
exp2(Vec8f const & x)506 static inline Vec8f exp2(Vec8f const & x) {
507 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
508     return _mm512_castps512_ps256(exp2(Vec16f(_mm512_castps256_ps512(x))));
509 #else
510     return exp_f<Vec8f, Vec8fb, 0, 2>(x);
511 #endif
512 }
513 
exp10(Vec8f const & x)514 static inline Vec8f exp10(Vec8f const & x) {
515 #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version
516     return _mm512_castps512_ps256(exp10(Vec16f(_mm512_castps256_ps512(x))));
517 #else
518     return exp_f<Vec8f, Vec8fb, 0, 10>(x);
519 #endif
520 }
521 
522 #endif // MAX_VECTOR_SIZE >= 256
523 
524 #if MAX_VECTOR_SIZE >= 512
525 
exp(Vec16f const & x)526 static inline Vec16f exp(Vec16f const & x) {
527 #ifdef __AVX512ER__  // AVX512ER instruction set includes fast exponential function
528 #ifdef VCL_FASTEXP
529     // very fast, but less precise for large x:
530     return _mm512_exp2a23_round_ps(x*float(VM_LOG2E), _MM_FROUND_NO_EXC);
531 #else
532     // best precision, also for large x:
533     const Vec16f log2e = float(VM_LOG2E);
534     const float ln2f_hi = 0.693359375f;
535     const float ln2f_lo = -2.12194440e-4f;
536     Vec16f x1 = x, r, y;
537     r = round(x1*log2e);
538     x1 = nmul_add(r, Vec16f(ln2f_hi), x1);      //  x -= r * ln2f_hi;
539     x1 = nmul_add(r, Vec16f(ln2f_lo), x1);      //  x -= r * ln2f_lo;
540     x1 = x1 * log2e;
541     y = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC);
542     // y = vm_pow2n(r);
543     return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC);
544 #endif // VCL_FASTEXP
545 #else  // no AVX512ER, use above template
546     return exp_f<Vec16f, Vec16fb, 0, 0>(x);
547 #endif
548 }
549 
expm1(Vec16f const & x)550 static inline Vec16f expm1(Vec16f const & x) {
551     return exp_f<Vec16f, Vec16fb, 1, 0>(x);
552 }
553 
exp2(Vec16f const & x)554 static inline Vec16f exp2(Vec16f const & x) {
555 #ifdef __AVX512ER__
556     return Vec16f(_mm512_exp2a23_round_ps(x, _MM_FROUND_NO_EXC));
557 #else
558     return exp_f<Vec16f, Vec16fb, 0, 2>(x);
559 #endif
560 }
561 
exp10(Vec16f const & x)562 static inline Vec16f exp10(Vec16f const & x) {
563 #ifdef __AVX512ER__  // AVX512ER instruction set includes fast exponential function
564 #ifdef VCL_FASTEXP
565     // very fast, but less precise for large x:
566     return _mm512_exp2a23_round_ps(x*float(VM_LOG210), _MM_FROUND_NO_EXC);
567 #else
568     // best precision, also for large x:
569     const float log10_2_hi = 0.301025391f;   // log10(2) in two parts
570     const float log10_2_lo = 4.60503907E-6f;
571     Vec16f x1 = x, r, y;
572     Vec16f log210 = float(VM_LOG210);
573     r = round(x1*log210);
574     x1 = nmul_add(r, Vec16f(log10_2_hi), x1);      //  x -= r * log10_2_hi
575     x1 = nmul_add(r, Vec16f(log10_2_lo), x1);      //  x -= r * log10_2_lo
576     x1 = x1 * log210;
577     // y = vm_pow2n(r);
578     y = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC);
579     return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC);
580 #endif // VCL_FASTEXP
581 #else  // no AVX512ER, use above template
582     return exp_f<Vec16f, Vec16fb, 0, 10>(x);
583 #endif
584 }
585 
586 #endif // MAX_VECTOR_SIZE >= 512
587 
588 
589 /******************************************************************************
590 *                 Logarithm functions
591 ******************************************************************************/
592 
593 // Helper functions: fraction_2(x) = fraction(x)*0.5
594 
595 // Modified fraction function:
596 // Extract the fraction part of a floating point number, and divide by 2
597 // The fraction function is defined in vectorf128.h etc.
598 // fraction_2(x) = fraction(x)*0.5
599 // This version gives half the fraction without extra delay
600 // Does not work for x = 0
fraction_2(Vec4f const & a)601 static inline Vec4f fraction_2(Vec4f const & a) {
602     Vec4ui t1 = _mm_castps_si128(a);   // reinterpret as 32-bit integer
603     Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000); // set exponent to 0 + bias
604     return _mm_castsi128_ps(t2);
605 }
606 
fraction_2(Vec2d const & a)607 static inline Vec2d fraction_2(Vec2d const & a) {
608     Vec2uq t1 = _mm_castpd_si128(a);   // reinterpret as 64-bit integer
609     Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias
610     return _mm_castsi128_pd(t2);
611 }
612 
613 #if MAX_VECTOR_SIZE >= 256
614 
fraction_2(Vec8f const & a)615 static inline Vec8f fraction_2(Vec8f const & a) {
616 #if defined (VECTORI256_H) && VECTORI256_H > 2  // 256 bit integer vectors are available, AVX2
617     Vec8ui t1 = _mm256_castps_si256(a);   // reinterpret as 32-bit integer
618     Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000; // set exponent to 0 + bias
619     return _mm256_castsi256_ps(t2);
620 #else
621     return Vec8f(fraction_2(a.get_low()), fraction_2(a.get_high()));
622 #endif
623 }
624 
fraction_2(Vec4d const & a)625 static inline Vec4d fraction_2(Vec4d const & a) {
626 #if VECTORI256_H > 1  // AVX2
627     Vec4uq t1 = _mm256_castpd_si256(a);   // reinterpret as 64-bit integer
628     Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias
629     return _mm256_castsi256_pd(t2);
630 #else
631     return Vec4d(fraction_2(a.get_low()), fraction_2(a.get_high()));
632 #endif
633 }
634 
635 #endif // MAX_VECTOR_SIZE >= 256
636 
637 #if MAX_VECTOR_SIZE >= 512
638 
fraction_2(Vec16f const & a)639 static inline Vec16f fraction_2(Vec16f const & a) {
640 #if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
641     return _mm512_getmant_ps(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
642     //return Vec16f(_mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5f;
643 #else
644     return Vec16f(fraction_2(a.get_low()), fraction_2(a.get_high()));
645 #endif
646 }
647 
fraction_2(Vec8d const & a)648 static inline Vec8d fraction_2(Vec8d const & a) {
649 #if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
650     return _mm512_getmant_pd(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
651     //return Vec8d(_mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5;
652 #else
653     return Vec8d(fraction_2(a.get_low()), fraction_2(a.get_high()));
654 #endif
655 }
656 
657 #endif // MAX_VECTOR_SIZE >= 512
658 
659 
660 // Helper functions: exponent_f(x) = exponent(x) as floating point number
661 
662 union vm_ufi {
663     float f;
664     uint32_t i;
665 };
666 
667 union vm_udi {
668     double d;
669     uint64_t i;
670 };
671 
672 // extract exponent of a positive number x as a floating point number
673 //Note: the AVX512 version return -inf for x=0, the non-AVX versions return a negative number
exponent_f(Vec4f const & x)674 static inline Vec4f exponent_f(Vec4f const & x) {
675 #ifdef __AVX512VL__                              // AVX512VL
676     // note: this version returns -inf for x=0
677     return _mm_getexp_ps(x);
678 #else
679     const float pow2_23 =  8388608.0f;           // 2^23
680     const float bias = 127.f;                    // bias in exponent
681     const vm_ufi upow2_23 = {pow2_23};
682     Vec4ui a = reinterpret_i(x);                 // bit-cast x to integer
683     Vec4ui b = a >> 23;                          // shift down exponent to low bits
684     Vec4ui c = b | Vec4ui(upow2_23.i);           // insert new exponent
685     Vec4f  d = reinterpret_f(c);                 // bit-cast back to double
686     Vec4f  e = d - (pow2_23 + bias);             // subtract magic number and bias
687     return e;
688 #endif
689 }
690 
exponent_f(Vec2d const & x)691 static inline Vec2d exponent_f(Vec2d const & x) {
692 #ifdef __AVX512VL__                              // AVX512VL
693     // note: this version returns -inf for x=0
694     return _mm_getexp_pd(x);
695 #else
696     const double pow2_52 = 4503599627370496.0;   // 2^52
697     const double bias = 1023.0;                  // bias in exponent
698     const vm_udi upow2_52 = {pow2_52};
699 
700     Vec2uq a = reinterpret_i(x);                 // bit-cast x to integer
701     Vec2uq b = a >> 52;                          // shift down exponent to low bits
702     Vec2uq c = b | Vec2uq(upow2_52.i);           // insert new exponent
703     Vec2d  d = reinterpret_d(c);                 // bit-cast back to double
704     Vec2d  e = d - (pow2_52 + bias);             // subtract magic number and bias
705     return e;
706 #endif
707 }
708 
709 #if MAX_VECTOR_SIZE >= 256
710 
exponent_f(Vec8f const & x)711 static inline Vec8f exponent_f(Vec8f const & x) {
712 #ifdef __AVX512VL__                              // AVX512VL
713     // note: this version returns -inf for x=0
714     return _mm256_getexp_ps(x);
715 #else
716     const float pow2_23 =  8388608.0f;           // 2^23
717     const float bias = 127.f;                    // bias in exponent
718     const vm_ufi upow2_23 = {pow2_23};
719     Vec8ui a = reinterpret_i(x);                 // bit-cast x to integer
720     Vec8ui b = a >> 23;                          // shift down exponent to low bits
721     Vec8ui c = b | Vec8ui(upow2_23.i);           // insert new exponent
722     Vec8f  d = reinterpret_f(c);                 // bit-cast back to double
723     Vec8f  e = d - (pow2_23 + bias);             // subtract magic number and bias
724     return e;
725 #endif
726 }
727 
728 // extract exponent of a positive number x as a floating point number
exponent_f(Vec4d const & x)729 static inline Vec4d exponent_f(Vec4d const & x) {
730 #ifdef __AVX512VL__                              // AVX512VL
731     return _mm256_getexp_pd(x);
732 #else
733     const double pow2_52 = 4503599627370496.0;   // 2^52
734     const double bias = 1023.0;                  // bias in exponent
735     const vm_udi upow2_52 = {pow2_52};
736 
737     Vec4uq a = reinterpret_i(x);                 // bit-cast x to integer
738     Vec4uq b = a >> 52;                          // shift down exponent to low bits
739     Vec4uq c = b | Vec4uq(upow2_52.i);           // insert new exponent
740     Vec4d  d = reinterpret_d(c);                 // bit-cast back to double
741     Vec4d  e = d - (pow2_52 + bias);             // subtract magic number and bias
742     return e;
743 #endif
744 }
745 
746 #endif // MAX_VECTOR_SIZE >= 256
747 
748 #if MAX_VECTOR_SIZE >= 512
749 
exponent_f(Vec16f const & x)750 static inline Vec16f exponent_f(Vec16f const & x) {
751 #if INSTRSET >= 9                                // AVX512
752     // note: this version returns -inf for x=0
753     return _mm512_getexp_ps(x);
754 #else
755     return Vec16f(exponent_f(x.get_low()), exponent_f(x.get_high()));
756 #endif
757 }
758 
759 // extract exponent of a positive number x as a floating point number
exponent_f(Vec8d const & x)760 static inline Vec8d exponent_f(Vec8d const & x) {
761 #if INSTRSET >= 9                                // AVX512
762     // note: this returns -inf for x=0
763     return _mm512_getexp_pd(x);
764 #else
765     return Vec8d(exponent_f(x.get_low()), exponent_f(x.get_high()));
766 #endif
767 }
768 
769 #endif // MAX_VECTOR_SIZE >= 512
770 
771 
772 // log function, double precision
773 // template parameters:
774 // VTYPE:  f.p. vector type
775 // BVTYPE: boolean vector type
776 // M1: 0 for log, 1 for log1p
777 template<class VTYPE, class BVTYPE, int M1>
log_d(VTYPE const & initial_x)778 static inline VTYPE log_d(VTYPE const & initial_x) {
779 
780     // define constants
781     const double ln2_hi =  0.693359375;
782     const double ln2_lo = -2.121944400546905827679E-4;
783     const double P0log  =  7.70838733755885391666E0;
784     const double P1log  =  1.79368678507819816313E1;
785     const double P2log  =  1.44989225341610930846E1;
786     const double P3log  =  4.70579119878881725854E0;
787     const double P4log  =  4.97494994976747001425E-1;
788     const double P5log  =  1.01875663804580931796E-4;
789     const double Q0log  =  2.31251620126765340583E1;
790     const double Q1log  =  7.11544750618563894466E1;
791     const double Q2log  =  8.29875266912776603211E1;
792     const double Q3log  =  4.52279145837532221105E1;
793     const double Q4log  =  1.12873587189167450590E1;
794 
795     VTYPE  x1, x, x2, px, qx, res, fe;           // data vectors
796     BVTYPE blend, overflow, underflow;           // boolean vectors
797 
798     if (M1 == 0) {
799         x1 = initial_x;                          // log(x)
800     }
801     else {
802         x1 = initial_x + 1.0;                    // log(x+1)
803     }
804     // separate mantissa from exponent
805     // VTYPE x  = fraction(x1) * 0.5;
806     x  = fraction_2(x1);
807     fe = exponent_f(x1);
808 
809     blend = x > VM_SQRT2*0.5;
810     x  = if_add(!blend, x, x);                   // conditional add
811     fe = if_add(blend, fe, 1.);                  // conditional add
812 
813     if (M1 == 0) {
814         // log(x). Expand around 1.0
815         x -= 1.0;
816     }
817     else {
818         // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
819         x = select(fe==0., initial_x, x - 1.0);
820     }
821 
822     // rational form
823     px  = polynomial_5 (x, P0log, P1log, P2log, P3log, P4log, P5log);
824     x2  = x * x;
825     px *= x * x2;
826     qx  = polynomial_5n(x, Q0log, Q1log, Q2log, Q3log, Q4log);
827     res = px / qx ;
828 
829     // add exponent
830     res  = mul_add(fe, ln2_lo, res);             // res += fe * ln2_lo;
831     res += nmul_add(x2, 0.5, x);                 // res += x  - 0.5 * x2;
832     res  = mul_add(fe, ln2_hi, res);             // res += fe * ln2_hi;
833 
834     overflow  = !is_finite(x1);
835     underflow = x1 < VM_SMALLEST_NORMAL;         // denormals not supported by this functions
836 
837     if (!horizontal_or(overflow | underflow)) {
838         // normal path
839         return res;
840     }
841     else {
842         // overflow and underflow
843         res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                   // x1  < 0 gives NAN
844         res = select(x1 == 0. || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 gives -INF
845         res = select(overflow,  x1, res);                                        // INF or NAN goes through
846         res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);     // -INF gives NAN
847         return res;
848     }
849 }
850 
851 
log(Vec2d const & x)852 static inline Vec2d log(Vec2d const & x) {
853     return log_d<Vec2d, Vec2db, 0>(x);
854 }
855 
log1p(Vec2d const & x)856 static inline Vec2d log1p(Vec2d const & x) {
857     return log_d<Vec2d, Vec2db, 1>(x);
858 }
859 
log2(Vec2d const & x)860 static inline Vec2d log2(Vec2d const & x) {
861     return VM_LOG2E * log_d<Vec2d, Vec2db, 0>(x);
862 }
863 
log10(Vec2d const & x)864 static inline Vec2d log10(Vec2d const & x) {
865     return VM_LOG10E * log_d<Vec2d, Vec2db, 0>(x);
866 }
867 
868 #if MAX_VECTOR_SIZE >= 256
869 
log(Vec4d const & x)870 static inline Vec4d log(Vec4d const & x) {
871     return log_d<Vec4d, Vec4db, 0>(x);
872 }
873 
log1p(Vec4d const & x)874 static inline Vec4d log1p(Vec4d const & x) {
875     return log_d<Vec4d, Vec4db, 1>(x);
876 }
877 
log2(Vec4d const & x)878 static inline Vec4d log2(Vec4d const & x) {
879     return VM_LOG2E * log_d<Vec4d, Vec4db, 0>(x);
880 }
881 
log10(Vec4d const & x)882 static inline Vec4d log10(Vec4d const & x) {
883     return VM_LOG10E * log_d<Vec4d, Vec4db, 0>(x);
884 }
885 
886 #endif // MAX_VECTOR_SIZE >= 256
887 
888 #if MAX_VECTOR_SIZE >= 512
889 
log(Vec8d const & x)890 static inline Vec8d log(Vec8d const & x) {
891     return log_d<Vec8d, Vec8db, 0>(x);
892 }
893 
log1p(Vec8d const & x)894 static inline Vec8d log1p(Vec8d const & x) {
895     return log_d<Vec8d, Vec8db, 1>(x);
896 }
897 
log2(Vec8d const & x)898 static inline Vec8d log2(Vec8d const & x) {
899     return VM_LOG2E * log_d<Vec8d, Vec8db, 0>(x);
900 }
901 
log10(Vec8d const & x)902 static inline Vec8d log10(Vec8d const & x) {
903     return VM_LOG10E * log_d<Vec8d, Vec8db, 0>(x);
904 }
905 
906 #endif // MAX_VECTOR_SIZE >= 512
907 
908 
909 
910 // log function, single precision
911 // template parameters:
912 // VTYPE:  f.p. vector type
913 // ITYPE:  integer vector type with same element size
914 // BVTYPE: boolean vector type
915 // BTYPEI: boolean vector type for ITYPE
916 // M1: 0 for log, 1 for log1p
917 template<class VTYPE, class ITYPE, class BVTYPE, class BTYPEI, int M1>
log_f(VTYPE const & initial_x)918 static inline VTYPE log_f(VTYPE const & initial_x) {
919 
920     // define constants
921     const float ln2f_hi =  0.693359375f;
922     const float ln2f_lo = -2.12194440E-4f;
923     const float P0logf  =  3.3333331174E-1f;
924     const float P1logf  = -2.4999993993E-1f;
925     const float P2logf  =  2.0000714765E-1f;
926     const float P3logf  = -1.6668057665E-1f;
927     const float P4logf  =  1.4249322787E-1f;
928     const float P5logf  = -1.2420140846E-1f;
929     const float P6logf  =  1.1676998740E-1f;
930     const float P7logf  = -1.1514610310E-1f;
931     const float P8logf  =  7.0376836292E-2f;
932 
933     VTYPE  x1, x, res, x2, fe;                   // data vectors
934     ITYPE  e;                                    // integer vector
935     BVTYPE blend, overflow, underflow;           // boolean vectors
936 
937     if (M1 == 0) {
938         x1 = initial_x;                          // log(x)
939     }
940     else {
941         x1 = initial_x + 1.0f;                   // log(x+1)
942     }
943 
944     // separate mantissa from exponent
945     x = fraction_2(x1);
946     e = exponent(x1);
947 
948     blend = x > float(VM_SQRT2*0.5);
949     x  = if_add(!blend, x, x);                   // conditional add
950     e  = if_add(BTYPEI(blend),  e, ITYPE(1));    // conditional add
951     fe = to_float(e);
952 
953     if (M1 == 0) {
954         // log(x). Expand around 1.0
955         x -= 1.0f;
956     }
957     else {
958         // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
959         x = select(BVTYPE(e==0), initial_x, x - 1.0f);
960     }
961 
962     // Taylor expansion
963     res = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
964     x2  = x*x;
965     res *= x2*x;
966 
967     // add exponent
968     res  = mul_add(fe, ln2f_lo, res);            // res += ln2f_lo  * fe;
969     res += nmul_add(x2, 0.5f, x);                // res += x - 0.5f * x2;
970     res  = mul_add(fe, ln2f_hi, res);            // res += ln2f_hi  * fe;
971 
972     overflow  = !is_finite(x1);
973     underflow = x1 < VM_SMALLEST_NORMALF;        // denormals not supported by this functions
974 
975     if (!horizontal_or(overflow | underflow)) {
976         // normal path
977         return res;
978     }
979     else {
980         // overflow and underflow
981         res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                    // x1 < 0 gives NAN
982         res = select(x1 == 0.f || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 or denormal gives -INF
983         res = select(overflow,  x1, res);                                         // INF or NAN goes through
984         res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);      // -INF gives NAN
985         return res;
986     }
987 }
988 
log(Vec4f const & x)989 static inline Vec4f log(Vec4f const & x) {
990     return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
991 }
992 
log1p(Vec4f const & x)993 static inline Vec4f log1p(Vec4f const & x) {
994     return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 1>(x);
995 }
996 
log2(Vec4f const & x)997 static inline Vec4f log2(Vec4f const & x) {
998     return float(VM_LOG2E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
999 }
1000 
log10(Vec4f const & x)1001 static inline Vec4f log10(Vec4f const & x) {
1002     return float(VM_LOG10E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
1003 }
1004 
1005 #if MAX_VECTOR_SIZE >= 256
1006 
log(Vec8f const & x)1007 static inline Vec8f log(Vec8f const & x) {
1008     return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
1009 }
1010 
log1p(Vec8f const & x)1011 static inline Vec8f log1p(Vec8f const & x) {
1012     return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 1>(x);
1013 }
1014 
log2(Vec8f const & x)1015 static inline Vec8f log2(Vec8f const & x) {
1016     return float(VM_LOG2E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
1017 }
1018 
log10(Vec8f const & x)1019 static inline Vec8f log10(Vec8f const & x) {
1020     return float(VM_LOG10E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
1021 }
1022 
1023 #endif // MAX_VECTOR_SIZE >= 256
1024 
1025 #if MAX_VECTOR_SIZE >= 512
1026 
log(Vec16f const & x)1027 static inline Vec16f log(Vec16f const & x) {
1028     return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
1029 }
1030 
log1p(Vec16f const & x)1031 static inline Vec16f log1p(Vec16f const & x) {
1032     return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 1>(x);
1033 }
1034 
log2(Vec16f const & x)1035 static inline Vec16f log2(Vec16f const & x) {
1036     return float(VM_LOG2E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
1037 }
1038 
log10(Vec16f const & x)1039 static inline Vec16f log10(Vec16f const & x) {
1040     return float(VM_LOG10E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
1041 }
1042 
1043 #endif // MAX_VECTOR_SIZE >= 512
1044 
1045 
1046 /******************************************************************************
1047 *           Cube root and reciprocal cube root
1048 ******************************************************************************/
1049 
1050 // cube root template, double precision
1051 // template parameters:
1052 // VTYPE:  f.p. vector type
1053 // ITYPE:  uint32_t integer vector type with same total number of bits
1054 // ITYPE2: uint64_t integer vector type with same total number of bits
1055 // BVTYPE: boolean vector type
1056 // CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
1057 template<class VTYPE, class ITYPE, class ITYPE2, class BVTYPE, int CR>
cbrt_d(VTYPE const & x)1058 static inline VTYPE cbrt_d(VTYPE const & x) {
1059     const int iter = 7;     // iteration count of x^(-1/3) loop
1060     int i;
1061     VTYPE  xa, xa3, a, a2;
1062     ITYPE  m1, m2;
1063     BVTYPE underflow;
1064     ITYPE2 q1(0x5540000000000000ULL);            // exponent bias
1065     ITYPE2 q2(0x0005555500000000ULL);            // exponent multiplier for 1/3
1066     ITYPE2 q3(0x0010000000000000ULL);            // denormal limit
1067     const double one_third  = 1./3.;
1068     const double four_third = 4./3.;
1069 
1070     xa  = abs(x);
1071     xa3 = one_third*xa;
1072 
1073     // multiply exponent by -1/3
1074     m1 = reinterpret_i(xa);
1075     m2 = ITYPE(q1) - (m1 >> 20) * ITYPE(q2);
1076     a  = reinterpret_d(m2);
1077     underflow = BVTYPE(ITYPE2(m1) < q3);          // true if denormal or zero
1078 
1079     // Newton Raphson iteration
1080     for (i = 0; i < iter-1; i++) {
1081         a2 = a * a;
1082         a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
1083     }
1084     // last iteration with better precision
1085     a2 = a * a;
1086     a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); // a = a + one_third*(a - xa*a2*a2);
1087 
1088     if (CR == -1) {  // reciprocal cube root
1089         // (note: gives wrong sign when input is INF)
1090         // generate INF if underflow
1091         a = select(underflow, infinite_vec<VTYPE>(), a);
1092         // get sign
1093         a = sign_combine(a, x);
1094     }
1095     else if (CR == 1) {     // cube root
1096         a = a * a * x;
1097         // generate 0 if underflow
1098         a = select(underflow, 0., a);
1099     }
1100     else if (CR == 2) {     // cube root squared
1101         // (note: gives wrong sign when input is INF)
1102         a = a * xa;
1103         // generate 0 if underflow
1104         a = select(underflow, 0., a);
1105     }
1106     return a;
1107 }
1108 
1109 // template instances for cbrt and reciprocal_cbrt
1110 
1111 // cube root
cbrt(Vec2d const & x)1112 static inline Vec2d cbrt(Vec2d const & x) {
1113     return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 1> (x);
1114 }
1115 
1116 // reciprocal cube root
reciprocal_cbrt(Vec2d const & x)1117 static inline Vec2d reciprocal_cbrt(Vec2d const & x) {
1118     return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, -1> (x);
1119 }
1120 
1121 // square cube root
square_cbrt(Vec2d const & x)1122 static inline Vec2d square_cbrt(Vec2d const & x) {
1123     return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 2> (x);
1124 }
1125 
1126 #if MAX_VECTOR_SIZE >= 256
1127 
cbrt(Vec4d const & x)1128 static inline Vec4d cbrt(Vec4d const & x) {
1129     return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 1> (x);
1130 }
1131 
reciprocal_cbrt(Vec4d const & x)1132 static inline Vec4d reciprocal_cbrt(Vec4d const & x) {
1133     return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, -1> (x);
1134 }
1135 
square_cbrt(Vec4d const & x)1136 static inline Vec4d square_cbrt(Vec4d const & x) {
1137     return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 2> (x);
1138 }
1139 
1140 #endif // MAX_VECTOR_SIZE >= 256
1141 
1142 #if MAX_VECTOR_SIZE >= 512
1143 
cbrt(Vec8d const & x)1144 static inline Vec8d cbrt(Vec8d const & x) {
1145     return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 1> (x);
1146 }
1147 
reciprocal_cbrt(Vec8d const & x)1148 static inline Vec8d reciprocal_cbrt(Vec8d const & x) {
1149     return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, -1> (x);
1150 }
1151 
square_cbrt(Vec8d const & x)1152 static inline Vec8d square_cbrt(Vec8d const & x) {
1153     return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 2> (x);
1154 }
1155 
1156 #endif // MAX_VECTOR_SIZE >= 512
1157 
1158 
1159 // cube root template, single precision
1160 // template parameters:
1161 // VTYPE:  f.p. vector type
1162 // ITYPE:  uint32_t integer vector type
1163 // BVTYPE: boolean vector type
1164 // CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
1165 template<class VTYPE, class ITYPE, class BVTYPE, int CR>
cbrt_f(VTYPE const & x)1166 static inline VTYPE cbrt_f(VTYPE const & x) {
1167 
1168     const int iter = 6;                          // iteration count of x^(-1/3) loop
1169     int i;
1170     VTYPE  xa, xa3, a, a2;
1171     ITYPE  m1, m2;
1172     BVTYPE underflow;
1173     ITYPE  q1(0x54800000U);                      // exponent bias
1174     ITYPE  q2(0x002AAAAAU);                      // exponent multiplier for 1/3
1175     ITYPE  q3(0x00800000U);                      // denormal limit
1176     const  float one_third  = float(1./3.);
1177     const  float four_third = float(4./3.);
1178 
1179     xa  = abs(x);
1180     xa3 = one_third*xa;
1181 
1182     // multiply exponent by -1/3
1183     m1 = reinterpret_i(xa);
1184     m2 = q1 - (m1 >> 23) * q2;
1185     a  = reinterpret_f(m2);
1186 
1187     underflow = BVTYPE(m1 < q3);                  // true if denormal or zero
1188 
1189     // Newton Raphson iteration
1190     for (i = 0; i < iter-1; i++) {
1191         a2 = a*a;
1192         a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
1193     }
1194     // last iteration with better precision
1195     a2 = a*a;
1196     a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); //a = a + one_third*(a - xa*a2*a2);
1197 
1198     if (CR == -1) {                              // reciprocal cube root
1199         // generate INF if underflow
1200         a = select(underflow, infinite_vec<VTYPE>(), a);
1201         // get sign
1202         a = sign_combine(a, x);
1203     }
1204     else if (CR == 1) {                          // cube root
1205         a = a * a * x;
1206         // generate 0 if underflow
1207         a = select(underflow, 0., a);
1208     }
1209     else if (CR == 2) {                          // cube root squared
1210         a = a * xa;
1211         // generate 0 if underflow
1212         a = select(underflow, 0., a);
1213     }
1214     return a;
1215 }
1216 
1217 // template instances for cbrt and reciprocal_cbrt
1218 
1219 // cube root
cbrt(Vec4f const & x)1220 static inline Vec4f cbrt(Vec4f const & x) {
1221     return cbrt_f<Vec4f, Vec4ui, Vec4fb, 1> (x);
1222 }
1223 
1224 // reciprocal cube root
reciprocal_cbrt(Vec4f const & x)1225 static inline Vec4f reciprocal_cbrt(Vec4f const & x) {
1226     return cbrt_f<Vec4f, Vec4ui, Vec4fb, -1> (x);
1227 }
1228 
1229 // square cube root
square_cbrt(Vec4f const & x)1230 static inline Vec4f square_cbrt(Vec4f const & x) {
1231     return cbrt_f<Vec4f, Vec4ui, Vec4fb, 2> (x);
1232 }
1233 
1234 #if MAX_VECTOR_SIZE >= 256
1235 
cbrt(Vec8f const & x)1236 static inline Vec8f cbrt(Vec8f const & x) {
1237     return cbrt_f<Vec8f, Vec8ui, Vec8fb, 1> (x);
1238 }
1239 
reciprocal_cbrt(Vec8f const & x)1240 static inline Vec8f reciprocal_cbrt(Vec8f const & x) {
1241     return cbrt_f<Vec8f, Vec8ui, Vec8fb, -1> (x);
1242 }
1243 
square_cbrt(Vec8f const & x)1244 static inline Vec8f square_cbrt(Vec8f const & x) {
1245     return cbrt_f<Vec8f, Vec8ui, Vec8fb, 2> (x);
1246 }
1247 
1248 #endif // MAX_VECTOR_SIZE >= 256
1249 
1250 #if MAX_VECTOR_SIZE >= 512
1251 
cbrt(Vec16f const & x)1252 static inline Vec16f cbrt(Vec16f const & x) {
1253     return cbrt_f<Vec16f, Vec16ui, Vec16fb, 1> (x);
1254 }
1255 
reciprocal_cbrt(Vec16f const & x)1256 static inline Vec16f reciprocal_cbrt(Vec16f const & x) {
1257     return cbrt_f<Vec16f, Vec16ui, Vec16fb, -1> (x);
1258 }
1259 
square_cbrt(Vec16f const & x)1260 static inline Vec16f square_cbrt(Vec16f const & x) {
1261     return cbrt_f<Vec16f, Vec16ui, Vec16fb, 2> (x);
1262 }
1263 
1264 // Helper function for power function: insert special values of pow(x,y) when x=0:
1265 // y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan
wm_pow_case_x0(Vec8db const & xiszero,Vec8d const & y,Vec8d const & z)1266 static inline Vec8d wm_pow_case_x0(Vec8db const & xiszero, Vec8d const & y, Vec8d const & z) {
1267 #if INSTRSET >= 9
1268     const __m512i table = Vec8q(0x85858A00);
1269     return _mm512_mask_fixupimm_pd(z, xiszero, y, table, 0);
1270 #else
1271     return select(xiszero, select(y < 0., infinite_vec<Vec8d>(), select(y == 0., Vec8d(1.), Vec8d(0.))), z);
1272 #endif
1273 }
1274 
1275 #endif // MAX_VECTOR_SIZE >= 512
1276 
1277 #if MAX_VECTOR_SIZE >= 256
1278 
wm_pow_case_x0(Vec4db const & xiszero,Vec4d const & y,Vec4d const & z)1279 static inline Vec4d wm_pow_case_x0(Vec4db const & xiszero, Vec4d const & y, Vec4d const & z) {
1280 //#if defined __AVX512VL__ && defined ?
1281 //   const __m256i table = Vec4q(0x85858A00);
1282 //    return _mm256_mask_fixupimm_pd(z, xiszero, y, table, 0);
1283 //#else
1284     return select(xiszero, select(y < 0., infinite_vec<Vec4d>(), select(y == 0., Vec4d(1.), Vec4d(0.))), z);
1285 //#endif
1286 }
1287 #endif
1288 
wm_pow_case_x0(Vec2db const & xiszero,Vec2d const & y,Vec2d const & z)1289 static inline Vec2d wm_pow_case_x0(Vec2db const & xiszero, Vec2d const & y, Vec2d const & z) {
1290 //#if defined __AVX512VL__ && defined ?
1291 //    const __m128i table = Vec2q(0x85858A00);
1292 //    return _mm_mask_fixupimm_pd(z, xiszero, y, table, 0);
1293 //#else
1294     return select(xiszero, select(y < 0., infinite_vec<Vec2d>(), select(y == 0., Vec2d(1.), Vec2d(0.))), z);
1295 //#endif
1296 }
1297 
1298 // ****************************************************************************
1299 //                pow template, double precision
1300 // ****************************************************************************
1301 // Calculate x to the power of y.
1302 
1303 // Precision is important here because rounding errors get multiplied by y.
1304 // The logarithm is calculated with extra precision, and the exponent is
1305 // calculated separately.
1306 // The logarithm is calculated by Pad\E9 approximation with 6'th degree
1307 // polynomials. A 7'th degree would be preferred for best precision by high y.
1308 // The alternative method: log(x) = z + z^3*R(z)/S(z), where z = 2(x-1)/(x+1)
1309 // did not give better precision.
1310 
1311 // Template parameters:
1312 // VTYPE:  data vector type
1313 // ITYPE:  signed integer vector type
1314 // BVTYPE: boolean vector type
1315 template <class VTYPE, class ITYPE, class BVTYPE>
pow_template_d(VTYPE const & x0,VTYPE const & y)1316 static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) {
1317 
1318     // define constants
1319     const double ln2d_hi = 0.693145751953125;           // log(2) in extra precision, high bits
1320     const double ln2d_lo = 1.42860682030941723212E-6;   // low bits of log(2)
1321     const double log2e   = VM_LOG2E;                    // 1/log(2)
1322     const double pow2_52 = 4503599627370496.0;          // 2^52
1323 
1324     // coefficients for Pad\E9 polynomials
1325     const double P0logl =  2.0039553499201281259648E1;
1326     const double P1logl =  5.7112963590585538103336E1;
1327     const double P2logl =  6.0949667980987787057556E1;
1328     const double P3logl =  2.9911919328553073277375E1;
1329     const double P4logl =  6.5787325942061044846969E0;
1330     const double P5logl =  4.9854102823193375972212E-1;
1331     const double P6logl =  4.5270000862445199635215E-5;
1332     const double Q0logl =  6.0118660497603843919306E1;
1333     const double Q1logl =  2.1642788614495947685003E2;
1334     const double Q2logl =  3.0909872225312059774938E2;
1335     const double Q3logl =  2.2176239823732856465394E2;
1336     const double Q4logl =  8.3047565967967209469434E1;
1337     const double Q5logl =  1.5062909083469192043167E1;
1338 
1339     // Taylor coefficients for exp function, 1/n!
1340     const double p2  = 1./2.;
1341     const double p3  = 1./6.;
1342     const double p4  = 1./24.;
1343     const double p5  = 1./120.;
1344     const double p6  = 1./720.;
1345     const double p7  = 1./5040.;
1346     const double p8  = 1./40320.;
1347     const double p9  = 1./362880.;
1348     const double p10 = 1./3628800.;
1349     const double p11 = 1./39916800.;
1350     const double p12 = 1./479001600.;
1351     const double p13 = 1./6227020800.;
1352 
1353     // data vectors
1354     VTYPE x, x1, x2;
1355     VTYPE px, qx, ef, yr, v, z, z1;
1356     VTYPE lg, lg1, lg2;
1357     VTYPE lgerr, x2err;
1358     VTYPE e1, e2, e3, ee;
1359     // integer vectors
1360     ITYPE ei, ej, yodd;
1361     // boolean vectors
1362     BVTYPE blend, xzero, xnegative;
1363     BVTYPE overflow, underflow, xfinite, yfinite, efinite;
1364 
1365     // remove sign
1366     x1 = abs(x0);
1367 
1368     // Separate mantissa from exponent
1369     // This gives the mantissa * 0.5
1370     x  = fraction_2(x1);
1371 
1372     // reduce range of x = +/- sqrt(2)/2
1373     blend = x > VM_SQRT2*0.5;
1374     x  = if_add(!blend, x, x);                   // conditional add
1375 
1376     // Pade approximation
1377     // Higher precision than in log function. Still higher precision wanted
1378     x -= 1.0;
1379     x2 = x*x;
1380     px = polynomial_6  (x, P0logl, P1logl, P2logl, P3logl, P4logl, P5logl, P6logl);
1381     px *= x * x2;
1382     qx = polynomial_6n (x, Q0logl, Q1logl, Q2logl, Q3logl, Q4logl, Q5logl);
1383     lg1 = px / qx;
1384 
1385     // extract exponent
1386     ef = exponent_f(x1);
1387     ef = if_add(blend, ef, 1.);                  // conditional add
1388 
1389     // multiply exponent by y
1390     // nearest integer e1 goes into exponent of result, remainder yr is added to log
1391     e1 = round(ef * y);
1392     yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here
1393 
1394     // add initial terms to Pade expansion
1395     lg = nmul_add(0.5, x2, x) + lg1;             // lg = (x - 0.5 * x2) + lg1;
1396     // calculate rounding errors in lg
1397     // rounding error in multiplication 0.5*x*x
1398     x2err = mul_sub_x(0.5*x, x, 0.5*x2);
1399     // rounding error in additions and subtractions
1400     lgerr = mul_add(0.5, x2, lg - x) - lg1;      // lgerr = ((lg - x) + 0.5 * x2) - lg1;
1401 
1402     // extract something for the exponent
1403     e2 = round(lg * y * VM_LOG2E);
1404     // subtract this from lg, with extra precision
1405     v = mul_sub_x(lg, y, e2 * ln2d_hi);
1406     v = nmul_add(e2, ln2d_lo, v);                // v -= e2 * ln2d_lo;
1407 
1408     // add remainder from ef * y
1409     v = mul_add(yr, VM_LN2, v);                  // v += yr * VM_LN2;
1410 
1411     // correct for previous rounding errors
1412     v = nmul_add(lgerr + x2err, y, v);           // v -= (lgerr + x2err) * y;
1413 
1414     // exp function
1415 
1416     // extract something for the exponent if possible
1417     x = v;
1418     e3 = round(x*log2e);
1419     // high precision multiplication not needed here because abs(e3) <= 1
1420     x = nmul_add(e3, VM_LN2, x);                 // x -= e3 * VM_LN2;
1421 
1422     z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
1423     z = z + 1.0;
1424 
1425     // contributions to exponent
1426     ee = e1 + e2 + e3;
1427     ei = round_to_int64_limited(ee);
1428     // biased exponent of result:
1429     ej = ei + (ITYPE(reinterpret_i(z)) >> 52);
1430     // check exponent for overflow and underflow
1431     overflow  = BVTYPE(ej >= 0x07FF) | (ee >  3000.);
1432     underflow = BVTYPE(ej <= 0x0000) | (ee < -3000.);
1433 
1434     // add exponent by integer addition
1435     z = reinterpret_d(ITYPE(reinterpret_i(z)) + (ei << 52));
1436 
1437     // check for special cases
1438     xfinite   = is_finite(x0);
1439     yfinite   = is_finite(y);
1440     efinite   = is_finite(ee);
1441     xzero     = is_zero_or_subnormal(x0);
1442     xnegative = x0  < 0.;
1443 
1444     // check for overflow and underflow
1445     if (horizontal_or(overflow | underflow)) {
1446         // handle errors
1447         z = select(underflow, VTYPE(0.), z);
1448         z = select(overflow, infinite_vec<VTYPE>(), z);
1449     }
1450 
1451     // check for x == 0
1452     z = wm_pow_case_x0(xzero, y, z);
1453     //z = select(xzero, select(y < 0., infinite_vec<VTYPE>(), select(y == 0., VTYPE(1.), VTYPE(0.))), z);
1454 
1455     // check for x < 0. y must be integer
1456     if (horizontal_or(xnegative)) {
1457         // test if y odd
1458         yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63;     // convert y to integer and shift bit 0 to position of sign bit
1459         z1 = z | (x0 & VTYPE(reinterpret_d(yodd)));              // apply sign if y odd
1460         z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
1461         z = select(xnegative, z1, z);
1462     }
1463 
1464     // check for range errors
1465     if (horizontal_and(xfinite & yfinite & (efinite | xzero))) {
1466         // fast return if no special cases
1467         return z;
1468     }
1469 
1470     // handle special error cases
1471     z = select(yfinite & efinite, z, select(x1 == 1., VTYPE(1.), select((x1 > 1.) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.)));
1472     yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63; // same as above
1473     z = select(xfinite, z, select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec<VTYPE>() | ( VTYPE(reinterpret_d(yodd)) & x0))));
1474     z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
1475     return z;
1476 }
1477 
1478 
1479 //This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
1480 //template <typename TT> static Vec2d pow(Vec2d const & a, TT n);
1481 
1482 // instantiations of pow_template_d:
1483 template <>
1484 inline Vec2d pow<Vec2d>(Vec2d const & x, Vec2d const & y) {
1485     return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
1486 }
1487 
1488 template <>
1489 inline Vec2d pow<double>(Vec2d const & x, double const & y) {
1490     return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
1491 }
1492 template <>
1493 inline Vec2d pow<float>(Vec2d const & x, float const & y) {
1494     return pow_template_d<Vec2d, Vec2q, Vec2db>(x, (double)y);
1495 }
1496 
1497 #if MAX_VECTOR_SIZE >= 256
1498 
1499 template <>
1500 inline Vec4d pow<Vec4d>(Vec4d const & x, Vec4d const & y) {
1501     return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
1502 }
1503 
1504 template <>
1505 inline Vec4d pow<double>(Vec4d const & x, double const & y) {
1506     return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
1507 }
1508 
1509 template <>
1510 inline Vec4d pow<float>(Vec4d const & x, float const & y) {
1511     return pow_template_d<Vec4d, Vec4q, Vec4db>(x, (double)y);
1512 }
1513 
1514 #endif // MAX_VECTOR_SIZE >= 256
1515 
1516 #if MAX_VECTOR_SIZE >= 512
1517 
1518 template <>
1519 inline Vec8d pow<Vec8d>(Vec8d const & x, Vec8d const & y) {
1520     return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
1521 }
1522 
1523 template <>
1524 inline Vec8d pow<double>(Vec8d const & x, double const & y) {
1525     return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
1526 }
1527 
1528 template <>
1529 inline Vec8d pow<float>(Vec8d const & x, float const & y) {
1530     return pow_template_d<Vec8d, Vec8q, Vec8db>(x, (double)y);
1531 }
1532 
1533 // Helper function for power function: insert special values of pow(x,y) when x=0:
1534 // y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan
wm_pow_case_x0(Vec16fb const & xiszero,Vec16f const & y,Vec16f const & z)1535 static inline Vec16f wm_pow_case_x0(Vec16fb const & xiszero, Vec16f const & y, Vec16f const & z) {
1536 #if INSTRSET >= 9
1537     const __m512i table = Vec16ui(0x85858A00);
1538     return _mm512_mask_fixupimm_ps(z, xiszero, y, table, 0);
1539 #else
1540     return select(xiszero, select(y < 0.f, infinite_vec<Vec16f>(), select(y == 0.f, Vec16f(1.f), Vec16f(0.f))), z);
1541 #endif
1542 }
1543 
1544 #endif // MAX_VECTOR_SIZE >= 512
1545 
1546 #if MAX_VECTOR_SIZE >= 256
wm_pow_case_x0(Vec8fb const & xiszero,Vec8f const & y,Vec8f const & z)1547 static inline Vec8f wm_pow_case_x0(Vec8fb const & xiszero, Vec8f const & y, Vec8f const & z) {
1548     return select(xiszero, select(y < 0.f, infinite_vec<Vec8f>(), select(y == 0.f, Vec8f(1.f), Vec8f(0.f))), z);
1549 }
1550 #endif
1551 
wm_pow_case_x0(Vec4fb const & xiszero,Vec4f const & y,Vec4f const & z)1552 static inline Vec4f wm_pow_case_x0(Vec4fb const & xiszero, Vec4f const & y, Vec4f const & z) {
1553     return select(xiszero, select(y < 0.f, infinite_vec<Vec4f>(), select(y == 0.f, Vec4f(1.f), Vec4f(0.f))), z);
1554 }
1555 
1556 // ****************************************************************************
1557 //                pow template, single precision
1558 // ****************************************************************************
1559 
1560 // Template parameters:
1561 // VTYPE:  data vector type
1562 // ITYPE:  signed integer vector type
1563 // BVTYPE: boolean vector type
1564 // Calculate x to the power of y
1565 template <class VTYPE, class ITYPE, class BVTYPE>
pow_template_f(VTYPE const & x0,VTYPE const & y)1566 static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) {
1567 
1568     // define constants
1569     const float ln2f_hi  =  0.693359375f;
1570     const float ln2f_lo  = -2.12194440e-4f;
1571     //const float max_expf =  87.3f;
1572     const float log2e    =  float(VM_LOG2E);     // 1/log(2)
1573     const float pow2_23  =  8388608.0f;          // 2^23
1574 
1575     const float P0logf  =  3.3333331174E-1f;
1576     const float P1logf  = -2.4999993993E-1f;
1577     const float P2logf  =  2.0000714765E-1f;
1578     const float P3logf  = -1.6668057665E-1f;
1579     const float P4logf  =  1.4249322787E-1f;
1580     const float P5logf  = -1.2420140846E-1f;
1581     const float P6logf  =  1.1676998740E-1f;
1582     const float P7logf  = -1.1514610310E-1f;
1583     const float P8logf  =  7.0376836292E-2f;
1584 
1585     // Taylor coefficients for exp function, 1/n!
1586     const float p2expf   =  1.f/2.f;
1587     const float p3expf   =  1.f/6.f;
1588     const float p4expf   =  1.f/24.f;
1589     const float p5expf   =  1.f/120.f;
1590     const float p6expf   =  1.f/720.f;
1591     const float p7expf   =  1.f/5040.f;
1592 
1593     // data vectors
1594     VTYPE x, x1, x2;
1595     VTYPE ef, yr, v, z, z1;
1596     VTYPE lg, lg1;
1597     VTYPE lgerr, x2err;
1598     VTYPE e1, e2, e3, ee;
1599     // integer vectors
1600     ITYPE ei, ej, yodd;
1601     // boolean vectors
1602     BVTYPE blend, xzero, xnegative;
1603     BVTYPE overflow, underflow, xfinite, yfinite, efinite;
1604 
1605     // remove sign
1606     x1 = abs(x0);
1607 
1608     // Separate mantissa from exponent
1609     // This gives the mantissa * 0.5
1610     x  = fraction_2(x1);
1611 
1612     // reduce range of x = +/- sqrt(2)/2
1613     blend = x > float(VM_SQRT2 * 0.5);
1614     x  = if_add(!blend, x, x);                   // conditional add
1615 
1616     // Taylor expansion, high precision
1617     x   -= 1.0f;
1618     x2   = x * x;
1619     lg1  = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
1620     lg1 *= x2 * x;
1621 
1622     // extract exponent
1623     ef = exponent_f(x1);
1624     ef = if_add(blend, ef, 1.0f);                // conditional add
1625 
1626     // multiply exponent by y
1627     // nearest integer e1 goes into exponent of result, remainder yr is added to log
1628     e1 = round(ef * y);
1629     yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here
1630 
1631     // add initial terms to expansion
1632     lg = nmul_add(0.5f, x2, x) + lg1;            // lg = (x - 0.5f * x2) + lg1;
1633 
1634     // calculate rounding errors in lg
1635     // rounding error in multiplication 0.5*x*x
1636     x2err = mul_sub_x(0.5f*x, x, 0.5f * x2);
1637     // rounding error in additions and subtractions
1638     lgerr = mul_add(0.5f, x2, lg - x) - lg1;     // lgerr = ((lg - x) + 0.5f * x2) - lg1;
1639 
1640     // extract something for the exponent
1641     e2 = round(lg * y * float(VM_LOG2E));
1642     // subtract this from lg, with extra precision
1643     v = mul_sub_x(lg, y, e2 * ln2f_hi);
1644     v = nmul_add(e2, ln2f_lo, v);                // v -= e2 * ln2f_lo;
1645 
1646     // correct for previous rounding errors
1647     v -= mul_sub(lgerr + x2err, y, yr * float(VM_LN2)); // v -= (lgerr + x2err) * y - yr * float(VM_LN2) ;
1648 
1649     // exp function
1650 
1651     // extract something for the exponent if possible
1652     x = v;
1653     e3 = round(x*log2e);
1654     // high precision multiplication not needed here because abs(e3) <= 1
1655     x = nmul_add(e3, float(VM_LN2), x);          // x -= e3 * float(VM_LN2);
1656 
1657     // Taylor polynomial
1658     x2  = x  * x;
1659     z = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf)*x2 + x + 1.0f;
1660 
1661     // contributions to exponent
1662     ee = e1 + e2 + e3;
1663     ei = round_to_int(ee);
1664     // biased exponent of result:
1665     ej = ei + (ITYPE(reinterpret_i(z)) >> 23);
1666     // check exponent for overflow and underflow
1667     overflow  = BVTYPE(ej >= 0x0FF) | (ee >  300.f);
1668     underflow = BVTYPE(ej <= 0x000) | (ee < -300.f);
1669 
1670     // add exponent by integer addition
1671     z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23)); // the extra 0x10000 is shifted out here
1672 
1673     // check for special cases
1674     xfinite   = is_finite(x0);
1675     yfinite   = is_finite(y);
1676     efinite   = is_finite(ee);
1677 
1678     xzero     = is_zero_or_subnormal(x0);
1679     xnegative = x0  < 0.f;
1680 
1681     // check for overflow and underflow
1682     if (horizontal_or(overflow | underflow)) {
1683         // handle errors
1684         z = select(underflow, VTYPE(0.f), z);
1685         z = select(overflow, infinite_vec<VTYPE>(), z);
1686     }
1687 
1688     // check for x == 0
1689     z = wm_pow_case_x0(xzero, y, z);
1690     //z = select(xzero, select(y < 0.f, infinite_vec<VTYPE>(), select(y == 0.f, VTYPE(1.f), VTYPE(0.f))), z);
1691 
1692     // check for x < 0. y must be integer
1693     if (horizontal_or(xnegative)) {
1694         // test if y odd
1695         yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31;     // convert y to integer and shift bit 0 to position of sign bit
1696         z1 = z | (x0 & VTYPE(reinterpret_f(yodd)));              // apply sign if y odd
1697         z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
1698         z = select(xnegative, z1, z);
1699     }
1700 
1701     // check for range errors
1702     if (horizontal_and(xfinite & yfinite & (efinite | xzero))) {
1703         // fast return if no special cases
1704         return z;
1705     }
1706 
1707     // handle special error cases
1708     z = select(yfinite & efinite, z, select(x1 == 1.f, VTYPE(1.f), select((x1 > 1.f) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.f)));
1709     yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31; // same as above
1710     z = select(xfinite, z, select(y == 0.f, VTYPE(1.f), select(y < 0.f, VTYPE(0.f), infinite_vec<VTYPE>() | (VTYPE(reinterpret_f(yodd)) & x0))));
1711     z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
1712     return z;
1713 }
1714 
1715 //This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
1716 //template <typename TT> static Vec4f pow(Vec4f const & a, TT n);
1717 
1718 template <>
1719 inline Vec4f pow<Vec4f>(Vec4f const & x, Vec4f const & y) {
1720     return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
1721 }
1722 
1723 template <>
1724 inline Vec4f pow<float>(Vec4f const & x, float const & y) {
1725     return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
1726 }
1727 
1728 template <>
1729 inline Vec4f pow<double>(Vec4f const & x, double const & y) {
1730     return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, (float)y);
1731 }
1732 
1733 #if MAX_VECTOR_SIZE >= 256
1734 
1735 template <>
1736 inline Vec8f pow<Vec8f>(Vec8f const & x, Vec8f const & y) {
1737     return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
1738 }
1739 
1740 template <>
1741 inline Vec8f pow<float>(Vec8f const & x, float const & y) {
1742     return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
1743 }
1744 template <>
1745 inline Vec8f pow<double>(Vec8f const & x, double const & y) {
1746     return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, (float)y);
1747 }
1748 
1749 #endif // MAX_VECTOR_SIZE >= 256
1750 
1751 #if MAX_VECTOR_SIZE >= 512
1752 
1753 template <>
1754 inline Vec16f pow<Vec16f>(Vec16f const & x, Vec16f const & y) {
1755     return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
1756 }
1757 
1758 template <>
1759 inline Vec16f pow<float>(Vec16f const & x, float const & y) {
1760     return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
1761 }
1762 
1763 template <>
1764 inline Vec16f pow<double>(Vec16f const & x, double const & y) {
1765     return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, (float)y);
1766 }
1767 
1768 #endif // MAX_VECTOR_SIZE >= 512
1769 
1770 
1771 // *************************************************************
1772 //             power function with rational exponent
1773 // *************************************************************
1774 // Power function with rational exponent: x^(a/b)
1775 // Template must be defined as class to allow partial template specialization
1776 template <int a, int b>
1777 class Power_rational {
1778 public:
1779     // overloaded member function for each vector type
pow(Vec4f const & x)1780     Vec4f pow(Vec4f const & x) {
1781         Vec4f y = x;
1782         // negative x allowed when b odd or a even
1783         // (if a is even then either b is odd or a/b can be reduced,
1784         // but we can check a even anyway at no cost to be sure)
1785         if (a == 0) return 1.f;
1786         if ((b | ~a) & 1) y = abs(y);
1787         y = pow(y, float(double(a)/double(b)));
1788         if (a & b & 1) y = sign_combine(y, x);          // apply sign if a and b both odd
1789         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); // zero allowed for positive a and b
1790         return y;
1791     }
pow(Vec2d const & x)1792     Vec2d pow(Vec2d const & x) {
1793         Vec2d y = x;
1794         if (a == 0) return 1.;
1795         if ((b | ~a) & 1) y = abs(y);
1796         y = pow(y, double((long double)a/(long double)b));
1797         if (a & b & 1) y = sign_combine(y, x);
1798         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
1799         return y;
1800     }
1801 #if MAX_VECTOR_SIZE >= 256
pow(Vec8f const & x)1802     Vec8f pow(Vec8f const & x) {
1803         Vec8f y = x;
1804         if (a == 0) return 1.f;
1805         if ((b | ~a) & 1) y = abs(y);
1806         y = pow(y, float(double(a)/double(b)));
1807         if (a & b & 1) y = sign_combine(y, x);
1808         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
1809         return y;
1810     }
pow(Vec4d const & x)1811     Vec4d pow(Vec4d const & x) {
1812         Vec4d y = x;
1813         if (a == 0) return 1.;
1814         if ((b | ~a) & 1) y = abs(y);
1815         y = pow(y, double((long double)a/(long double)b));
1816         if (a & b & 1) y = sign_combine(y, x);
1817         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
1818         return y;
1819     }
1820 #endif // MAX_VECTOR_SIZE >= 256
1821 #if MAX_VECTOR_SIZE >= 512
pow(Vec16f const & x)1822     Vec16f pow(Vec16f const & x) {
1823         Vec16f y = x;
1824         if (a == 0) return 1.f;
1825         if ((b | ~a) & 1) y = abs(y);
1826         y = pow(y, float(double(a)/double(b)));
1827         if (a & b & 1) y = sign_combine(y, x);
1828         if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
1829         return y;
1830     }
pow(Vec8d const & x)1831     Vec8d pow(Vec8d const & x) {
1832         Vec8d y = x;
1833         if (a == 0) return 1.;
1834         if ((b | ~a) & 1) y = abs(y);
1835         y = pow(y, double((long double)a/(long double)b));
1836         if (a & b & 1) y = sign_combine(y, x);
1837         if ((a ^ b) >= 0) y = select(x == 0., 0., y);
1838         return y;
1839     }
1840 #endif // MAX_VECTOR_SIZE >= 512
1841 };
1842 
1843 // partial specialization for b = 1
1844 template<int a>
1845 class Power_rational<a,1> {
1846 public:
1847     template<class VTYPE>
pow(VTYPE const & x)1848     VTYPE pow(VTYPE const & x) {return pow_n<a>(x);}
1849 };
1850 
1851 // partial specialization for b = 2
1852 template<int a>
1853 class Power_rational<a,2> {
1854 public:
1855     template<class VTYPE>
pow(VTYPE const & x)1856     VTYPE pow(VTYPE const & x) {
1857         VTYPE y = pow_n<(a > 0 ? a/2 : (a-1)/2)>(x);
1858         if (a & 1) y *= sqrt(x);
1859         return y;
1860     }
1861 };
1862 
1863 // full specialization for a = 1, b = 2
1864 template<>
1865 class Power_rational<1,2> {
1866 public:
1867     template<class VTYPE>
pow(VTYPE const & x)1868     VTYPE pow(VTYPE const & x) {
1869         return sqrt(x);
1870     }
1871 };
1872 
1873 // full specialization for a = -1, b = 2
1874 template<>
1875 class Power_rational<-1,2> {
1876 public:
1877     template<class VTYPE>
pow(VTYPE const & x)1878     VTYPE pow(VTYPE const & x) {
1879         // (this is faster than iteration method on modern CPUs)
1880         return VTYPE(1.f) / sqrt(x);
1881     }
1882 };
1883 
1884 // partial specialization for b = 3
1885 template<int a>
1886 class Power_rational<a,3> {
1887 public:
1888     template<class VTYPE>
pow(VTYPE const & x)1889     VTYPE pow(VTYPE const & x) {
1890         VTYPE t;
1891         switch (a % 3) {
1892         case -2:
1893             t = reciprocal_cbrt(x);
1894             t *= t;
1895             if (a == -2) return t;
1896             t = t / pow_n<(-a-2)/3>(x);
1897             break;
1898         case -1:
1899             t = reciprocal_cbrt(x);
1900             if (a == -1) return t;
1901             t = t / pow_n<(-a-1)/3>(x);
1902             break;
1903         case  0:
1904             t = pow_n<a/3>(x);
1905             break;
1906         case  1:
1907             t = cbrt(x);
1908             if (a == 1) return t;
1909             t = t * pow_n<a/3>(x);
1910             break;
1911         case  2:
1912             t = square_cbrt(x);
1913             if (a == 2) return t;
1914             t = t * pow_n<a/3>(x);
1915             break;
1916         }
1917         return t;
1918     }
1919 };
1920 
1921 // partial specialization for b = 4
1922 template<int a>
1923 class Power_rational<a,4> {
1924 public:
1925     template<class VTYPE>
pow(VTYPE const & x)1926     VTYPE pow(VTYPE const & x) {
1927         VTYPE t, s1, s2;
1928         s1 = sqrt(x);
1929         if (a & 1) s2 = sqrt(s1);
1930         switch (a % 4) {
1931         case -3:
1932             t = s2 / pow_n<1+(-a)/4>(x);
1933             break;
1934         case -2:
1935             t = s1 / pow_n<1+(-a)/4>(x);
1936             break;
1937         case -1:
1938             if (a != -1) s2 *= pow_n<(-a)/4>(x);
1939             t = VTYPE(1.f) / s2;
1940             break;
1941         case  0: default:
1942             t = pow_n<a/4>(x);
1943             break;
1944         case  1:
1945             t = s2;
1946             if (a != 1) t *= pow_n<a/4>(x);
1947             break;
1948         case  2:
1949             t = s1;
1950             if (a != 2) t *= pow_n<a/4>(x);
1951             break;
1952         case  3:
1953             t = s1 * s2;
1954             if (a != 3) t *= pow_n<a/4>(x);
1955             break;
1956         }
1957         return t;
1958     }
1959 };
1960 
1961 // partial specialization for b = 6
1962 template<int a>
1963 class Power_rational<a,6> {
1964 public:
1965     template<class VTYPE>
pow(VTYPE const & x)1966     VTYPE pow(VTYPE const & x) {
1967         VTYPE t, s1, s2, s3;
1968         switch (a % 6) {
1969         case -5:
1970             t = reciprocal_cbrt(x);
1971             t = t * t * sqrt(t);
1972             if (a != -5) t /= pow_n<(-a)/6>(x);
1973             break;
1974         case -4:
1975             t = reciprocal_cbrt(x);
1976             t *= t;
1977             if (a != -4) t /= pow_n<(-a)/6>(x);
1978             break;
1979         case -3:
1980             t = pow_n<a/6>(x);
1981             t /= sqrt(x);
1982             break;
1983         case -2:
1984             t = reciprocal_cbrt(x);
1985             if (a != -2) t /= pow_n<(-a)/6>(x);
1986             break;
1987         case -1:
1988             t = sqrt(reciprocal_cbrt(x));
1989             if (a != -1) t /= pow_n<(-a)/6>(x);
1990             break;
1991         case  0: default:
1992             t = pow_n<a/6>(x);
1993             break;
1994         case  1:
1995             t = sqrt(cbrt(x));
1996             if (a != 1) t *= pow_n<a/6>(x);
1997             break;
1998         case  2:
1999             t = cbrt(x);
2000             if (a != 2) t *= pow_n<a/6>(x);
2001             break;
2002         case  3:
2003             t = sqrt(x);
2004             if (a != 3) t *= pow_n<a/6>(x);
2005             break;
2006         case  4:
2007             t = square_cbrt(x);
2008             if (a != 4) t *= pow_n<a/6>(x);
2009             break;
2010         case  5:
2011             t = cbrt(x);
2012             t = t * t * sqrt(t);
2013             if (a != 5) t *= pow_n<a/6>(x);
2014             break;
2015         }
2016         return t;
2017     }
2018 };
2019 
2020 // partial specialization for b = 8
2021 template<int a>
2022 class Power_rational<a,8> {
2023 public:
2024     template<class VTYPE>
pow(VTYPE const & x)2025     VTYPE pow(VTYPE const & x) {
2026         VTYPE t, s1, s2, s3;
2027         s1 = sqrt(x);               // x^(1/2)
2028         if (a & 3) s2 = sqrt(s1);   // x^(1/4)
2029         if (a & 1) s3 = sqrt(s2);   // x^(1/8)
2030         switch (a % 8) {
2031         case -7:
2032             t = s3 / pow_n<1+(-a)/8>(x);
2033             break;
2034         case -6:
2035             t = s2 / pow_n<1+(-a)/8>(x);
2036             break;
2037         case -5:
2038             t = s3 * (s2 / pow_n<1+(-a)/8>(x));
2039             break;
2040         case -4:
2041             t = s1 / pow_n<1+(-a)/8>(x);
2042             break;
2043         case -3:
2044             t = s3 * (s1 / pow_n<1+(-a)/8>(x));
2045             break;
2046         case -2:
2047             if (a != -2) s2 *= pow_n<(-a)/8>(x);
2048             t = VTYPE(1.f) / s2;
2049             break;
2050         case -1:
2051             if (a != -1) s3 *= pow_n<(-a)/8>(x);
2052             t = VTYPE(1.f) / s3;
2053             break;
2054         case  0: default:
2055             t = pow_n<a/8>(x);
2056             break;
2057         case  1:
2058             t = s3;
2059             if (a != 1) t *= pow_n<a/8>(x);
2060             break;
2061         case  2:
2062             t = s2;
2063             if (a != 2) t *= pow_n<a/8>(x);
2064             break;
2065         case  3:
2066             t = s2 * s3;
2067             if (a != 3) t *= pow_n<a/8>(x);
2068             break;
2069         case  4:
2070             t = s1;
2071             if (a != 4) t *= pow_n<a/8>(x);
2072             break;
2073         case  5:
2074             t = s1 * s3;
2075             if (a != 5) t *= pow_n<a/8>(x);
2076             break;
2077         case  6:
2078             t = s1 * s2;
2079             if (a != 6) t *= pow_n<a/8>(x);
2080             break;
2081         case  7:
2082             t = s2 * s3;
2083             if (a != 7) s1 *= pow_n<a/8>(x);
2084             t *= s1;
2085             break;
2086 
2087         }
2088         return t;
2089     }
2090 };
2091 
2092 // macro to call template class member function pow
2093 #define pow_ratio(x, a, b) (Power_rational<(b)<0 ? -(a):(a), (b)<0 ? -(b):(b)> ().pow(x))
2094 
2095 
2096 /******************************************************************************
2097 *                 Detect NAN codes
2098 *
2099 * These functions return the code hidden in a NAN. The sign bit is ignored
2100 ******************************************************************************/
2101 
nan_code(Vec4f const & x)2102 static inline Vec4i nan_code(Vec4f const & x) {
2103     Vec4i  a = reinterpret_i(x);
2104     Vec4ib b = (a & 0x7F800000) == 0x7F800000;   // check if NAN/INF
2105     return a & 0x007FFFFF & Vec4i(b);            // isolate NAN code bits
2106 }
2107 
2108 // This function returns the code hidden in a NAN. The sign bit is ignored
nan_code(Vec2d const & x)2109 static inline Vec2q nan_code(Vec2d const & x) {
2110     Vec2q  a = reinterpret_i(x);
2111     Vec2q const m = 0x7FF0000000000000;
2112     Vec2q const n = 0x000FFFFFFFFFFFFF;
2113     Vec2qb b = (a & m) == m;                     // check if NAN/INF
2114     return a & n & Vec2q(b);                     // isolate NAN code bits
2115 }
2116 
2117 #if MAX_VECTOR_SIZE >= 256
2118 
2119 // This function returns the code hidden in a NAN. The sign bit is ignored
nan_code(Vec8f const & x)2120 static inline Vec8i nan_code(Vec8f const & x) {
2121     Vec8i  a = reinterpret_i(x);
2122     Vec8ib b = (a & 0x7F800000) == 0x7F800000;   // check if NAN/INF
2123     return a & 0x007FFFFF & Vec8i(b);            // isolate NAN code bits
2124 }
2125 
2126 // This function returns the code hidden in a NAN. The sign bit is ignored
nan_code(Vec4d const & x)2127 static inline Vec4q nan_code(Vec4d const & x) {
2128     Vec4q  a = reinterpret_i(x);
2129     Vec4q const m = 0x7FF0000000000000;
2130     Vec4q const n = 0x000FFFFFFFFFFFFF;
2131     Vec4qb b = (a & m) == m;                     // check if NAN/INF
2132     return a & n & Vec4q(b);                     // isolate NAN code bits
2133 }
2134 
2135 #endif // MAX_VECTOR_SIZE >= 256
2136 #if MAX_VECTOR_SIZE >= 512
2137 
2138 // This function returns the code hidden in a NAN. The sign bit is ignored
nan_code(Vec16f const & x)2139 static inline Vec16i nan_code(Vec16f const & x) {
2140     Vec16i  a = Vec16i(reinterpret_i(x));
2141     Vec16ib b = (a & 0x7F800000) == 0x7F800000;  // check if NAN/INF
2142     return a & 0x007FFFFF & Vec16i(b);           // isolate NAN code bits
2143 }
2144 
2145 // This function returns the code hidden in a NAN. The sign bit is ignored
nan_code(Vec8d const & x)2146 static inline Vec8q nan_code(Vec8d const & x) {
2147     Vec8q  a = Vec8q(reinterpret_i(x));
2148     Vec8q const m = 0x7FF0000000000000;
2149     Vec8q const n = 0x000FFFFFFFFFFFFF;
2150     Vec8qb b = (a & m) == m;                     // check if NAN/INF
2151     return a & n & Vec8q(b);                     // isolate NAN code bits
2152 }
2153 
2154 #endif // MAX_VECTOR_SIZE >= 512
2155 
2156 #ifdef VCL_NAMESPACE
2157 }
2158 #endif
2159 
2160 #endif  // VECTORMATH_EXP_H
2161