/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if defined(TARGET_LINUX_POWER)
#error "Source cannot be compiled for POWER architectures"
#include "xmm2altivec.h"
#else
#include <immintrin.h>
#endif
#include "dpow_defs.h"

extern "C" __m256d __fvd_pow_fma3_256(__m256d, __m256d);

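// Illustrative usage (not part of the library sources): the entry point
// computes pow(a[i], b[i]) element-wise on four packed doubles, e.g.
//
//     __m256d a = _mm256_set_pd(2.0, 3.0, 4.0, 0.5);
//     __m256d b = _mm256_set1_pd(0.5);
//     __m256d r = __fvd_pow_fma3_256(a, b);   // r[i] ~= sqrt(a[i])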

// struct representing a double-double number
// x, y represent the lo and hi parts respectively
struct __m256d_2
{
    __m256d x, y;
};
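// The value represented is y + x: y carries the leading bits and x a small
// correction term, giving roughly twice the precision of a plain double.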

// negates each packed double by flipping its sign bit
inline
__m256d neg(__m256d a)
{
    __m256d const SGN_MASK = (__m256d)_mm256_set1_epi64x(SGN_MASK_D);
    return _mm256_xor_pd(a, SGN_MASK);
}

// double-double "fma": computes x * y + z in double-double arithmetic
inline
__m256d_2 __internal_ddfma (__m256d_2 x, __m256d_2 y, __m256d_2 z)
{
    __m256d e;
    __m256d_2 t, m;
    // error-free product of the hi parts: t.y + t.x == x.y * y.y exactly
    t.y = _mm256_mul_pd(x.y, y.y);
    t.x = _mm256_fmsub_pd(x.y, y.y, t.y);
    // accumulate the cross terms into the lo part
    t.x = _mm256_fmadd_pd(x.y, y.x, t.x);
    t.x = _mm256_fmadd_pd(x.x, y.y, t.x);

    m.y = _mm256_add_pd (z.y, t.y);
    e = _mm256_sub_pd (z.y, m.y);
    m.x = _mm256_add_pd (_mm256_add_pd(_mm256_add_pd (e, t.y), t.x), z.x);

    return m;
}

// special case of double-double addition, where |x| > |y| and y is a double.
inline
__m256d_2 __internal_ddadd_yisdouble(__m256d_2 x, __m256d_2 y)
{
    __m256d_2 z;
    __m256d e;
    z.y = _mm256_add_pd (x.y, y.y);
    e = _mm256_sub_pd (x.y, z.y);
    z.x = _mm256_add_pd(_mm256_add_pd (e, y.y), x.x);

    return z;
}

// converts packed integers to packed doubles
inline
__m256d __internal_fast_int2dbl(__m256i a)
{
    __m256i const INT2DBL_HI = _mm256_set1_epi64x(INT2DBL_HI_D);
    __m256i const INT2DBL_LO = _mm256_set1_epi64x(INT2DBL_LO_D);
    __m256d const INT2DBL    = (__m256d)_mm256_set1_epi64x(INT2DBL_D);

    __m256i t = _mm256_xor_si256(INT2DBL_LO, a);
    t = _mm256_blend_epi32(INT2DBL_HI, t, 0x55);
    return _mm256_sub_pd((__m256d)t, INT2DBL);
}
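// (The conversion appears to rely on the usual bias trick: the integer bits are
// placed into the mantissa of a large double constant and that constant is then
// subtracted, leaving the integer value as a double. The INT2DBL_* magic numbers
// are defined in dpow_defs.h.)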

// slowpath for exp. used to improve accuracy on larger inputs
__m256d __attribute__ ((noinline)) __pgm_exp_d_vec256_slowpath(__m256i const i, __m256d const t, __m256d const bloga, __m256d z, __m256d prodx)
{
    __m256d const UPPERBOUND_1 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_1_D);
    __m256d const UPPERBOUND_2 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_2_D);
    __m256d const ZERO         = _mm256_set1_pd(ZERO_D);
    __m256d const INF          = (__m256d)_mm256_set1_epi64x(INF_D);
    __m256i const HI_ABS_MASK  = _mm256_set1_epi64x(HI_ABS_MASK_D);
    __m256i const ABS_MASK     = _mm256_set1_epi64x(ABS_MASK_D);
    __m256i const MULT_CONST   = _mm256_set1_epi64x(MULT_CONST_D);

    __m256i abs_bloga = _mm256_and_si256((__m256i)bloga, ABS_MASK);
    __m256d abs_lt = (__m256d)_mm256_and_si256((__m256i)abs_bloga, HI_ABS_MASK);
    __m256d lt_zero_mask = _mm256_cmp_pd(bloga, ZERO, _CMP_LT_OS); // compute bloga < 0.0
    __m256d slowpath_mask = _mm256_cmp_pd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);

    __m256d a_plus_inf = _mm256_add_pd(bloga, INF); // check if bloga is too big
    __m256d zero_inf_blend = _mm256_blendv_pd(a_plus_inf, ZERO, lt_zero_mask);

    __m256d accurate_scale_mask = (__m256d)_mm256_cmp_pd(abs_lt, UPPERBOUND_2, _CMP_LT_OS);

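    // The full scale factor 2^i can fall outside the range of a single double
    // here, so i is split as i = k + (i - k) with k = i/2 and the scaling is
    // applied as two separate multiplications.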
    // compute accurate scale
    __m256i k = _mm256_srli_epi64(i, 1); // k = i / 2
    __m256i i_scale_acc = _mm256_slli_epi64(k, D52_D);  // shift k into the exponent field
    k = _mm256_sub_epi32(i, k);          // k = i - k
    __m256i i_scale_acc_2 = _mm256_slli_epi64(k, D52_D);  // shift i - k into the exponent field
    __m256d multiplier = (__m256d)_mm256_add_epi64(i_scale_acc_2, MULT_CONST);
    multiplier = _mm256_blendv_pd(ZERO, multiplier, accurate_scale_mask); // quick fix for overflows in case they're being trapped

    __m256d res = (__m256d)_mm256_add_epi32(i_scale_acc, (__m256i)t);
    res = _mm256_mul_pd(res, multiplier);

    __m256d slowpath_blend = _mm256_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
    z = _mm256_blendv_pd(slowpath_blend, z, slowpath_mask);

    __m256i isinf_mask = _mm256_cmpeq_epi64((__m256i)INF, (__m256i)z); // special case for inf
    __m256d z_fixed = _mm256_fmadd_pd(z, prodx, z); // add only if not inf
    z = _mm256_blendv_pd(z_fixed, z, (__m256d)isinf_mask);

    return z;
}

// special cases for pow
// implementation derived from cudart/device_functions_impl.c
__m256d __attribute__ ((noinline)) __pgm_pow_d_vec256_special_cases(__m256d const a, __m256d const b, __m256d t)
{
   __m256i const HI_MASK       = _mm256_set1_epi64x(HI_MASK_D);
   __m256d const SGN_EXP_MASK  = (__m256d)_mm256_set1_epi64x(SGN_EXP_MASK_D);
   __m256i const ABS_MASK      = _mm256_set1_epi64x(ABS_MASK_D);
   __m256d const NEG_ONE       = _mm256_set1_pd(NEG_ONE_D);
   __m256d const ZERO          = _mm256_set1_pd(ZERO_D);
   __m256d const ONE           = _mm256_set1_pd(ONE_D);
   __m256d const HALF          = _mm256_set1_pd(HALF_D);
   __m256d const SGN_MASK      = (__m256d)_mm256_set1_epi64x(SGN_MASK_D);
   __m256d const INF           = (__m256d)_mm256_set1_epi64x(INF_D);
   __m256i const INF_FAKE      = _mm256_set1_epi64x(INF_FAKE_D);
   __m256d const NAN_MASK      = (__m256d)_mm256_set1_epi64x(NAN_MASK_D);
   __m256d const NEG_ONE_CONST = (__m256d)_mm256_set1_epi64x(NEG_ONE_CONST_D);

   __m256i const TEN_23        = _mm256_set1_epi64x(TEN_23_D);
   __m256i const ELEVEN        = _mm256_set1_epi64x(ELEVEN_D);

    __m256i a_sign, b_sign, bIsOddInteger;
    __m256d abs_a, abs_b;
    __m256i shiftb;

    a_sign = (__m256i)_mm256_and_pd(a, SGN_MASK);
    b_sign = (__m256i)_mm256_and_pd(b, SGN_MASK);
    abs_a = _mm256_and_pd(a, (__m256d)ABS_MASK);
    abs_b = _mm256_and_pd(b, (__m256d)ABS_MASK);

    // determining if b is an odd integer, since there are special cases for it
    shiftb = (__m256i)_mm256_and_pd (b, SGN_EXP_MASK);
    shiftb = _mm256_srli_epi64(shiftb, D52_D);
    shiftb = _mm256_sub_epi64(shiftb, TEN_23);
    shiftb = _mm256_add_epi64(shiftb, ELEVEN);

    __m256i b_is_half = _mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)HALF);
    bIsOddInteger = _mm256_sllv_epi64((__m256i)b, shiftb);
    bIsOddInteger = _mm256_cmpeq_epi64((__m256i)SGN_MASK, bIsOddInteger);
    // fix for b = +/-0.5 being incorrectly identified as an odd integer
    bIsOddInteger = _mm256_andnot_si256(b_is_half, bIsOddInteger);

    // corner cases where a <= 0
    // if ((ahi < 0) && bIsOddInteger)
    __m256d ahi_lt_0 = (__m256d)_mm256_cmpeq_epi64(a_sign, (__m256i)ZERO); //this mask is inverted
    __m256d ahilt0_and_boddint_mask = _mm256_andnot_pd(ahi_lt_0, (__m256d)bIsOddInteger);

    t = _mm256_blendv_pd(t, neg(t), ahilt0_and_boddint_mask);

    // else if ((ahi < 0) && (b != trunc(b)))
    __m256d b_ne_trunc = _mm256_cmp_pd(b, _mm256_round_pd(b, _MM_FROUND_TO_ZERO), _CMP_NEQ_UQ);
    __m256d nan_mask = _mm256_andnot_pd(ahi_lt_0, b_ne_trunc);
    t = _mm256_blendv_pd(t, NAN_MASK, nan_mask);

    // if (a == 0.0)
    __m256d a_is_0_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_a, (__m256i)ZERO);
    __m256d thi_when_ais0 = ZERO;

    // if (bIsOddInteger && a == 0.0)
    thi_when_ais0 = _mm256_blendv_pd(thi_when_ais0, (__m256d)a, (__m256d)bIsOddInteger);

    // if (bhi < 0 && a == 0.0)
    __m256d bhi_lt_0 = (__m256d)_mm256_cmpeq_epi64(b_sign, (__m256i)ZERO); //this mask is inverted
    __m256d thi_or_INF = _mm256_or_pd(thi_when_ais0, INF);
    thi_when_ais0 = _mm256_blendv_pd(thi_or_INF, thi_when_ais0, bhi_lt_0);
    __m256d t_when_ais0 = _mm256_and_pd(thi_when_ais0, (__m256d)HI_MASK);

    t = _mm256_blendv_pd(t, t_when_ais0, a_is_0_mask);

    // else if (a is INF)
    __m256d a_inf_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_a, (__m256i)INF);
    // use bhi_lt_0 backwards to evaluate bhi >= 0
    __m256d thi_when_aisinf = _mm256_blendv_pd(ZERO, INF, bhi_lt_0); // 0 when bhi < 0, INF when bhi >= 0

    // now evaluate ((ahi < 0) && bIsOddInteger)
    __m256d thi_xor_sgn = _mm256_xor_pd(thi_when_aisinf, SGN_MASK);
    thi_when_aisinf = _mm256_blendv_pd(thi_when_aisinf, thi_xor_sgn, ahilt0_and_boddint_mask);

    t = _mm256_blendv_pd(t, thi_when_aisinf, a_inf_mask);

    // else if (abs(b) is INF)
    __m256d b_inf_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)INF);
    __m256d thi_when_bisinf = ZERO;
    __m256d absa_gt_one = _mm256_cmp_pd(abs_a, ONE, _CMP_GT_OS); // evaluating (abs(a) > 1)
    thi_when_bisinf = _mm256_blendv_pd(thi_when_bisinf, INF, absa_gt_one);

    __m256d thi_xor_inf = _mm256_xor_pd(thi_when_bisinf, INF);
    thi_when_bisinf = _mm256_blendv_pd(thi_xor_inf, thi_when_bisinf, bhi_lt_0); // bhi < 0

    __m256d a_is_negone = (__m256d)_mm256_cmpeq_epi64((__m256i)a, (__m256i)NEG_ONE);
    thi_when_bisinf = _mm256_blendv_pd(thi_when_bisinf, NEG_ONE_CONST, a_is_negone); //a == -1

    t = _mm256_blendv_pd(t, thi_when_bisinf, b_inf_mask);

    // if (a is NaN || b is NaN) <=> !(a is a number && b is a number)
    __m256i a_nan_mask = _mm256_cmpeq_epi64(_mm256_max_epu32((__m256i)abs_a, INF_FAKE), INF_FAKE);
    __m256i b_nan_mask = _mm256_cmpeq_epi64(_mm256_max_epu32((__m256i)abs_b, INF_FAKE), INF_FAKE);
    __m256i aorb_nan_mask = _mm256_and_si256(a_nan_mask, b_nan_mask); //this mask is inverted
    t = _mm256_blendv_pd(_mm256_add_pd(a,b), t, (__m256d)aorb_nan_mask);

    // if a == 1 or b == 0, answer = 1
    __m256d a_is_one_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)a, (__m256i)ONE);
    __m256d b_is_zero_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)ZERO);
    __m256d ans_equals_one_mask = _mm256_or_pd(a_is_one_mask, b_is_zero_mask);
    t = _mm256_blendv_pd(t, ONE, ans_equals_one_mask);
    // ******************************************************************************************


   __m256i a_is_neg = (__m256i)_mm256_cmp_pd(a, ZERO, _CMP_LT_OS);
   int a_is_neg_flag = _mm256_movemask_epi8((__m256i)a_is_neg);

   __m256i b_is_integer = (__m256i)_mm256_cmp_pd(b, _mm256_floor_pd(b), _CMP_EQ_OQ);
   int b_is_integer_flag = _mm256_movemask_epi8((__m256i)b_is_integer);

   __m256i b_is_lt_zero = (__m256i)_mm256_cmp_pd(b, ZERO, _CMP_LT_OS);
   int b_is_lt_zero_flag = _mm256_movemask_epi8((__m256i)b_is_lt_zero);

   __m256d const MINUS_ZERO = _mm256_set1_pd((double)-0.0);
   __m256i a_is_pos_zero = _mm256_cmpeq_epi64( (__m256i)a, (__m256i)ZERO);
   __m256i a_is_neg_zero = _mm256_cmpeq_epi64( (__m256i)a, (__m256i)MINUS_ZERO);
   __m256i a_is_any_zero = _mm256_or_si256(a_is_pos_zero, a_is_neg_zero);
   int a_is_any_zero_flag = _mm256_movemask_epi8((__m256i)a_is_any_zero);

/*
 *  Before returning, see if we need to set any of the processor
 *  exception flags.
 *
 *  Domain error: a is negative and b is a finite non-integer; we need to
 *  raise the invalid-operation flag. This can be done by taking the
 *  square root of a negative number.
 *
 *  Pole error: a is zero and b is negative; we need to raise the
 *  divide-by-zero flag. This can be done by dividing by zero.
 */

      if (a_is_neg_flag && (!b_is_integer_flag)) {
         __m256d volatile invop = _mm256_sqrt_pd(a);
      }

      if (a_is_any_zero_flag && b_is_lt_zero_flag) {
         __m256d volatile divXzero = _mm256_div_pd(ONE,ZERO);
      }


    return t;
}

__m256d __fvd_pow_fma3_256(__m256d const a, __m256d const b)
{
    //////////////////////////////////////////////////// log constants
    __m256d const LOG_POLY_6 = _mm256_set1_pd(LOG_POLY_6_D);
    __m256d const LOG_POLY_5 = _mm256_set1_pd(LOG_POLY_5_D);
    __m256d const LOG_POLY_4 = _mm256_set1_pd(LOG_POLY_4_D);
    __m256d const LOG_POLY_3 = _mm256_set1_pd(LOG_POLY_3_D);
    __m256d const LOG_POLY_2 = _mm256_set1_pd(LOG_POLY_2_D);
    __m256d const LOG_POLY_1 = _mm256_set1_pd(LOG_POLY_1_D);
    __m256d const LOG_POLY_0 = _mm256_set1_pd(LOG_POLY_0_D);

    __m256d const CC_CONST_Y   = _mm256_set1_pd(CC_CONST_Y_D);
    __m256d const CC_CONST_X   = _mm256_set1_pd(CC_CONST_X_D);

    __m256i const EXPO_MASK    = _mm256_set1_epi64x(EXPO_MASK_D);
    __m256d const HI_CONST_1   = (__m256d)_mm256_set1_epi64x(HI_CONST_1_D);
    __m256d const HI_CONST_2   = (__m256d)_mm256_set1_epi64x(HI_CONST_2_D);
    __m256i const HALFIFIER    = _mm256_set1_epi64x(HALFIFIER_D);
    __m256i const HI_THRESH    = _mm256_set1_epi64x(HI_THRESH_D); //~sqrt(2)
    __m256d const ONE_F        = _mm256_set1_pd(ONE_F_D);
    __m256d const TWO          = _mm256_set1_pd(TWO_D);
    __m256d const ZERO         = _mm256_set1_pd(ZERO_D);

    __m256d const LN2_HI       = _mm256_set1_pd(LN2_HI_D);
    __m256d const LN2_LO       = _mm256_set1_pd(LN2_LO_D);
    //////////////////////////////////////////////////////////////////

    //////////////////////////////////////////////////// exp constants
    __m256d const L2E        = _mm256_set1_pd(L2E_D);
    __m256d const NEG_LN2_HI = _mm256_set1_pd(NEG_LN2_HI_D);
    __m256d const NEG_LN2_LO = _mm256_set1_pd(NEG_LN2_LO_D);

    __m256d const EXP_POLY_B = _mm256_set1_pd(EXP_POLY_B_D);
    __m256d const EXP_POLY_A = _mm256_set1_pd(EXP_POLY_A_D);
    __m256d const EXP_POLY_9 = _mm256_set1_pd(EXP_POLY_9_D);
    __m256d const EXP_POLY_8 = _mm256_set1_pd(EXP_POLY_8_D);
    __m256d const EXP_POLY_7 = _mm256_set1_pd(EXP_POLY_7_D);
    __m256d const EXP_POLY_6 = _mm256_set1_pd(EXP_POLY_6_D);
    __m256d const EXP_POLY_5 = _mm256_set1_pd(EXP_POLY_5_D);
    __m256d const EXP_POLY_4 = _mm256_set1_pd(EXP_POLY_4_D);
    __m256d const EXP_POLY_3 = _mm256_set1_pd(EXP_POLY_3_D);
    __m256d const EXP_POLY_2 = _mm256_set1_pd(EXP_POLY_2_D);
    __m256d const EXP_POLY_1 = _mm256_set1_pd(EXP_POLY_1_D);
    __m256d const EXP_POLY_0 = _mm256_set1_pd(EXP_POLY_0_D);

    __m256d const DBL2INT_CVT = _mm256_set1_pd(DBL2INT_CVT_D);

    __m256d const UPPERBOUND_1 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_1_D);
    __m256i const HI_ABS_MASK = _mm256_set1_epi64x(HI_ABS_MASK_D);

    //////////////////////////////////////////////////// pow constants
    __m256i const HI_MASK      = _mm256_set1_epi64x(HI_MASK_D);
    __m256d const SGN_EXP_MASK = (__m256d)_mm256_set1_epi64x(SGN_EXP_MASK_D);
    __m256i const ABS_MASK     = _mm256_set1_epi64x(ABS_MASK_D);
    __m256i const ONE          = _mm256_set1_epi64x(ONE_D);

    __m256i const TEN_23      = _mm256_set1_epi64x(TEN_23_D);
    __m256i const ALL_ONES_EXPONENT = _mm256_set1_epi64x(ALL_ONES_EXPONENT_D);


    __m256d abs_a, a_mut;
    __m256d_2 loga;
    __m256d t_hi, t_lo, tmp, e, bloga, prodx;

    __m256d_2 qq, cc, uu, tt;
    __m256d f, g, u, v, q, ulo, m;
    __m256i ihi, expo, expo_plus1;
    __m256d thresh_mask;
    __m256d b_is_one_mask;

    /*
     * Check whether all exponents in vector b are 1.0, and if so take
     * a quick exit.
     * Note: b_is_one_mask is needed later in case some elements in b are 1.0.
     */

    b_is_one_mask = _mm256_cmp_pd(b, ONE_F, _CMP_EQ_OQ);
    if (_mm256_movemask_pd(b_is_one_mask) == 0xf) {
        return a;
    }

    // *****************************************************************************************
    // computing log(abs(a))
    abs_a = _mm256_and_pd(a, (__m256d)ABS_MASK);
    a_mut = _mm256_and_pd(abs_a, HI_CONST_1);
    a_mut = _mm256_or_pd(a_mut, HI_CONST_2);
    m = (__m256d)_mm256_sub_epi32((__m256i)a_mut, HALFIFIER); // divide by 2

    ihi = _mm256_and_si256((__m256i)a_mut, HI_MASK);
    thresh_mask = _mm256_cmp_pd((__m256d)ihi, (__m256d)HI_THRESH,_CMP_GT_OS);
    m = _mm256_blendv_pd(a_mut, m, thresh_mask);

    expo = _mm256_srli_epi64((__m256i)abs_a, D52_D);
    expo = _mm256_sub_epi64(expo, TEN_23);
    expo_plus1 = _mm256_add_epi64(expo, ONE); // add one to exponent instead
    expo = (__m256i)_mm256_blendv_pd((__m256d)expo, (__m256d)expo_plus1, thresh_mask);
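    // At this point abs_a == m * 2^expo, with m kept near 1.0 (roughly in
    // [sqrt(2)/2, sqrt(2)] thanks to the HI_THRESH adjustment above) so that
    // the atanh-based approximation of log(m) below stays accurate.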

    // begin computing log(m)
    f = _mm256_sub_pd(m, ONE_F);
    g = _mm256_add_pd(m, ONE_F);
    g = _mm256_div_pd(ONE_F, g);

    u = _mm256_mul_pd(_mm256_mul_pd(TWO, f), g);

    // u = 2.0 * (m - 1.0) / (m + 1.0)
    v = _mm256_mul_pd(u, u);

    // polynomial is used to approximate atanh(v)
    // an Estrin evaluation scheme is used.
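    // (log(m) is evaluated via the identity log(m) = 2*atanh((m-1)/(m+1));
    // with u = 2*(m-1)/(m+1) and v = u*u, the series is u plus a correction
    // of the form u^3 * poly(v), which is what the code below assembles.)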
    __m256d c0 = _mm256_fmadd_pd(LOG_POLY_1, v, LOG_POLY_0);
    __m256d c2 = _mm256_fmadd_pd(LOG_POLY_3, v, LOG_POLY_2);
    __m256d c4 = _mm256_fmadd_pd(LOG_POLY_5, v, LOG_POLY_4);
    __m256d v2 = _mm256_mul_pd(v, v);
    __m256d v4 = _mm256_mul_pd(v2, v2);

    c0 = _mm256_fmadd_pd(c2, v2, c0);
    c4 = _mm256_fmadd_pd(LOG_POLY_6, v2, c4);
    q = _mm256_fmadd_pd(c4, v4, c0);
    q = _mm256_mul_pd(q, v);

    tmp = _mm256_mul_pd(TWO, _mm256_sub_pd(f, u));
    tmp = _mm256_fmadd_pd(neg(u), f, tmp);
    ulo = _mm256_mul_pd(g, tmp);

    // double-double computation begins
    qq.y = q;
    qq.x = ZERO;
    uu.y = u;
    uu.x = ulo;
    cc.y = CC_CONST_Y;
    cc.x = CC_CONST_X;

    qq = __internal_ddadd_yisdouble(cc, qq);

    // computing log(m) in double-double format
    cc.y = _mm256_mul_pd(uu.y, uu.y);
    cc.x = _mm256_fmsub_pd(uu.y, uu.y, cc.y);
    cc.x = _mm256_fmadd_pd(uu.y,
                           (__m256d)_mm256_add_epi32((__m256i)uu.x, HALFIFIER),
                           cc.x); // u ** 2

    tt.y = _mm256_mul_pd(cc.y, uu.y);
    tt.x = _mm256_fmsub_pd(cc.y, uu.y, tt.y);
    tt.x = _mm256_fmadd_pd(cc.y, uu.x, tt.x);
    tt.x = _mm256_fmadd_pd(cc.x, uu.y, tt.x); // u ** 3 un-normalized

    uu = __internal_ddfma(qq, tt, uu);

    // computing log a = log(m) + log(2)*expo
    f = __internal_fast_int2dbl (expo);
    q = _mm256_fmadd_pd(f, LN2_HI, uu.y);
    tmp = _mm256_fmadd_pd(neg(f), LN2_HI, q);
    tmp = _mm256_sub_pd(tmp, uu.y);
    loga.y = q;
    loga.x = _mm256_sub_pd(uu.x, tmp);
    loga.x = _mm256_fmadd_pd(f, LN2_LO, loga.x);

    // finish log(a)
    // ******************************************************************************************

    // compute b * log(a)
    t_hi = _mm256_mul_pd(loga.y, b);
    t_lo = _mm256_fmsub_pd(loga.y, b, t_hi);
    t_lo = _mm256_fmadd_pd(loga.x, b, t_lo);
    bloga = e = _mm256_add_pd(t_hi, t_lo);
    prodx = _mm256_add_pd(_mm256_sub_pd(t_hi, e), t_lo);
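    // bloga holds the rounded value of b*log(a); prodx captures the low-order
    // part lost in that final addition and is folded back in after exp().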

    // ***************************************************************************************
    // computing exp(b * log(a))
    // calculating exponent; stored in the LO of each 64-bit block
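    // (Range reduction: exp(x) = 2^i * exp(z) with i = round(x * log2(e)) and
    // z = x - i*ln(2), where ln(2) is subtracted in hi/lo pieces. Adding the
    // DBL2INT_CVT right-shifter constant performs the rounding and leaves i
    // in the low bits of the double's mantissa.)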
    __m256i i = (__m256i) _mm256_fmadd_pd(bloga, L2E, DBL2INT_CVT);

    // calculate mantissa
    __m256d t = _mm256_sub_pd ((__m256d)i, DBL2INT_CVT);
    __m256d z = _mm256_fmadd_pd (t, NEG_LN2_HI, bloga);
    z = _mm256_fmadd_pd (t, NEG_LN2_LO, z);

    // use polynomial to calculate exp
    // mixed Estrin/Horner scheme: Estrin on the higher 8 coefficients, Horner on the lowest 4.
    // provided speedup without loss of precision compared to full Horner
    __m256d t4 = _mm256_fmadd_pd(EXP_POLY_5, z, EXP_POLY_4);
    __m256d t6 = _mm256_fmadd_pd(EXP_POLY_7, z, EXP_POLY_6);
    __m256d t8 = _mm256_fmadd_pd(EXP_POLY_9, z, EXP_POLY_8);
    __m256d t10 = _mm256_fmadd_pd(EXP_POLY_B, z, EXP_POLY_A);

    __m256d z2 = _mm256_mul_pd(z, z);
    __m256d z4 = _mm256_mul_pd(z2, z2);

    t4 = _mm256_fmadd_pd(t6, z2, t4);
    t8 = _mm256_fmadd_pd(t10, z2, t8);
    t4 = _mm256_fmadd_pd(t8, z4, t4);

    t = _mm256_fmadd_pd(t4, z, EXP_POLY_3);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_2);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_1);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_0);

    // fast scale
    __m256i i_scale = _mm256_slli_epi64(i,D52_D);
    __m256d ztemp = z = (__m256d)_mm256_add_epi32(i_scale, (__m256i)t);
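    // Adding i << 52 to the bit pattern of t bumps its exponent field by i,
    // i.e. multiplies by 2^i; this is valid as long as the scaled result
    // neither overflows nor goes subnormal, otherwise the slow path below is taken.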

    // slowpath detection for exp
    __m256d abs_bloga = (__m256d)_mm256_and_si256((__m256i)bloga, HI_ABS_MASK);
    int exp_slowmask = _mm256_movemask_pd(_mm256_cmp_pd(abs_bloga, UPPERBOUND_1, _CMP_GE_OS));

    z = _mm256_fmadd_pd(z, prodx, z);

    if (__builtin_expect(exp_slowmask, 0)) {
        z = __pgm_exp_d_vec256_slowpath(i, t, bloga, ztemp, prodx);
    }
    // finished exp(b * log (a))
    // ************************************************************************************************

    // Now special case if some elements of exponent (b) are 1.0.
    z = _mm256_blendv_pd(z, a, b_is_one_mask);

    // compute if we have special cases (inf, nan, etc). see man pow for full list of special cases
    __m256i detect_inf_nan = (__m256i)_mm256_add_pd(a, b);  // check for inf/nan
    __m256i overridemask = _mm256_cmpeq_epi64( (__m256i)a, (__m256i)ONE_F); //  if a == 1
    __m256i overridemask2 = _mm256_cmpeq_epi64(_mm256_and_si256(detect_inf_nan, ALL_ONES_EXPONENT), ALL_ONES_EXPONENT);
    overridemask = _mm256_or_si256(overridemask, (__m256i)_mm256_cmp_pd(a, ZERO, _CMP_LE_OQ)); // if a <= 0

    int specMask = _mm256_movemask_pd((__m256d)_mm256_or_si256(overridemask, overridemask2));
    if (__builtin_expect(specMask, 0)) {
        return __pgm_pow_d_vec256_special_cases(a, b, z);
    }
    return z;
}