/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if defined(TARGET_LINUX_POWER)
#error "Source cannot be compiled for POWER architectures"
#include "xmm2altivec.h"
#else
#include <immintrin.h>
#endif
#include "dpow_defs.h"

extern "C" __m256d __fvd_pow_fma3_256(__m256d, __m256d);


// struct representing a double-double number
// x and y hold the lo and hi parts, respectively
struct __m256d_2
{
    __m256d x, y;
};
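
// A double-double represents a value as the unevaluated sum y + x
// (hi + lo), with |x| at most half an ulp of y, giving roughly twice
// the precision of a plain double. Minimal scalar sketch of how such
// a pair is produced (illustrative only, not part of the build;
// assumes fma() from <math.h>):
//
//     double hi = a * b;            // rounded product
//     double lo = fma(a, b, -hi);   // exact rounding error of hi
//     // the pair (hi, lo) carries a*b to about 106 bits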

// negates a number by flipping its sign bit
inline
__m256d neg(__m256d a)
{
    __m256d const SGN_MASK = (__m256d)_mm256_set1_epi64x(SGN_MASK_D);
    return _mm256_xor_pd(a, SGN_MASK);
}

// double-double "fma": computes x*y + z in double-double arithmetic
inline
__m256d_2 __internal_ddfma (__m256d_2 x, __m256d_2 y, __m256d_2 z)
{
    __m256d e;
    __m256d_2 t, m;
    t.y = _mm256_mul_pd(x.y, y.y);
    t.x = _mm256_fmsub_pd(x.y, y.y, t.y);
    t.x = _mm256_fmadd_pd(x.y, y.x, t.x);
    t.x = _mm256_fmadd_pd(x.x, y.y, t.x);

    m.y = _mm256_add_pd(z.y, t.y);
    e = _mm256_sub_pd(z.y, m.y);
    m.x = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(e, t.y), t.x), z.x);

    return m;
}
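
// Scalar sketch of the double-double multiply-add above (illustrative
// only, not compiled; assumes fma() from <math.h>):
//
//     t_hi = x_hi * y_hi;
//     t_lo = fma(x_hi, y_hi, -t_hi);   // exact error of t_hi
//     t_lo = fma(x_hi, y_lo, t_lo);    // cross term 1
//     t_lo = fma(x_lo, y_hi, t_lo);    // cross term 2
//     // then a fast two-sum with z, assuming |z_hi| >= |t_hi|:
//     m_hi = z_hi + t_hi;
//     e    = z_hi - m_hi;              // rounding error of the sum
//     m_lo = ((e + t_hi) + t_lo) + z_lo;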

// special case of double-double addition, where |x| > |y| and y is a double
inline
__m256d_2 __internal_ddadd_yisdouble(__m256d_2 x, __m256d_2 y)
{
    __m256d_2 z;
    __m256d e;
    z.y = _mm256_add_pd(x.y, y.y);
    e = _mm256_sub_pd(x.y, z.y);
    z.x = _mm256_add_pd(_mm256_add_pd(e, y.y), x.x);

    return z;
}
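
// This is the classic Fast2Sum step (Dekker): because |x_hi| > |y|, the
// rounding error of x_hi + y can be recovered exactly. Scalar sketch
// (illustrative only, not compiled):
//
//     z_hi = x_hi + y;
//     e    = x_hi - z_hi;        // exact: what was lost from x_hi
//     z_lo = (e + y) + x_lo;     // recovered error plus the old lo part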

// casts int to double
inline
__m256d __internal_fast_int2dbl(__m256i a)
{
    __m256i const INT2DBL_HI = _mm256_set1_epi64x(INT2DBL_HI_D);
    __m256i const INT2DBL_LO = _mm256_set1_epi64x(INT2DBL_LO_D);
    __m256d const INT2DBL = (__m256d)_mm256_set1_epi64x(INT2DBL_D);

    __m256i t = _mm256_xor_si256(INT2DBL_LO, a);
    t = _mm256_blend_epi32(INT2DBL_HI, t, 0x55);
    return _mm256_sub_pd((__m256d)t, INT2DBL);
}
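
// This is the usual "magic number" int-to-double conversion: the integer
// is planted in the low mantissa bits of a large constant, and subtracting
// that constant normalizes the result to the integer's value. Scalar
// sketch of the idea (illustrative only, not compiled; MAGIC/MAGIC_BITS
// are hypothetical stand-ins for the INT2DBL_* constants in dpow_defs.h):
//
//     uint64_t bits = MAGIC_BITS | (uint32_t)n;  // n in the low mantissa
//     double d;
//     memcpy(&d, &bits, 8);
//     return d - MAGIC;                          // == (double)n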

// slow path for exp; used to preserve accuracy on larger inputs
__m256d __attribute__ ((noinline)) __pgm_exp_d_vec256_slowpath(__m256i const i, __m256d const t, __m256d const bloga, __m256d z, __m256d prodx)
{
    __m256d const UPPERBOUND_1 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_1_D);
    __m256d const UPPERBOUND_2 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_2_D);
    __m256d const ZERO = _mm256_set1_pd(ZERO_D);
    __m256d const INF = (__m256d)_mm256_set1_epi64x(INF_D);
    __m256i const HI_ABS_MASK = _mm256_set1_epi64x(HI_ABS_MASK_D);
    __m256i const ABS_MASK = _mm256_set1_epi64x(ABS_MASK_D);
    __m256i const MULT_CONST = _mm256_set1_epi64x(MULT_CONST_D);

    __m256i abs_bloga = _mm256_and_si256((__m256i)bloga, ABS_MASK);
    __m256d abs_lt = (__m256d)_mm256_and_si256((__m256i)abs_bloga, HI_ABS_MASK);
    __m256d lt_zero_mask = _mm256_cmp_pd(bloga, ZERO, _CMP_LT_OS); // compute bloga < 0.0
    __m256d slowpath_mask = _mm256_cmp_pd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);

    __m256d a_plus_inf = _mm256_add_pd(bloga, INF); // check if bloga is too big
    __m256d zero_inf_blend = _mm256_blendv_pd(a_plus_inf, ZERO, lt_zero_mask);

    __m256d accurate_scale_mask = _mm256_cmp_pd(abs_lt, UPPERBOUND_2, _CMP_LT_OS);

    // compute accurate scale: split 2^i into 2^(i/2) * 2^(i - i/2)
    __m256i k = _mm256_srli_epi64(i, 1);                 // k = i / 2
    __m256i i_scale_acc = _mm256_slli_epi64(k, D52_D);   // move k into the exponent field
    k = _mm256_sub_epi32(i, k);                          // k = i - k
    __m256i i_scale_acc_2 = _mm256_slli_epi64(k, D52_D); // move k into the exponent field
    __m256d multiplier = (__m256d)_mm256_add_epi64(i_scale_acc_2, MULT_CONST);
    multiplier = _mm256_blendv_pd(ZERO, multiplier, accurate_scale_mask); // quick fix for overflows in case they're being trapped

    __m256d res = (__m256d)_mm256_add_epi32(i_scale_acc, (__m256i)t);
    res = _mm256_mul_pd(res, multiplier);

    __m256d slowpath_blend = _mm256_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
    z = _mm256_blendv_pd(slowpath_blend, z, slowpath_mask);

    __m256i isinf_mask = _mm256_cmpeq_epi64((__m256i)INF, (__m256i)z); // special case for inf
    __m256d z_fixed = _mm256_fmadd_pd(z, prodx, z); // add only if not inf
    z = _mm256_blendv_pd(z_fixed, z, (__m256d)isinf_mask);

    return z;
}
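
// Splitting the scale factor as 2^i = 2^(i/2) * 2^(i-i/2) keeps each half
// representable even when 2^i by itself would overflow or underflow the
// double exponent range. Scalar sketch of the idea (illustrative only,
// not compiled; assumes ldexp() from <math.h>):
//
//     int k = i / 2;
//     double r = ldexp(mantissa, k);   // apply the first half of the scale
//     r = r * ldexp(1.0, i - k);       // apply the rest; never forms 2^i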

// special cases for pow
// implementation derived from cudart/device_functions_impl.c
__m256d __attribute__ ((noinline)) __pgm_pow_d_vec256_special_cases(__m256d const a, __m256d const b, __m256d t)
{
    __m256i const HI_MASK = _mm256_set1_epi64x(HI_MASK_D);
    __m256d const SGN_EXP_MASK = (__m256d)_mm256_set1_epi64x(SGN_EXP_MASK_D);
    __m256i const ABS_MASK = _mm256_set1_epi64x(ABS_MASK_D);
    __m256d const NEG_ONE = _mm256_set1_pd(NEG_ONE_D);
    __m256d const ZERO = _mm256_set1_pd(ZERO_D);
    __m256d const ONE = _mm256_set1_pd(ONE_D);
    __m256d const HALF = _mm256_set1_pd(HALF_D);
    __m256d const SGN_MASK = (__m256d)_mm256_set1_epi64x(SGN_MASK_D);
    __m256d const INF = (__m256d)_mm256_set1_epi64x(INF_D);
    __m256i const INF_FAKE = _mm256_set1_epi64x(INF_FAKE_D);
    __m256d const NAN_MASK = (__m256d)_mm256_set1_epi64x(NAN_MASK_D);
    __m256d const NEG_ONE_CONST = (__m256d)_mm256_set1_epi64x(NEG_ONE_CONST_D);

    __m256i const TEN_23 = _mm256_set1_epi64x(TEN_23_D);
    __m256i const ELEVEN = _mm256_set1_epi64x(ELEVEN_D);

    __m256i a_sign, b_sign, bIsOddInteger;
    __m256d abs_a, abs_b;
    __m256i shiftb;

    a_sign = (__m256i)_mm256_and_pd(a, SGN_MASK);
    b_sign = (__m256i)_mm256_and_pd(b, SGN_MASK);
    abs_a = _mm256_and_pd(a, (__m256d)ABS_MASK);
    abs_b = _mm256_and_pd(b, (__m256d)ABS_MASK);

    // determine whether b is an odd integer, since there are special cases for it
    shiftb = (__m256i)_mm256_and_pd(b, SGN_EXP_MASK);
    shiftb = _mm256_srli_epi64(shiftb, D52_D);
    shiftb = _mm256_sub_epi64(shiftb, TEN_23);
    shiftb = _mm256_add_epi64(shiftb, ELEVEN);

    __m256i b_is_half = _mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)HALF);
    bIsOddInteger = _mm256_sllv_epi64((__m256i)b, shiftb);
    bIsOddInteger = _mm256_cmpeq_epi64((__m256i)SGN_MASK, bIsOddInteger);
    // fix for b = +/-0.5 being incorrectly identified as an odd integer
    bIsOddInteger = _mm256_andnot_si256(b_is_half, bIsOddInteger);
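
    // The variable shift moves b's units bit into the sign-bit position:
    // all higher integer bits fall off the top, so the shifted value equals
    // exactly 0x8000000000000000 iff the units bit is 1 and every fraction
    // bit is 0, i.e. b is an odd integer. Rough scalar sketch for b > 0
    // (illustrative only, not compiled; bits() is a bit-cast to uint64_t):
    //
    //     int e = (int)(bits(b) >> 52) - 1023;        // unbiased exponent
    //     uint64_t shifted = bits(b) << (e + 11);     // units bit -> bit 63
    //     bool b_is_odd_integer = (shifted == 0x8000000000000000ull);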

    // corner cases where a <= 0
    // if ((ahi < 0) && bIsOddInteger)
    __m256d ahi_lt_0 = (__m256d)_mm256_cmpeq_epi64(a_sign, (__m256i)ZERO); // this mask is inverted
    __m256d ahilt0_and_boddint_mask = _mm256_andnot_pd(ahi_lt_0, (__m256d)bIsOddInteger);

    t = _mm256_blendv_pd(t, neg(t), ahilt0_and_boddint_mask);

    // else if ((ahi < 0) && (b != trunc(b)))
    __m256d b_ne_trunc = _mm256_cmp_pd(b, _mm256_round_pd(b, _MM_FROUND_TO_ZERO), _CMP_NEQ_UQ);
    __m256d nan_mask = _mm256_andnot_pd(ahi_lt_0, b_ne_trunc);
    t = _mm256_blendv_pd(t, NAN_MASK, nan_mask);

    // if (a == 0.0)
    __m256d a_is_0_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_a, (__m256i)ZERO);
    __m256d thi_when_ais0 = ZERO;

    // if (bIsOddInteger && a == 0.0)
    thi_when_ais0 = _mm256_blendv_pd(thi_when_ais0, a, (__m256d)bIsOddInteger);

    // if (bhi < 0 && a == 0.0)
    __m256d bhi_lt_0 = (__m256d)_mm256_cmpeq_epi64(b_sign, (__m256i)ZERO); // this mask is inverted
    __m256d thi_or_INF = _mm256_or_pd(thi_when_ais0, INF);
    thi_when_ais0 = _mm256_blendv_pd(thi_or_INF, thi_when_ais0, bhi_lt_0);
    __m256d t_when_ais0 = _mm256_and_pd(thi_when_ais0, (__m256d)HI_MASK);

    t = _mm256_blendv_pd(t, t_when_ais0, a_is_0_mask);

    // else if (a is INF)
    __m256d a_inf_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_a, (__m256i)INF);
    // use bhi_lt_0 backwards to evaluate bhi >= 0;
    // start from ZERO so the b < 0 lanes yield 0 rather than garbage
    __m256d thi_when_aisinf = _mm256_blendv_pd(ZERO, INF, bhi_lt_0);

    // now evaluate ((ahi < 0) && bIsOddInteger)
    __m256d thi_xor_sgn = _mm256_xor_pd(thi_when_aisinf, SGN_MASK);
    thi_when_aisinf = _mm256_blendv_pd(thi_when_aisinf, thi_xor_sgn, ahilt0_and_boddint_mask);

    t = _mm256_blendv_pd(t, thi_when_aisinf, a_inf_mask);

    // else if (abs(b) is INF)
    __m256d b_inf_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)INF);
    __m256d thi_when_bisinf = ZERO;
    __m256d absa_gt_one = _mm256_cmp_pd(abs_a, ONE, _CMP_GT_OS); // evaluating (abs(a) > 1)
    thi_when_bisinf = _mm256_blendv_pd(thi_when_bisinf, INF, absa_gt_one);

    __m256d thi_xor_inf = _mm256_xor_pd(thi_when_bisinf, INF);
    thi_when_bisinf = _mm256_blendv_pd(thi_xor_inf, thi_when_bisinf, bhi_lt_0); // bhi < 0

    __m256d a_is_negone = (__m256d)_mm256_cmpeq_epi64((__m256i)a, (__m256i)NEG_ONE);
    thi_when_bisinf = _mm256_blendv_pd(thi_when_bisinf, NEG_ONE_CONST, a_is_negone); // a == -1

    t = _mm256_blendv_pd(t, thi_when_bisinf, b_inf_mask);

    // if (a is NaN || b is NaN) <=> !(a is a number && b is a number)
    __m256i a_nan_mask = _mm256_cmpeq_epi64(_mm256_max_epu32((__m256i)abs_a, INF_FAKE), INF_FAKE);
    __m256i b_nan_mask = _mm256_cmpeq_epi64(_mm256_max_epu32((__m256i)abs_b, INF_FAKE), INF_FAKE);
    __m256i aorb_nan_mask = _mm256_and_si256(a_nan_mask, b_nan_mask); // this mask is inverted
    t = _mm256_blendv_pd(_mm256_add_pd(a, b), t, (__m256d)aorb_nan_mask);
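
    // The max_epu32/cmpeq pair implements an unsigned "abs(x) fits under
    // INF_FAKE" test on the bit patterns: if taking the unsigned max with
    // INF_FAKE leaves INF_FAKE unchanged, x is a number (finite or inf);
    // a NaN payload breaks the equality. Scalar sketch of the idiom
    // (illustrative only, not compiled):
    //
    //     bool x_is_number = (max_u(bits(fabs(x)), INF_FAKE) == INF_FAKE);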

    // if a == 1 or b == 0, answer = 1
    __m256d a_is_one_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)a, (__m256i)ONE);
    __m256d b_is_zero_mask = (__m256d)_mm256_cmpeq_epi64((__m256i)abs_b, (__m256i)ZERO);
    __m256d ans_equals_one_mask = _mm256_or_pd(a_is_one_mask, b_is_zero_mask);
    t = _mm256_blendv_pd(t, ONE, ans_equals_one_mask);
    // ******************************************************************************************

    __m256i a_is_neg = (__m256i)_mm256_cmp_pd(a, ZERO, _CMP_LT_OS);
    int a_is_neg_flag = _mm256_movemask_epi8((__m256i)a_is_neg);

    __m256i b_is_integer = (__m256i)_mm256_cmp_pd(b, _mm256_floor_pd(b), _CMP_EQ_OQ);
    int b_is_integer_flag = _mm256_movemask_epi8((__m256i)b_is_integer);

    __m256i b_is_lt_zero = (__m256i)_mm256_cmp_pd(b, ZERO, _CMP_LT_OS);
    int b_is_lt_zero_flag = _mm256_movemask_epi8((__m256i)b_is_lt_zero);

    __m256d const MINUS_ZERO = _mm256_set1_pd(-0.0);
    __m256i a_is_pos_zero = _mm256_cmpeq_epi64((__m256i)a, (__m256i)ZERO);
    __m256i a_is_neg_zero = _mm256_cmpeq_epi64((__m256i)a, (__m256i)MINUS_ZERO);
    __m256i a_is_any_zero = _mm256_or_si256(a_is_pos_zero, a_is_neg_zero);
    int a_is_any_zero_flag = _mm256_movemask_epi8((__m256i)a_is_any_zero);

    /*
     * Before returning, see if we need to set any of the processor
     * exception flags.
     *
     * Domain error: a is negative and b is a finite noninteger; we need
     * to raise the invalid-operation flag. This can be done by taking
     * the square root of a negative number.
     *
     * Pole error: a is zero and b is negative; we need to raise the
     * divide-by-zero flag. This can be done by dividing by zero.
     */

    if (a_is_neg_flag && (!b_is_integer_flag)) {
        __m256d volatile invop = _mm256_sqrt_pd(a);
    }

    if (a_is_any_zero_flag && b_is_lt_zero_flag) {
        __m256d volatile divXzero = _mm256_div_pd(ONE, ZERO);
    }

    return t;
}

__m256d __fvd_pow_fma3_256(__m256d const a, __m256d const b)
{
    //////////////////////////////////////////////////// log constants
    __m256d const LOG_POLY_6 = _mm256_set1_pd(LOG_POLY_6_D);
    __m256d const LOG_POLY_5 = _mm256_set1_pd(LOG_POLY_5_D);
    __m256d const LOG_POLY_4 = _mm256_set1_pd(LOG_POLY_4_D);
    __m256d const LOG_POLY_3 = _mm256_set1_pd(LOG_POLY_3_D);
    __m256d const LOG_POLY_2 = _mm256_set1_pd(LOG_POLY_2_D);
    __m256d const LOG_POLY_1 = _mm256_set1_pd(LOG_POLY_1_D);
    __m256d const LOG_POLY_0 = _mm256_set1_pd(LOG_POLY_0_D);

    __m256d const CC_CONST_Y = _mm256_set1_pd(CC_CONST_Y_D);
    __m256d const CC_CONST_X = _mm256_set1_pd(CC_CONST_X_D);

    __m256i const EXPO_MASK = _mm256_set1_epi64x(EXPO_MASK_D);
    __m256d const HI_CONST_1 = (__m256d)_mm256_set1_epi64x(HI_CONST_1_D);
    __m256d const HI_CONST_2 = (__m256d)_mm256_set1_epi64x(HI_CONST_2_D);
    __m256i const HALFIFIER = _mm256_set1_epi64x(HALFIFIER_D);
    __m256i const HI_THRESH = _mm256_set1_epi64x(HI_THRESH_D); // ~sqrt(2)
    __m256d const ONE_F = _mm256_set1_pd(ONE_F_D);
    __m256d const TWO = _mm256_set1_pd(TWO_D);
    __m256d const ZERO = _mm256_set1_pd(ZERO_D);

    __m256d const LN2_HI = _mm256_set1_pd(LN2_HI_D);
    __m256d const LN2_LO = _mm256_set1_pd(LN2_LO_D);
    //////////////////////////////////////////////////////////////////

    //////////////////////////////////////////////////// exp constants
    __m256d const L2E = _mm256_set1_pd(L2E_D);
    __m256d const NEG_LN2_HI = _mm256_set1_pd(NEG_LN2_HI_D);
    __m256d const NEG_LN2_LO = _mm256_set1_pd(NEG_LN2_LO_D);

    __m256d const EXP_POLY_B = _mm256_set1_pd(EXP_POLY_B_D);
    __m256d const EXP_POLY_A = _mm256_set1_pd(EXP_POLY_A_D);
    __m256d const EXP_POLY_9 = _mm256_set1_pd(EXP_POLY_9_D);
    __m256d const EXP_POLY_8 = _mm256_set1_pd(EXP_POLY_8_D);
    __m256d const EXP_POLY_7 = _mm256_set1_pd(EXP_POLY_7_D);
    __m256d const EXP_POLY_6 = _mm256_set1_pd(EXP_POLY_6_D);
    __m256d const EXP_POLY_5 = _mm256_set1_pd(EXP_POLY_5_D);
    __m256d const EXP_POLY_4 = _mm256_set1_pd(EXP_POLY_4_D);
    __m256d const EXP_POLY_3 = _mm256_set1_pd(EXP_POLY_3_D);
    __m256d const EXP_POLY_2 = _mm256_set1_pd(EXP_POLY_2_D);
    __m256d const EXP_POLY_1 = _mm256_set1_pd(EXP_POLY_1_D);
    __m256d const EXP_POLY_0 = _mm256_set1_pd(EXP_POLY_0_D);

    __m256d const DBL2INT_CVT = _mm256_set1_pd(DBL2INT_CVT_D);

    __m256d const UPPERBOUND_1 = (__m256d)_mm256_set1_epi64x(UPPERBOUND_1_D);
    __m256i const HI_ABS_MASK = _mm256_set1_epi64x(HI_ABS_MASK_D);

    //////////////////////////////////////////////////// pow constants
    __m256i const HI_MASK = _mm256_set1_epi64x(HI_MASK_D);
    __m256d const SGN_EXP_MASK = (__m256d)_mm256_set1_epi64x(SGN_EXP_MASK_D);
    __m256i const ABS_MASK = _mm256_set1_epi64x(ABS_MASK_D);
    __m256i const ONE = _mm256_set1_epi64x(ONE_D);

    __m256i const TEN_23 = _mm256_set1_epi64x(TEN_23_D);
    __m256i const ALL_ONES_EXPONENT = _mm256_set1_epi64x(ALL_ONES_EXPONENT_D);


    __m256d abs_a, a_mut;
    __m256d_2 loga;
    __m256d t_hi, t_lo, tmp, e, bloga, prodx;

    __m256d_2 qq, cc, uu, tt;
    __m256d f, g, u, v, q, ulo, m;
    __m256i ihi, expo, expo_plus1;
    __m256d thresh_mask;
    __m256d b_is_one_mask;

    /*
     * Check whether all exponents in vector b are 1.0, and if so take
     * a quick exit.
     * Note: b_is_one_mask is needed later in case some elements in b are 1.0.
     */

    b_is_one_mask = _mm256_cmp_pd(b, ONE_F, _CMP_EQ_OQ);
    if (_mm256_movemask_pd(b_is_one_mask) == 0xf) {
        return a;
    }

    // *****************************************************************************************
    // computing log(abs(a))
    abs_a = _mm256_and_pd(a, (__m256d)ABS_MASK);
    a_mut = _mm256_and_pd(abs_a, HI_CONST_1);
    a_mut = _mm256_or_pd(a_mut, HI_CONST_2);
    m = (__m256d)_mm256_sub_epi32((__m256i)a_mut, HALFIFIER); // divide by 2

    ihi = _mm256_and_si256((__m256i)a_mut, HI_MASK);
    thresh_mask = _mm256_cmp_pd((__m256d)ihi, (__m256d)HI_THRESH, _CMP_GT_OS);
    m = _mm256_blendv_pd(a_mut, m, thresh_mask);

    expo = _mm256_srli_epi64((__m256i)abs_a, D52_D);
    expo = _mm256_sub_epi64(expo, TEN_23);
    expo_plus1 = _mm256_add_epi64(expo, ONE); // add one to exponent instead
    expo = (__m256i)_mm256_blendv_pd((__m256d)expo, (__m256d)expo_plus1, thresh_mask);

    // begin computing log(m)
    f = _mm256_sub_pd(m, ONE_F);
    g = _mm256_add_pd(m, ONE_F);
    g = _mm256_div_pd(ONE_F, g);

    // u = 2.0 * (m - 1.0) / (m + 1.0)
    u = _mm256_mul_pd(_mm256_mul_pd(TWO, f), g);
    v = _mm256_mul_pd(u, u);
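
    // Range reduction uses the identity log(m) = 2*atanh((m-1)/(m+1)):
    // with u = 2*(m-1)/(m+1), log(m) = u + u^3/12 + u^5/80 + ..., an odd
    // series that converges quickly for m near 1. Illustrative scalar form
    // (not compiled):
    //
    //     double u = 2.0 * (m - 1.0) / (m + 1.0);
    //     double v = u * u;
    //     double log_m = u + u * v * P(v);   // P approximates the tail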

    // polynomial used to approximate atanh(v);
    // an Estrin evaluation scheme is used
    __m256d c0 = _mm256_fmadd_pd(LOG_POLY_1, v, LOG_POLY_0);
    __m256d c2 = _mm256_fmadd_pd(LOG_POLY_3, v, LOG_POLY_2);
    __m256d c4 = _mm256_fmadd_pd(LOG_POLY_5, v, LOG_POLY_4);
    __m256d v2 = _mm256_mul_pd(v, v);
    __m256d v4 = _mm256_mul_pd(v2, v2);

    c0 = _mm256_fmadd_pd(c2, v2, c0);
    c4 = _mm256_fmadd_pd(LOG_POLY_6, v2, c4);
    q = _mm256_fmadd_pd(c4, v4, c0);
    q = _mm256_mul_pd(q, v);
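
    // Estrin's scheme pairs coefficients so the FMA chains can execute in
    // parallel instead of serially as in Horner's rule. For this degree-6
    // polynomial in v the grouping is (illustrative scalar sketch, not
    // compiled):
    //
    //     q = (c0 + c1*v) + v2*(c2 + c3*v) + v4*((c4 + c5*v) + v2*c6)
    //
    // with v2 = v*v and v4 = v2*v2, cutting the dependency chain from six
    // sequential FMAs to roughly three.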

    tmp = _mm256_mul_pd(TWO, _mm256_sub_pd(f, u));
    tmp = _mm256_fmadd_pd(neg(u), f, tmp);
    ulo = _mm256_mul_pd(g, tmp);

    // double-double computation begins
    qq.y = q;
    qq.x = ZERO;
    uu.y = u;
    uu.x = ulo;
    cc.y = CC_CONST_Y;
    cc.x = CC_CONST_X;

    qq = __internal_ddadd_yisdouble(cc, qq);

    // computing log(m) in double-double format
    cc.y = _mm256_mul_pd(uu.y, uu.y);
    cc.x = _mm256_fmsub_pd(uu.y, uu.y, cc.y);
    cc.x = _mm256_fmadd_pd(uu.y,
                           (__m256d)_mm256_add_epi32((__m256i)uu.x, HALFIFIER), // 2*uu.x via exponent bump
                           cc.x); // u ** 2

    tt.y = _mm256_mul_pd(cc.y, uu.y);
    tt.x = _mm256_fmsub_pd(cc.y, uu.y, tt.y);
    tt.x = _mm256_fmadd_pd(cc.y, uu.x, tt.x);
    tt.x = _mm256_fmadd_pd(cc.x, uu.y, tt.x); // u ** 3 un-normalized

    uu = __internal_ddfma(qq, tt, uu);

    // computing log(a) = log(m) + log(2)*expo
    f = __internal_fast_int2dbl(expo);
    q = _mm256_fmadd_pd(f, LN2_HI, uu.y);
    tmp = _mm256_fmadd_pd(neg(f), LN2_HI, q);
    tmp = _mm256_sub_pd(tmp, uu.y);
    loga.y = q;
    loga.x = _mm256_sub_pd(uu.x, tmp);
    loga.x = _mm256_fmadd_pd(f, LN2_LO, loga.x);

    // finish log(a)
    // ******************************************************************************************

    // compute b * log(a); prodx carries the rounding error of the
    // product for the exp slow path
    t_hi = _mm256_mul_pd(loga.y, b);
    t_lo = _mm256_fmsub_pd(loga.y, b, t_hi);
    t_lo = _mm256_fmadd_pd(loga.x, b, t_lo);
    bloga = e = _mm256_add_pd(t_hi, t_lo);
    prodx = _mm256_add_pd(_mm256_sub_pd(t_hi, e), t_lo);
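
    // Scalar sketch of the compensated product above (illustrative only,
    // not compiled; assumes fma() from <math.h>):
    //
    //     t_hi  = loga_hi * b;
    //     t_lo  = fma(loga_hi, b, -t_hi);   // exact error of the product
    //     t_lo += loga_lo * b;              // fold in the lo contribution
    //     bloga = t_hi + t_lo;              // best double approximation
    //     prodx = (t_hi - bloga) + t_lo;    // what the final add discarded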

    // ***************************************************************************************
    // computing exp(b * log(a))
    // calculating the exponent; stored in the LO half of each 64-bit block
    __m256i i = (__m256i) _mm256_fmadd_pd(bloga, L2E, DBL2INT_CVT);

    // calculate mantissa
    __m256d t = _mm256_sub_pd((__m256d)i, DBL2INT_CVT);
    __m256d z = _mm256_fmadd_pd(t, NEG_LN2_HI, bloga);
    z = _mm256_fmadd_pd(t, NEG_LN2_LO, z);
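
    // The DBL2INT_CVT add is the usual round-to-int "shifter" trick:
    // adding the large constant forces rounding so the integer lands in
    // the low bits, and subtracting it back recovers the rounded value as
    // a double. Scalar sketch (illustrative only, not compiled):
    //
    //     double s = fma(x, L2E, DBL2INT_CVT); // round(x/ln2) in low bits
    //     double t = s - DBL2INT_CVT;          // round(x/ln2) as a double
    //     double z = fma(t, -LN2_HI, x);       // reduced arg, |z| <= ln2/2
    //     z = fma(t, -LN2_LO, z);              // second word of ln2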

    // use polynomial to calculate exp
    // mixed Estrin/Horner scheme: Estrin on the eight highest coefficients,
    // Horner on the lowest four; provides speedup without loss of precision
    // compared to full Horner
    __m256d t4 = _mm256_fmadd_pd(EXP_POLY_5, z, EXP_POLY_4);
    __m256d t6 = _mm256_fmadd_pd(EXP_POLY_7, z, EXP_POLY_6);
    __m256d t8 = _mm256_fmadd_pd(EXP_POLY_9, z, EXP_POLY_8);
    __m256d t10 = _mm256_fmadd_pd(EXP_POLY_B, z, EXP_POLY_A);

    __m256d z2 = _mm256_mul_pd(z, z);
    __m256d z4 = _mm256_mul_pd(z2, z2);

    t4 = _mm256_fmadd_pd(t6, z2, t4);
    t8 = _mm256_fmadd_pd(t10, z2, t8);
    t4 = _mm256_fmadd_pd(t8, z4, t4);

    t = _mm256_fmadd_pd(t4, z, EXP_POLY_3);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_2);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_1);
    t = _mm256_fmadd_pd(t, z, EXP_POLY_0);

    // fast scale
    __m256i i_scale = _mm256_slli_epi64(i, D52_D);
    __m256d ztemp = z = (__m256d)_mm256_add_epi32(i_scale, (__m256i)t);
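
    // Because t is close to 1.0, adding i << 52 to its bit pattern bumps
    // the biased exponent by i, i.e. multiplies by 2^i with no float
    // multiply. Scalar sketch (illustrative only, not compiled; valid
    // while the exponent field does not overflow -- the slow path covers
    // the remaining cases):
    //
    //     uint64_t scaled = bits(t) + ((uint64_t)n << 52);
    //     double z = as_double(scaled);   // == t * 2^n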

    // slowpath detection for exp
    __m256d abs_bloga = (__m256d)_mm256_and_si256((__m256i)bloga, HI_ABS_MASK);
    int exp_slowmask = _mm256_movemask_pd(_mm256_cmp_pd(abs_bloga, UPPERBOUND_1, _CMP_GE_OS));

    z = _mm256_fmadd_pd(z, prodx, z);

    if (__builtin_expect(exp_slowmask, 0)) {
        z = __pgm_exp_d_vec256_slowpath(i, t, bloga, ztemp, prodx);
    }
    // finished exp(b * log(a))
    // ************************************************************************************************

    // Now special case if some elements of exponent (b) are 1.0.
    z = _mm256_blendv_pd(z, a, b_is_one_mask);

    // compute if we have special cases (inf, nan, etc.); see man pow for the full list
    __m256i detect_inf_nan = (__m256i)_mm256_add_pd(a, b); // check for inf/nan
    __m256i overridemask = _mm256_cmpeq_epi64((__m256i)a, (__m256i)ONE_F); // if a == 1
    __m256i overridemask2 = _mm256_cmpeq_epi64(_mm256_and_si256(detect_inf_nan, ALL_ONES_EXPONENT), ALL_ONES_EXPONENT);
    overridemask = _mm256_or_si256(overridemask, (__m256i)_mm256_cmp_pd(a, ZERO, _CMP_LE_OQ)); // if a <= 0

    int specMask = _mm256_movemask_pd((__m256d)_mm256_or_si256(overridemask, overridemask2));
    if (__builtin_expect(specMask, 0)) {
        return __pgm_pow_d_vec256_special_cases(a, b, z);
    }
    return z;
}