/*
  AVX implementation of sin, cos, sincos, exp and log

  Based on "sse_mathfun.h", by Julien Pommier
  http://gruntthepeon.free.fr/ssemath/

  Copyright (C) 2012 Giovanni Garberoglio
  Interdisciplinary Laboratory for Computational Science (LISC)
  Fondazione Bruno Kessler and University of Trento
  via Sommarive, 18
  I-38123 Trento (Italy)

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

#ifndef AVX_MATHFUN_H
#define AVX_MATHFUN_H

#include <immintrin.h>
#include <emmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else /* gcc or icc */
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif

#define _PI32AVX_CONST(Name, Val) \
  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, Val, Val}

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);

/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val) \
  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
#define _PI32_CONST256(Name, Val) \
  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
#define _PS256_CONST_TYPE(Name, Type, Val) \
  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
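
/* For reference: on gcc/icc, _PS256_CONST(1, 1.0f) expands to

     static const float _ps256_1[8] __attribute__((aligned(32))) =
       { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

   The 32-byte alignment lets the code below load these tables with a
   plain aligned dereference, *(__m256*)_ps256_1. */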

_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1f);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1f);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1f);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1f);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1f);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1f);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1f);
_PS256_CONST(cephes_log_q1, -2.12194440e-4f);
_PS256_CONST(cephes_log_q2, 0.693359375f);

#ifndef __AVX2__

typedef union imm_xmm_union
{
  __m256i imm;
  __m128i xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)    \
  {                                            \
    ALIGN32_BEG imm_xmm_union u ALIGN32_END;   \
    u.imm = imm_;                              \
    xmm0_ = u.xmm[0];                          \
    xmm1_ = u.xmm[1];                          \
  }

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)    \
  {                                            \
    ALIGN32_BEG imm_xmm_union u ALIGN32_END;   \
    u.xmm[0] = xmm0_;                          \
    u.xmm[1] = xmm1_;                          \
    imm_ = u.imm;                              \
  }

#define AVX2_BITOP_USING_SSE2(fn)                          \
  static inline __m256i _mm256_##fn(__m256i x, int a)      \
  {                                                        \
    /* use SSE2 instructions to emulate the AVX2 bitop */  \
    __m128i x1, x2;                                        \
    __m256i ret;                                           \
    COPY_IMM_TO_XMM(x, x1, x2);                            \
    x1 = _mm_##fn(x1, a);                                  \
    x2 = _mm_##fn(x2, a);                                  \
    COPY_XMM_TO_IMM(x1, x2, ret);                          \
    return (ret);                                          \
  }

#ifdef _MSC_VER
#pragma message("Using SSE2 to perform AVX2 bitshift ops")
#else
#warning "Using SSE2 to perform AVX2 bitshift ops"
#endif
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)

#define AVX2_INTOP_USING_SSE2(fn)                                     \
  static inline __m256i _mm256_##fn(__m256i x, __m256i y)             \
  {                                                                   \
    /* use SSE2 instructions to perform the AVX2 integer operation */ \
    __m128i x1, x2;                                                   \
    __m128i y1, y2;                                                   \
    __m256i ret;                                                      \
    COPY_IMM_TO_XMM(x, x1, x2);                                       \
    COPY_IMM_TO_XMM(y, y1, y2);                                       \
    x1 = _mm_##fn(x1, y1);                                            \
    x2 = _mm_##fn(x2, y2);                                            \
    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
    return (ret);                                                     \
  }

#ifdef _MSC_VER
#pragma message("Using SSE2 to perform AVX2 integer ops")
#else
#warning "Using SSE2 to perform AVX2 integer ops"
#endif
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)

#endif /* __AVX2__ */

/* natural logarithm computed for 8 simultaneous floats
   return NaN for x <= 0
*/
static inline __m256 log256_ps(__m256 x)
{
  __m256i imm0;
  __m256 one = *(__m256*)_ps256_1;

  //__m256 invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  __m256 invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(__m256*)_ps256_min_norm_pos); /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the mantissa part */
  x = _mm256_and_ps(x, *(__m256*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(__m256*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(__m256i*)_pi32_256_0x7f);
  __m256 e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //__m256 mask = _mm256_cmplt_ps(x, *(__m256*)_ps256_cephes_SQRTHF);
  __m256 mask = _mm256_cmp_ps(x, *(__m256*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  __m256 tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  __m256 z = _mm256_mul_ps(x, x);

  __m256 y = *(__m256*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  y = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return y;
}
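
/* Usage sketch (illustrative, not part of the original header): feed eight
   floats through log256_ps and read the results back. As noted above,
   inputs <= 0 come back as NaN.

   #include <stdio.h>

   static void demo_log256(void)
   {
     float in[8]  = { 0.5f, 1.0f, 2.0f, 2.71828183f, 10.0f, 1e-4f, 0.0f, -1.0f };
     float out[8];
     _mm256_storeu_ps(out, log256_ps(_mm256_loadu_ps(in)));
     for (int i = 0; i < 8; i++)
       printf("log(%g) = %g\n", in[i], out[i]);   // last two lanes print nan
   }
*/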

_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS256_CONST(cephes_exp_C1, 0.693359375f);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4f);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f);

static inline __m256 exp256_ps(__m256 x)
{
  __m256 tmp = _mm256_setzero_ps(), fx;
  __m256i imm0;
  __m256 one = *(__m256*)_ps256_1;

  x = _mm256_min_ps(x, *(__m256*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(__m256*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(__m256*)_ps256_0p5);

  /* the SSE2 way of doing a floorf is shown commented out below;
     with AVX we can simply use _mm256_floor_ps */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //__m256 mask = _mm256_cmpgt_ps(tmp, fx);
  __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C1);
  __m256 z = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x, x);

  __m256 y = *(__m256*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(__m256i*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  __m256 pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
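
/* Usage sketch (illustrative, not part of the original header): the argument
   is clamped to [exp_lo, exp_hi] = [-88.38, +88.38], so out-of-range inputs
   saturate instead of overflowing to inf or underflowing to 0.

   static void demo_exp256(void)
   {
     float in[8]  = { 0.0f, 1.0f, -1.0f, 10.0f, -10.0f, 80.0f, 1000.0f, -1000.0f };
     float out[8];
     _mm256_storeu_ps(out, exp256_ps(_mm256_loadu_ps(in)));
     // out[0] ~ 1, out[1] ~ 2.71828, ...; the last two lanes are computed
     // as exp(+88.376) and exp(-88.376) because of the clamp
   }
*/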

_PS256_CONST(minus_cephes_DP1, -0.78515625f);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
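/* minus_cephes_DP1..DP3 sum to -(pi/4) split across three floats, so the
   argument reduction below can subtract y*pi/4 from x in three steps that
   lose almost no precision -- the "extended precision modular arithmetic"
   referred to in the comments. */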
_PS256_CONST(sincof_p0, -1.9515295891E-4f);
_PS256_CONST(sincof_p1, 8.3321608736E-3f);
_PS256_CONST(sincof_p2, -1.6666654611E-1f);
_PS256_CONST(coscof_p0, 2.443315711809948E-005f);
_PS256_CONST(coscof_p1, -1.388731625493765E-003f);
_PS256_CONST(coscof_p2, 4.166664568298827E-002f);
_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

/* evaluation of 8 sines at once using AVX intrinsics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater
   values -- the function does not return garbage for arguments
   above 8192, but the extra precision is missing).

   Note that sin256_ps((float)M_PI) = 8.74e-8, which is a surprising
   but correct result: (float)M_PI differs from the mathematical pi
   by roughly that amount.

*/
static inline __m256 sin256_ps(__m256 x)
{ // any x
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  __m256i imm0, imm2;

#ifndef __AVX2__
  __m128i imm0_1, imm0_2;
  __m128i imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(__m256*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX2, let's perform them using SSE2 directives.
  */

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instructions
  imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  __m256 swap_sign_bit = _mm256_castsi256_ps(imm0);
  __m256 poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
  xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
  xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(__m256*)_ps256_coscof_p0;
  __m256 z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  __m256 y2 = *(__m256*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
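
/* Usage sketch (illustrative, not part of the original header):

   static void demo_sin256(void)
   {
     float in[8]  = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, -0.5f, 3.14159265f, 100.0f };
     float out[8];
     _mm256_storeu_ps(out, sin256_ps(_mm256_loadu_ps(in)));
     // each out[i] ~ sinf(in[i]), with full precision for |x| < 8192
   }
*/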

/* almost the same as sin_ps */
static inline __m256 cos256_ps(__m256 x)
{ // any x
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i imm0, imm2;

#ifndef __AVX2__
  __m128i imm0_1, imm0_2;
  __m128i imm2_1, imm2_2;
#endif

  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(__m256i*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 = _mm256_andnot_si256(imm2, *(__m256i*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask */
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
#else

  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm2_1 = _mm_sub_epi32(imm2_1, *(__m128i*)_pi32avx_2);
  imm2_2 = _mm_sub_epi32(imm2_2, *(__m128i*)_pi32avx_2);

  imm0_1 = _mm_andnot_si128(imm2_1, *(__m128i*)_pi32avx_4);
  imm0_2 = _mm_andnot_si128(imm2_2, *(__m128i*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  __m256 sign_bit = _mm256_castsi256_ps(imm0);
  __m256 poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
  xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
  xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(__m256*)_ps256_coscof_p0;
  __m256 z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  __m256 y2 = *(__m256*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}

/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
   replace both of them... it is almost as fast, and gives you a free cosine
   with your sine */
static inline void sincos256_ps(__m256 x, __m256* s, __m256* c)
{
  __m256 xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  __m256i imm0, imm2, imm4;

#ifndef __AVX2__
  __m128i imm0_1, imm0_2;
  __m128i imm2_1, imm2_2;
  __m128i imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //__m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
  //__m256 poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  __m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  __m256 poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
  xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
  xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(__m256i*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(__m256i*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  __m256 sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  __m256 z = _mm256_mul_ps(x, x);
  y = *(__m256*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  __m256 y2 = *(__m256*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  __m256 ysin2 = _mm256_and_ps(xmm3, y2);
  __m256 ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2, ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1, ysin2);
  xmm2 = _mm256_add_ps(y, y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
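
/* Usage sketch (illustrative, not part of the original header): one call
   yields both the sine and the cosine of each lane.

   static void demo_sincos256(void)
   {
     __m256 s, c;
     sincos256_ps(_mm256_set1_ps(0.78539816f), &s, &c);  // pi/4 in all lanes
     // every lane of s and c is ~0.7071068, and s*s + c*c ~ 1
   }
*/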

static inline __m256 pow_ps(__m256 a, __m256 b)
{
  // pow(a, b) = exp(b * log(a)), valid for a > 0
  // (log256_ps returns NaN for a <= 0, so pow_ps does too)
  return exp256_ps(_mm256_mul_ps(b, log256_ps(a)));
}
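
/* Usage sketch (illustrative, not part of the original header):

   static void demo_pow256(void)
   {
     __m256 base = _mm256_set1_ps(2.0f);
     __m256 expo = _mm256_set1_ps(10.0f);
     __m256 r = pow_ps(base, expo);  // ~1024 in every lane
     (void)r;
   }
*/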

#endif // AVX_MATHFUN_H