1 /*
2    AVX implementation of sin, cos, sincos, exp and log
3 
4    Based on "sse_mathfun.h", by Julien Pommier
5    http://gruntthepeon.free.fr/ssemath/
6 
7    Copyright (C) 2012 Giovanni Garberoglio
8    Interdisciplinary Laboratory for Computational Science (LISC)
9    Fondazione Bruno Kessler and University of Trento
10    via Sommarive, 18
11    I-38123 Trento (Italy)
12 
13   This software is provided 'as-is', without any express or implied
14   warranty.  In no event will the authors be held liable for any damages
15   arising from the use of this software.
16 
17   Permission is granted to anyone to use this software for any purpose,
18   including commercial applications, and to alter it and redistribute it
19   freely, subject to the following restrictions:
20 
21   1. The origin of this software must not be misrepresented; you must not
22      claim that you wrote the original software. If you use this software
23      in a product, an acknowledgment in the product documentation would be
24      appreciated but is not required.
25   2. Altered source versions must be plainly marked as such, and must not be
26      misrepresented as being the original software.
27   3. This notice may not be removed or altered from any source distribution.
28 
29   (this is the zlib license)
30 */
31 
32 #ifndef AVX_MATHFUN_H
33 #define AVX_MATHFUN_H
34 
35 #include <immintrin.h>
36 #include <emmintrin.h>
37 
38 /* yes I know, the top of this file is quite ugly */
39 
40 #ifdef _MSC_VER /* visual c++ */
41 #define ALIGN32_BEG __declspec(align(32))
42 #define ALIGN32_END
43 #else /* gcc or icc */
44 #define ALIGN32_BEG
45 #define ALIGN32_END __attribute__((aligned(32)))
46 #endif
47 
48 #define _PI32AVX_CONST(Name, Val) \
49     static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, Val, Val}
50 
51 _PI32AVX_CONST(1, 1);
52 _PI32AVX_CONST(inv1, ~1);
53 _PI32AVX_CONST(2, 2);
54 _PI32AVX_CONST(4, 4);
55 
56 /* declare some AVX constants -- why can't I figure a better way to do that? */
57 #define _PS256_CONST(Name, Val) \
58     static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
59 #define _PI32_CONST256(Name, Val) \
60     static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
61 #define _PS256_CONST_TYPE(Name, Type, Val) \
62     static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = {Val, Val, Val, Val, Val, Val, Val, Val}
63 
64 _PS256_CONST(1, 1.0f);
65 _PS256_CONST(0p5, 0.5f);
66 /* the smallest non denormalized float number */
67 _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
68 _PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
69 _PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
70 
71 _PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
72 _PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
73 
74 _PI32_CONST256(0, 0);
75 _PI32_CONST256(1, 1);
76 _PI32_CONST256(inv1, ~1);
77 _PI32_CONST256(2, 2);
78 _PI32_CONST256(4, 4);
79 _PI32_CONST256(0x7f, 0x7f);
80 
81 _PS256_CONST(cephes_SQRTHF, 0.707106781186547524f);
82 _PS256_CONST(cephes_log_p0, 7.0376836292E-2f);
83 _PS256_CONST(cephes_log_p1, -1.1514610310E-1f);
84 _PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
85 _PS256_CONST(cephes_log_p3, -1.2420140846E-1f);
86 _PS256_CONST(cephes_log_p4, +1.4249322787E-1f);
87 _PS256_CONST(cephes_log_p5, -1.6668057665E-1f);
88 _PS256_CONST(cephes_log_p6, +2.0000714765E-1f);
89 _PS256_CONST(cephes_log_p7, -2.4999993993E-1f);
90 _PS256_CONST(cephes_log_p8, +3.3333331174E-1f);
91 _PS256_CONST(cephes_log_q1, -2.12194440e-4f);
92 _PS256_CONST(cephes_log_q2, 0.693359375f);
93 
94 #ifndef __AVX2__
95 
96 typedef union imm_xmm_union
97 {
98     __m256i imm;
99     __m128i xmm[2];
100 } imm_xmm_union;
101 
102 #define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)      \
103     {                                            \
104         ALIGN32_BEG imm_xmm_union u ALIGN32_END; \
105         u.imm = imm_;                            \
106         xmm0_ = u.xmm[0];                        \
107         xmm1_ = u.xmm[1];                        \
108     }
109 
110 #define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)      \
111     {                                            \
112         ALIGN32_BEG imm_xmm_union u ALIGN32_END; \
113         u.xmm[0] = xmm0_;                        \
114         u.xmm[1] = xmm1_;                        \
115         imm_ = u.imm;                            \
116     }
117 
118 #define AVX2_BITOP_USING_SSE2(fn)                            \
119     static inline __m256i _mm256_##fn(__m256i x, int a)      \
120     {                                                        \
121         /* use SSE2 instruction to perform the bitop AVX2 */ \
122         __m128i x1, x2;                                      \
123         __m256i ret;                                         \
124         COPY_IMM_TO_XMM(x, x1, x2);                          \
125         x1 = _mm_##fn(x1, a);                                \
126         x2 = _mm_##fn(x2, a);                                \
127         COPY_XMM_TO_IMM(x1, x2, ret);                        \
128         return (ret);                                        \
129     }
130 
131 #if _MSC_VER
132 #pragma WARNING(Using SSE2 to perform AVX2 bitshift ops)
133 #else
134 #warning "Using SSE2 to perform AVX2 bitshift ops"
135 #endif
136 AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)137 AVX2_BITOP_USING_SSE2(srli_epi32)
138 
139 #define AVX2_INTOP_USING_SSE2(fn)                                         \
140     static inline __m256i _mm256_##fn(__m256i x, __m256i y)               \
141     {                                                                     \
142         /* use SSE2 instructions to perform the AVX2 integer operation */ \
143         __m128i x1, x2;                                                   \
144         __m128i y1, y2;                                                   \
145         __m256i ret;                                                      \
146         COPY_IMM_TO_XMM(x, x1, x2);                                       \
147         COPY_IMM_TO_XMM(y, y1, y2);                                       \
148         x1 = _mm_##fn(x1, y1);                                            \
149         x2 = _mm_##fn(x2, y2);                                            \
150         COPY_XMM_TO_IMM(x1, x2, ret);                                     \
151         return (ret);                                                     \
152     }
153 
154 #if _MSC_VER
155 #pragma WARNING(Using SSE2 to perform AVX2 bitshift ops)
156 #else
157 #warning "Using SSE2 to perform AVX2 integer ops"
158 #endif
159 AVX2_INTOP_USING_SSE2(and_si128)
160 AVX2_INTOP_USING_SSE2(andnot_si128)
161 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
162 AVX2_INTOP_USING_SSE2(sub_epi32)
163 AVX2_INTOP_USING_SSE2(add_epi32)
164 
165 #endif /* __AVX2__ */
166 
167 /* natural logarithm computed for 8 simultaneous float
168    return NaN for x <= 0
169 */
170 static inline __m256 log256_ps(__m256 x)
171 {
172     __m256i imm0;
173     __m256 one = *(__m256*)_ps256_1;
174 
175     //__m256 invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
176     __m256 invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
177 
178     x = _mm256_max_ps(x, *(__m256*)_ps256_min_norm_pos); /* cut off denormalized stuff */
179 
180     // can be done with AVX2
181     imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);
182 
183     /* keep only the fractional part */
184     x = _mm256_and_ps(x, *(__m256*)_ps256_inv_mant_mask);
185     x = _mm256_or_ps(x, *(__m256*)_ps256_0p5);
186 
187     // this is again another AVX2 instruction
188     imm0 = _mm256_sub_epi32(imm0, *(__m256i*)_pi32_256_0x7f);
189     __m256 e = _mm256_cvtepi32_ps(imm0);
190 
191     e = _mm256_add_ps(e, one);
192 
193     /* part2:
194        if( x < SQRTHF ) {
195          e -= 1;
196          x = x + x - 1.0;
197        } else { x = x - 1.0; }
198     */
199     //__m256 mask = _mm256_cmplt_ps(x, *(__m256*)_ps256_cephes_SQRTHF);
200     __m256 mask = _mm256_cmp_ps(x, *(__m256*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
201     __m256 tmp = _mm256_and_ps(x, mask);
202     x = _mm256_sub_ps(x, one);
203     e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
204     x = _mm256_add_ps(x, tmp);
205 
206     __m256 z = _mm256_mul_ps(x, x);
207 
208     __m256 y = *(__m256*)_ps256_cephes_log_p0;
209     y = _mm256_mul_ps(y, x);
210     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p1);
211     y = _mm256_mul_ps(y, x);
212     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p2);
213     y = _mm256_mul_ps(y, x);
214     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p3);
215     y = _mm256_mul_ps(y, x);
216     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p4);
217     y = _mm256_mul_ps(y, x);
218     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p5);
219     y = _mm256_mul_ps(y, x);
220     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p6);
221     y = _mm256_mul_ps(y, x);
222     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p7);
223     y = _mm256_mul_ps(y, x);
224     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p8);
225     y = _mm256_mul_ps(y, x);
226 
227     y = _mm256_mul_ps(y, z);
228 
229     tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q1);
230     y = _mm256_add_ps(y, tmp);
231 
232     tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
233     y = _mm256_sub_ps(y, tmp);
234 
235     tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q2);
236     x = _mm256_add_ps(x, y);
237     x = _mm256_add_ps(x, tmp);
238     y = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
239     return y;
240 }
241 
242 _PS256_CONST(exp_hi, 88.3762626647949f);
243 _PS256_CONST(exp_lo, -88.3762626647949f);
244 
245 _PS256_CONST(cephes_LOG2EF, 1.44269504088896341f);
246 _PS256_CONST(cephes_exp_C1, 0.693359375f);
247 _PS256_CONST(cephes_exp_C2, -2.12194440e-4f);
248 
249 _PS256_CONST(cephes_exp_p0, 1.9875691500E-4f);
250 _PS256_CONST(cephes_exp_p1, 1.3981999507E-3f);
251 _PS256_CONST(cephes_exp_p2, 8.3334519073E-3f);
252 _PS256_CONST(cephes_exp_p3, 4.1665795894E-2f);
253 _PS256_CONST(cephes_exp_p4, 1.6666665459E-1f);
254 _PS256_CONST(cephes_exp_p5, 5.0000001201E-1f);
255 
exp256_ps(__m256 x)256 static inline __m256 exp256_ps(__m256 x)
257 {
258     __m256 tmp = _mm256_setzero_ps(), fx;
259     __m256i imm0;
260     __m256 one = *(__m256*)_ps256_1;
261 
262     x = _mm256_min_ps(x, *(__m256*)_ps256_exp_hi);
263     x = _mm256_max_ps(x, *(__m256*)_ps256_exp_lo);
264 
265     /* express exp(x) as exp(g + n*log(2)) */
266     fx = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_LOG2EF);
267     fx = _mm256_add_ps(fx, *(__m256*)_ps256_0p5);
268 
269     /* how to perform a floorf with SSE: just below */
270     //imm0 = _mm256_cvttps_epi32(fx);
271     //tmp  = _mm256_cvtepi32_ps(imm0);
272 
273     tmp = _mm256_floor_ps(fx);
274 
275     /* if greater, subtract 1 */
276     //__m256 mask = _mm256_cmpgt_ps(tmp, fx);
277     __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
278     mask = _mm256_and_ps(mask, one);
279     fx = _mm256_sub_ps(tmp, mask);
280 
281     tmp = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C1);
282     __m256 z = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C2);
283     x = _mm256_sub_ps(x, tmp);
284     x = _mm256_sub_ps(x, z);
285 
286     z = _mm256_mul_ps(x, x);
287 
288     __m256 y = *(__m256*)_ps256_cephes_exp_p0;
289     y = _mm256_mul_ps(y, x);
290     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p1);
291     y = _mm256_mul_ps(y, x);
292     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p2);
293     y = _mm256_mul_ps(y, x);
294     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p3);
295     y = _mm256_mul_ps(y, x);
296     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p4);
297     y = _mm256_mul_ps(y, x);
298     y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p5);
299     y = _mm256_mul_ps(y, z);
300     y = _mm256_add_ps(y, x);
301     y = _mm256_add_ps(y, one);
302 
303     /* build 2^n */
304     imm0 = _mm256_cvttps_epi32(fx);
305     // another two AVX2 instructions
306     imm0 = _mm256_add_epi32(imm0, *(__m256i*)_pi32_256_0x7f);
307     imm0 = _mm256_slli_epi32(imm0, 23);
308     __m256 pow2n = _mm256_castsi256_ps(imm0);
309     y = _mm256_mul_ps(y, pow2n);
310     return y;
311 }
312 
313 _PS256_CONST(minus_cephes_DP1, -0.78515625f);
314 _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
315 _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
316 _PS256_CONST(sincof_p0, -1.9515295891E-4f);
317 _PS256_CONST(sincof_p1, 8.3321608736E-3f);
318 _PS256_CONST(sincof_p2, -1.6666654611E-1f);
319 _PS256_CONST(coscof_p0, 2.443315711809948E-005f);
320 _PS256_CONST(coscof_p1, -1.388731625493765E-003f);
321 _PS256_CONST(coscof_p2, 4.166664568298827E-002f);
322 _PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
323 
324 /* evaluation of 8 sines at onces using AVX intrisics
325 
326    The code is the exact rewriting of the cephes sinf function.
327    Precision is excellent as long as x < 8192 (I did not bother to
328    take into account the special handling they have for greater values
329    -- it does not return garbage for arguments over 8192, though, but
330    the extra precision is missing).
331 
332    Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
333    surprising but correct result.
334 
335 */
sin256_ps(__m256 x)336 static inline __m256 sin256_ps(__m256 x)
337 {   // any x
338     __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
339     __m256i imm0, imm2;
340 
341 #ifndef __AVX2__
342     __m128i imm0_1, imm0_2;
343     __m128i imm2_1, imm2_2;
344 #endif
345 
346     sign_bit = x;
347     /* take the absolute value */
348     x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);
349     /* extract the sign bit (upper one) */
350     sign_bit = _mm256_and_ps(sign_bit, *(__m256*)_ps256_sign_mask);
351 
352     /* scale by 4/Pi */
353     y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);
354 
355     /*
356       Here we start a series of integer operations, which are in the
357       realm of AVX2.
358       If we don't have AVX, let's perform them using SSE2 directives
359     */
360 
361 #ifdef __AVX2__
362     /* store the integer part of y in mm0 */
363     imm2 = _mm256_cvttps_epi32(y);
364     /* j=(j+1) & (~1) (see the cephes sources) */
365     // another two AVX2 instruction
366     imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
367     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);
368     y = _mm256_cvtepi32_ps(imm2);
369 
370     /* get the swap sign flag */
371     imm0 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_4);
372     imm0 = _mm256_slli_epi32(imm0, 29);
373     /* get the polynom selection mask
374        there is one polynom for 0 <= x <= Pi/4
375        and another one for Pi/4<x<=Pi/2
376 
377        Both branches will be computed.
378     */
379     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
380     imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
381 #else
382     /* we use SSE2 routines to perform the integer ops */
383     COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
384 
385     imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
386     imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);
387 
388     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
389     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);
390 
391     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
392     y = _mm256_cvtepi32_ps(imm2);
393 
394     imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
395     imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);
396 
397     imm0_1 = _mm_slli_epi32(imm0_1, 29);
398     imm0_2 = _mm_slli_epi32(imm0_2, 29);
399 
400     COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
401 
402     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
403     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);
404 
405     imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
406     imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
407 
408     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
409 #endif
410 
411     __m256 swap_sign_bit = _mm256_castsi256_ps(imm0);
412     __m256 poly_mask = _mm256_castsi256_ps(imm2);
413     sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
414 
415     /* The magic pass: "Extended precision modular arithmetic"
416        x = ((x - y * DP1) - y * DP2) - y * DP3; */
417     xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
418     xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
419     xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
420     xmm1 = _mm256_mul_ps(y, xmm1);
421     xmm2 = _mm256_mul_ps(y, xmm2);
422     xmm3 = _mm256_mul_ps(y, xmm3);
423     x = _mm256_add_ps(x, xmm1);
424     x = _mm256_add_ps(x, xmm2);
425     x = _mm256_add_ps(x, xmm3);
426 
427     /* Evaluate the first polynom  (0 <= x <= Pi/4) */
428     y = *(__m256*)_ps256_coscof_p0;
429     __m256 z = _mm256_mul_ps(x, x);
430 
431     y = _mm256_mul_ps(y, z);
432     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
433     y = _mm256_mul_ps(y, z);
434     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
435     y = _mm256_mul_ps(y, z);
436     y = _mm256_mul_ps(y, z);
437     __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
438     y = _mm256_sub_ps(y, tmp);
439     y = _mm256_add_ps(y, *(__m256*)_ps256_1);
440 
441     /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
442 
443     __m256 y2 = *(__m256*)_ps256_sincof_p0;
444     y2 = _mm256_mul_ps(y2, z);
445     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
446     y2 = _mm256_mul_ps(y2, z);
447     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
448     y2 = _mm256_mul_ps(y2, z);
449     y2 = _mm256_mul_ps(y2, x);
450     y2 = _mm256_add_ps(y2, x);
451 
452     /* select the correct result from the two polynoms */
453     xmm3 = poly_mask;
454     y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
455     y = _mm256_andnot_ps(xmm3, y);
456     y = _mm256_add_ps(y, y2);
457     /* update the sign */
458     y = _mm256_xor_ps(y, sign_bit);
459 
460     return y;
461 }
462 
463 /* almost the same as sin_ps */
cos256_ps(__m256 x)464 static inline __m256 cos256_ps(__m256 x)
465 {   // any x
466     __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
467     __m256i imm0, imm2;
468 
469 #ifndef __AVX2__
470     __m128i imm0_1, imm0_2;
471     __m128i imm2_1, imm2_2;
472 #endif
473 
474     /* take the absolute value */
475     x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);
476 
477     /* scale by 4/Pi */
478     y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);
479 
480 #ifdef __AVX2__
481     /* store the integer part of y in mm0 */
482     imm2 = _mm256_cvttps_epi32(y);
483     /* j=(j+1) & (~1) (see the cephes sources) */
484     imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
485     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);
486     y = _mm256_cvtepi32_ps(imm2);
487     imm2 = _mm256_sub_epi32(imm2, *(__m256i*)_pi32_256_2);
488 
489     /* get the swap sign flag */
490     imm0 = _mm256_andnot_si256(imm2, *(__m256i*)_pi32_256_4);
491     imm0 = _mm256_slli_epi32(imm0, 29);
492     /* get the polynom selection mask */
493     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
494     imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
495 #else
496 
497     /* we use SSE2 routines to perform the integer ops */
498     COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
499 
500     imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
501     imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);
502 
503     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
504     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);
505 
506     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
507     y = _mm256_cvtepi32_ps(imm2);
508 
509     imm2_1 = _mm_sub_epi32(imm2_1, *(__m128i*)_pi32avx_2);
510     imm2_2 = _mm_sub_epi32(imm2_2, *(__m128i*)_pi32avx_2);
511 
512     imm0_1 = _mm_andnot_si128(imm2_1, *(__m128i*)_pi32avx_4);
513     imm0_2 = _mm_andnot_si128(imm2_2, *(__m128i*)_pi32avx_4);
514 
515     imm0_1 = _mm_slli_epi32(imm0_1, 29);
516     imm0_2 = _mm_slli_epi32(imm0_2, 29);
517 
518     COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
519 
520     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
521     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);
522 
523     imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
524     imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
525 
526     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
527 #endif
528 
529     __m256 sign_bit = _mm256_castsi256_ps(imm0);
530     __m256 poly_mask = _mm256_castsi256_ps(imm2);
531 
532     /* The magic pass: "Extended precision modular arithmetic"
533        x = ((x - y * DP1) - y * DP2) - y * DP3; */
534     xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
535     xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
536     xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
537     xmm1 = _mm256_mul_ps(y, xmm1);
538     xmm2 = _mm256_mul_ps(y, xmm2);
539     xmm3 = _mm256_mul_ps(y, xmm3);
540     x = _mm256_add_ps(x, xmm1);
541     x = _mm256_add_ps(x, xmm2);
542     x = _mm256_add_ps(x, xmm3);
543 
544     /* Evaluate the first polynom  (0 <= x <= Pi/4) */
545     y = *(__m256*)_ps256_coscof_p0;
546     __m256 z = _mm256_mul_ps(x, x);
547 
548     y = _mm256_mul_ps(y, z);
549     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
550     y = _mm256_mul_ps(y, z);
551     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
552     y = _mm256_mul_ps(y, z);
553     y = _mm256_mul_ps(y, z);
554     __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
555     y = _mm256_sub_ps(y, tmp);
556     y = _mm256_add_ps(y, *(__m256*)_ps256_1);
557 
558     /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
559 
560     __m256 y2 = *(__m256*)_ps256_sincof_p0;
561     y2 = _mm256_mul_ps(y2, z);
562     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
563     y2 = _mm256_mul_ps(y2, z);
564     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
565     y2 = _mm256_mul_ps(y2, z);
566     y2 = _mm256_mul_ps(y2, x);
567     y2 = _mm256_add_ps(y2, x);
568 
569     /* select the correct result from the two polynoms */
570     xmm3 = poly_mask;
571     y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
572     y = _mm256_andnot_ps(xmm3, y);
573     y = _mm256_add_ps(y, y2);
574     /* update the sign */
575     y = _mm256_xor_ps(y, sign_bit);
576 
577     return y;
578 }
579 
580 /* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them..
581    it is almost as fast, and gives you a free cosine with your sine */
sincos256_ps(__m256 x,__m256 * s,__m256 * c)582 static inline void sincos256_ps(__m256 x, __m256* s, __m256* c)
583 {
584     __m256 xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
585     __m256i imm0, imm2, imm4;
586 
587 #ifndef __AVX2__
588     __m128i imm0_1, imm0_2;
589     __m128i imm2_1, imm2_2;
590     __m128i imm4_1, imm4_2;
591 #endif
592 
593     sign_bit_sin = x;
594     /* take the absolute value */
595     x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask);
596     /* extract the sign bit (upper one) */
597     sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps256_sign_mask);
598 
599     /* scale by 4/Pi */
600     y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI);
601 
602 #ifdef __AVX2__
603     /* store the integer part of y in imm2 */
604     imm2 = _mm256_cvttps_epi32(y);
605 
606     /* j=(j+1) & (~1) (see the cephes sources) */
607     imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1);
608     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1);
609 
610     y = _mm256_cvtepi32_ps(imm2);
611     imm4 = imm2;
612 
613     /* get the swap sign flag for the sine */
614     imm0 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_4);
615     imm0 = _mm256_slli_epi32(imm0, 29);
616     //__m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
617 
618     /* get the polynom selection mask for the sine*/
619     imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_2);
620     imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i*)_pi32_256_0);
621     //__m256 poly_mask = _mm256_castsi256_ps(imm2);
622 #else
623     /* we use SSE2 routines to perform the integer ops */
624     COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
625 
626     imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
627     imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);
628 
629     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
630     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);
631 
632     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
633     y = _mm256_cvtepi32_ps(imm2);
634 
635     imm4_1 = imm2_1;
636     imm4_2 = imm2_2;
637 
638     imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
639     imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);
640 
641     imm0_1 = _mm_slli_epi32(imm0_1, 29);
642     imm0_2 = _mm_slli_epi32(imm0_2, 29);
643 
644     COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
645 
646     imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
647     imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);
648 
649     imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
650     imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
651 
652     COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
653 #endif
654     __m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
655     __m256 poly_mask = _mm256_castsi256_ps(imm2);
656 
657     /* The magic pass: "Extended precision modular arithmetic"
658        x = ((x - y * DP1) - y * DP2) - y * DP3; */
659     xmm1 = *(__m256*)_ps256_minus_cephes_DP1;
660     xmm2 = *(__m256*)_ps256_minus_cephes_DP2;
661     xmm3 = *(__m256*)_ps256_minus_cephes_DP3;
662     xmm1 = _mm256_mul_ps(y, xmm1);
663     xmm2 = _mm256_mul_ps(y, xmm2);
664     xmm3 = _mm256_mul_ps(y, xmm3);
665     x = _mm256_add_ps(x, xmm1);
666     x = _mm256_add_ps(x, xmm2);
667     x = _mm256_add_ps(x, xmm3);
668 
669 #ifdef __AVX2__
670     imm4 = _mm256_sub_epi32(imm4, *(__m256i*)_pi32_256_2);
671     imm4 = _mm256_andnot_si256(imm4, *(__m256i*)_pi32_256_4);
672     imm4 = _mm256_slli_epi32(imm4, 29);
673 #else
674     imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
675     imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);
676 
677     imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
678     imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);
679 
680     imm4_1 = _mm_slli_epi32(imm4_1, 29);
681     imm4_2 = _mm_slli_epi32(imm4_2, 29);
682 
683     COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
684 #endif
685 
686     __m256 sign_bit_cos = _mm256_castsi256_ps(imm4);
687 
688     sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
689 
690     /* Evaluate the first polynom  (0 <= x <= Pi/4) */
691     __m256 z = _mm256_mul_ps(x, x);
692     y = *(__m256*)_ps256_coscof_p0;
693 
694     y = _mm256_mul_ps(y, z);
695     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p1);
696     y = _mm256_mul_ps(y, z);
697     y = _mm256_add_ps(y, *(__m256*)_ps256_coscof_p2);
698     y = _mm256_mul_ps(y, z);
699     y = _mm256_mul_ps(y, z);
700     __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5);
701     y = _mm256_sub_ps(y, tmp);
702     y = _mm256_add_ps(y, *(__m256*)_ps256_1);
703 
704     /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
705 
706     __m256 y2 = *(__m256*)_ps256_sincof_p0;
707     y2 = _mm256_mul_ps(y2, z);
708     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p1);
709     y2 = _mm256_mul_ps(y2, z);
710     y2 = _mm256_add_ps(y2, *(__m256*)_ps256_sincof_p2);
711     y2 = _mm256_mul_ps(y2, z);
712     y2 = _mm256_mul_ps(y2, x);
713     y2 = _mm256_add_ps(y2, x);
714 
715     /* select the correct result from the two polynoms */
716     xmm3 = poly_mask;
717     __m256 ysin2 = _mm256_and_ps(xmm3, y2);
718     __m256 ysin1 = _mm256_andnot_ps(xmm3, y);
719     y2 = _mm256_sub_ps(y2, ysin2);
720     y = _mm256_sub_ps(y, ysin1);
721 
722     xmm1 = _mm256_add_ps(ysin1, ysin2);
723     xmm2 = _mm256_add_ps(y, y2);
724 
725     /* update the sign */
726     *s = _mm256_xor_ps(xmm1, sign_bit_sin);
727     *c = _mm256_xor_ps(xmm2, sign_bit_cos);
728 }
729 
pow_ps(__m256 a,__m256 b)730 static inline __m256 pow_ps(__m256 a, __m256 b)
731 {
732     // pow(x, m) = exp(m * log(x))
733     return exp256_ps(_mm256_mul_ps(b, log256_ps(a)));
734 }
735 
736 #endif // AVX_MATHFUN_H
737