/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   If USE_SSE2 is defined (this header defines it just below), the SSE2
   intrinsics are used in place of the MMX intrinsics; otherwise the
   SSE1+MMX version is used. Do not expect any significant performance
   improvement with SSE2.
*/
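
/* Example usage (a minimal sketch, not part of the original library; the
   function and buffer names below are illustrative only):

     #include "sse_mathfun.h"

     void apply_exp(float* data, int n)      // n assumed to be a multiple of 4
     {
         for (int i = 0; i < n; i += 4) {
             v4sf v = _mm_loadu_ps(data + i);     // load 4 floats (unaligned)
             _mm_storeu_ps(data + i, exp_ps(v));  // store exp of each lane
         }
     }
*/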

/* Copyright (C) 2007  Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

#ifndef SSE_MATHFUN_H
#define SSE_MATHFUN_H

#define USE_SSE2 1

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)

#ifdef USE_SSE2
#include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
    static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PI32_CONST(Name, Val) \
    static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PS_CONST_TYPE(Name, Type, Val) \
    static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
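
/* For instance, _PS_CONST(1, 1.0f) expands to a 16-byte aligned array
   "static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}"; the functions
   below reload such constants as vectors via *(v4sf*)_ps_1. */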

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS_CONST(cephes_log_p1, -1.1514610310E-1f);
_PS_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS_CONST(cephes_log_p3, -1.2420140846E-1f);
_PS_CONST(cephes_log_p4, +1.4249322787E-1f);
_PS_CONST(cephes_log_p5, -1.6668057665E-1f);
_PS_CONST(cephes_log_p6, +2.0000714765E-1f);
_PS_CONST(cephes_log_p7, -2.4999993993E-1f);
_PS_CONST(cephes_log_p8, +3.3333331174E-1f);
_PS_CONST(cephes_log_q1, -2.12194440e-4f);
_PS_CONST(cephes_log_q2, 0.693359375f);
#ifndef USE_SSE2
typedef union xmm_mm_union
{
    __m128 xmm;
    __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) \
    {                                    \
        xmm_mm_union u;                  \
        u.xmm = xmm_;                    \
        mm0_ = u.mm[0];                  \
        mm1_ = u.mm[1];                  \
    }

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) \
    {                                    \
        xmm_mm_union u;                  \
        u.mm[0] = mm0_;                  \
        u.mm[1] = mm1_;                  \
        xmm_ = u.xmm;                    \
    }

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats
   returns NaN for x <= 0
*/
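/* Outline of the method (following cephes logf): decompose x = m * 2^e with
   m in [0.5, 1), shift m into [sqrt(0.5), sqrt(2)) while adjusting e, then
   evaluate a polynomial approximation of log(1+z) at z = m-1 and add
   e*log(2), with log(2) split into two constants (q2 + q1) for precision. */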
static inline v4sf log_ps(v4sf x)
{
#ifdef USE_SSE2
    v4si emm0;
#else
    v2si mm0, mm1;
#endif
    v4sf one = *(v4sf*)_ps_1;

    v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

    x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */

#ifndef USE_SSE2
    /* part 1: x = frexpf(x, &e); */
    COPY_XMM_TO_MM(x, mm0, mm1);
    mm0 = _mm_srli_pi32(mm0, 23);
    mm1 = _mm_srli_pi32(mm1, 23);
#else
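    /* reinterpret the float bits as 32-bit ints and shift right by 23,
       leaving the (biased) exponent field of each lane */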
    emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
    /* keep only the fractional part */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
    x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
    /* now e=mm0:mm1 contains the real base-2 exponent */
    mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
    mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
    v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
    _mm_empty(); /* bye bye mmx */
#else
    emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
    v4sf e = _mm_cvtepi32_ps(emm0);
#endif

    e = _mm_add_ps(e, one);

    /* part2:
       if( x < SQRTHF ) {
         e -= 1;
         x = x + x - 1.0;
       } else { x = x - 1.0; }
    */
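    /* branch-free version of the test above: "mask" is all-ones in the lanes
       where x < SQRTHF, so the masked adds apply the adjustment only there */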
    v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    v4sf z = _mm_mul_ps(x, x);

    v4sf y = *(v4sf*)_ps_cephes_log_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS_CONST(cephes_exp_C1, 0.693359375f);
_PS_CONST(cephes_exp_C2, -2.12194440e-4f);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);

static inline v4sf exp_ps(v4sf x)
{
    v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
    v4si emm0;
#else
    v2si mm0, mm1;
#endif
    v4sf one = *(v4sf*)_ps_1;

    x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
    x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
    fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
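    /* i.e. fx = x/ln(2) + 0.5; the floor below then gives n = round(x/ln(2)),
       so that exp(x) = 2^n * exp(g) with |g| <= 0.5*ln(2) */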

    /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
    /* step 1 : cast to int */
    tmp = _mm_movehl_ps(tmp, fx);
    mm0 = _mm_cvttps_pi32(fx);
    mm1 = _mm_cvttps_pi32(tmp);
    /* step 2 : cast back to float */
    tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);
#endif
    /* if greater, subtract 1 */
    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
    v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x, x);

    v4sf y = *(v4sf*)_ps_cephes_exp_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    /* build 2^n */
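    /* this works by writing n+127 into the exponent field (bits 23..30) of a
       float whose mantissa bits are zero, which is exactly the encoding of 2^n */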
#ifndef USE_SSE2
    z = _mm_movehl_ps(z, fx);
    mm0 = _mm_cvttps_pi32(fx);
    mm1 = _mm_cvttps_pi32(z);
    mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
    mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
    mm0 = _mm_slli_pi32(mm0, 23);
    mm1 = _mm_slli_pi32(mm1, 23);

    v4sf pow2n;
    COPY_MM_TO_XMM(mm0, mm1, pow2n);
    _mm_empty();
#else
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
    y = _mm_mul_ps(y, pow2n);
    return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625f);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
_PS_CONST(sincof_p0, -1.9515295891E-4f);
_PS_CONST(sincof_p1, 8.3321608736E-3f);
_PS_CONST(sincof_p2, -1.6666654611E-1f);
_PS_CONST(coscof_p0, 2.443315711809948E-005f);
_PS_CONST(coscof_p1, -1.388731625493765E-003f);
_PS_CONST(coscof_p2, 4.166664568298827E-002f);
_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   implement the special handling cephes has for larger values -- the
   function does not return garbage for arguments over 8192, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve performance by only about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
static inline v4sf sin_ps(v4sf x)
{   // any x
    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
    v4si emm0, emm2;
#else
    v2si mm0, mm1, mm2, mm3;
#endif
    sign_bit = x;
    /* take the absolute value */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    /* extract the sign bit (upper one) */
    sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
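    /* the integer part of y gives the octant of |x|. Below, j is rounded to an
       even value so that y*Pi/4 is a multiple of Pi/2 within Pi/4 of |x|;
       bit 2 of j then encodes the sign flip, and bit 1 selects which of the
       two polynomials (cosine-like or sine-like) is used on the remainder. */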

#ifdef USE_SSE2
    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    /* get the swap sign flag */
    emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynomial selection mask
       there is one polynomial for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2

       Both branches will be computed.
    */
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
    /* store the integer part of y in mm0:mm1 */
    xmm2 = _mm_movehl_ps(xmm2, y);
    mm2 = _mm_cvttps_pi32(y);
    mm3 = _mm_cvttps_pi32(xmm2);
    /* j=(j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
    mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
    y = _mm_cvtpi32x2_ps(mm2, mm3);
    /* get the swap sign flag */
    mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
    mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
    mm0 = _mm_slli_pi32(mm0, 29);
    mm1 = _mm_slli_pi32(mm1, 29);
    /* get the polynomial selection mask */
    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
    mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
    mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
    v4sf swap_sign_bit, poly_mask;
    COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
    COPY_MM_TO_XMM(mm2, mm3, poly_mask);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    _mm_empty(); /* good-bye mmx */
#endif

    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
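    /* Pi/4 is split into three parts (DP1+DP2+DP3 = Pi/4, stored negated), so
       that the products y*DPi cancel |x| without losing the low-order bits of
       the remainder -- Cody-Waite style argument reduction */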
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
    y = *(v4sf*)_ps_coscof_p0;
    v4sf z = _mm_mul_ps(x, x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(v4sf*)_ps_1);

    /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y, y2);
    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);
    return y;
}

/* almost the same as sin_ps */
static inline v4sf cos_ps(v4sf x)
{   // any x
    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
    v4si emm0, emm2;
#else
    v2si mm0, mm1, mm2, mm3;
#endif
    /* take the absolute value */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

    /* get the swap sign flag */
    emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynomial selection mask */
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
    /* store the integer part of y in mm0:mm1 */
    xmm2 = _mm_movehl_ps(xmm2, y);
    mm2 = _mm_cvttps_pi32(y);
    mm3 = _mm_cvttps_pi32(xmm2);

    /* j=(j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
    mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

    y = _mm_cvtpi32x2_ps(mm2, mm3);

    mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
    mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

    /* get the swap sign flag in mm0:mm1 and the
       polynomial selection mask in mm2:mm3 */

    mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
    mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
    mm0 = _mm_slli_pi32(mm0, 29);
    mm1 = _mm_slli_pi32(mm1, 29);

    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

    mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
    mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

    v4sf sign_bit, poly_mask;
    COPY_MM_TO_XMM(mm0, mm1, sign_bit);
    COPY_MM_TO_XMM(mm2, mm3, poly_mask);
    _mm_empty(); /* good-bye mmx */
#endif
    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
    y = *(v4sf*)_ps_coscof_p0;
    v4sf z = _mm_mul_ps(x, x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(v4sf*)_ps_1);

    /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y, y2);
    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine */
static inline void sincos_ps(v4sf x, v4sf* s, v4sf* c)
{
    v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
    v4si emm0, emm2, emm4;
#else
    v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    /* extract the sign bit (upper one) */
    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    /* get the swap sign flag for the sine */
    emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    /* get the polynomial selection mask for the sine */
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
    /* store the integer part of y in mm2:mm3 */
    xmm3 = _mm_movehl_ps(xmm3, y);
    mm2 = _mm_cvttps_pi32(y);
    mm3 = _mm_cvttps_pi32(xmm3);

    /* j=(j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
    mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

    y = _mm_cvtpi32x2_ps(mm2, mm3);

    mm4 = mm2;
    mm5 = mm3;

    /* get the swap sign flag for the sine */
    mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
    mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
    mm0 = _mm_slli_pi32(mm0, 29);
    mm1 = _mm_slli_pi32(mm1, 29);
    v4sf swap_sign_bit_sin;
    COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

    /* get the polynomial selection mask for the sine */

    mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
    mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
    mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
    mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
    v4sf poly_mask;
    COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
    emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
    emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
    /* get the sign flag for the cosine */
    mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
    mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
    mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
    mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
    mm4 = _mm_slli_pi32(mm4, 29);
    mm5 = _mm_slli_pi32(mm5, 29);
    v4sf sign_bit_cos;
    COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
    _mm_empty(); /* good-bye mmx */
#endif

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
    v4sf z = _mm_mul_ps(x, x);
    y = *(v4sf*)_ps_coscof_p0;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(v4sf*)_ps_1);

    /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    v4sf ysin2 = _mm_and_ps(xmm3, y2);
    v4sf ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1, ysin2);
    xmm2 = _mm_add_ps(y, y2);

    /* update the sign */
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
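
/* Example: both results in one call (a minimal sketch; the variable names are
   illustrative only):

     v4sf angles = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); // one angle per lane
     v4sf s, c;
     sincos_ps(angles, &s, &c);  // s = sin of each lane, c = cos of each lane
*/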

#endif // SSE_MATHFUN_H