/**********************************************************************

   Audacity: A Digital Audio Editor

   SseMathFuncs.h

   Stephen Moshier (wrote original C version, The Cephes Library)
   Julien Pommier (converted to use SSE)
   Andrew Hallendorff (adapted for Audacity)

*******************************************************************//**

   \file SseMathFuncs.h
   \brief SSE maths functions (for FFTs)

*//****************************************************************/


/* JKC: The trig functions use Taylor's series, on the range 0 to Pi/4,
 * computing both Sin and Cos, and using one or the other (in the
 * solo functions), or both in the sincos function, which is the more
 * useful one for us for FFTs.
 * The constants minus_cephes_DP1 to minus_cephes_DP3 are used in the
 * angle reduction (modulo) step.
 * Four sincos values are computed at a time.
 * If we wanted to do just sin or just cos, we could speed things up
 * by queuing up the Sines and Cosines into batches of 4 separately. */

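/* Illustrative only: a hypothetical call site (not part of this header),
 * showing how four angles are processed per call, e.g. when filling
 * FFT twiddle-factor tables. The variable names a0..a3 are made up.
 *
 *   float angles[4] = { a0, a1, a2, a3 };
 *   v4sf s, c;
 *   sincos_ps(_mm_loadu_ps(angles), &s, &c);
 *   // s now holds { sin(a0), sin(a1), sin(a2), sin(a3) }
 *   // c now holds { cos(a0), cos(a1), cos(a2), cos(a3) }
 */
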
#ifndef SSE_MATHFUN
#define SSE_MATHFUN
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007  Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val)                                            \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                            \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
    xmm_mm_union u; u.xmm = xmm_;                   \
    mm0_ = u.mm[0];                                 \
    mm1_ = u.mm[1];                                 \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
  }

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
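/* In outline, the scalar cephes logf algorithm that is vectorized below
   (scalar pseudo-code; variable names are illustrative only):

     x = m * 2^e                           // frexp-style mantissa/exponent split
     if (m < SQRTHF) { e -= 1; m += m; }   // keep m in [sqrt(0.5), sqrt(2))
     t = m - 1
     log(x) ~= t - t*t/2 + t^3 * P(t) + e * log(2)

   where P is the degree-8 polynomial cephes_log_p0..p8 above, and e*log(2)
   is added as e*cephes_log_q2 + e*cephes_log_q1 (log(2) split into a big and
   a small part) to limit single-precision rounding error.
*/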
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NaN
  return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

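/* In outline, the scalar cephes expf algorithm that is vectorized below
   (scalar pseudo-code; variable names are illustrative only):

     n = floor(x * LOG2EF + 0.5)    // nearest integer to x / log(2)
     g = x - n*C1 - n*C2            // C1 + C2 == log(2), split into two parts
     exp(x) = 2^n * exp(g)

   exp(g) is approximated as 1 + g + g^2 * P(g), where P is the degree-5
   polynomial cephes_exp_p0..p5, and 2^n is built directly by adding the
   exponent bias 0x7f to n and shifting it into the float exponent field.
*/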
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
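
/* Illustrative only (not part of the header): a quick sanity check of exp_ps.
 *
 *   float in[4] = { 0.0f, 1.0f, -1.0f, 0.693147f };
 *   float out[4];
 *   _mm_storeu_ps(out, exp_ps(_mm_loadu_ps(in)));
 *   // out is approximately { 1.0, 2.71828, 0.36788, 2.0 }
 */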

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching to an
   SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
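/* In outline, the reduction scheme shared by sin_ps, cos_ps and sincos_ps
   (scalar paraphrase; variable names are illustrative only):

     j = (int)(x * 4/Pi);  j = (j + 1) & ~1;   // round to an even octant index
     y = (float)j;
     x = ((x - y*DP1) - y*DP2) - y*DP3;        // x - j*Pi/4 in extended precision

   The reduced x lies in [-Pi/4, Pi/4]; depending on the octant, either the
   sine polynomial  x + x^3*S(x^2)  or the cosine polynomial
   1 - x^2/2 + x^4*C(x^2)  is selected, and the sign is flipped for the
   appropriate octants.
*/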
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
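
/* Illustrative only: spot values for sin_ps (approximate, single precision).
 *
 *   float in[4] = { 0.0f, 1.0f, 1.5707963f, 3.14159265f };
 *   float out[4];
 *   _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));
 *   // out is approximately { 0.0, 0.841471, 1.0, 8.74e-8 }
 *   // (the last value matches the sinf((float)M_PI) remark above)
 */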

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine. */
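/* Why the cosine is (nearly) free: sin_ps and cos_ps each already evaluate
   both the sine and the cosine polynomials and then discard one of the two
   results per lane; sincos_ps keeps both, combining them with separate
   octant-derived sign/selection masks for the sine and for the cosine. */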
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif // SSE_MATHFUN