/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If USE_SSE2 is defined (as it
   is below), the SSE2 intrinsics are used in place of the MMX intrinsics.
   Do not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#ifndef SSE_MATHFUN_H
#define SSE_MATHFUN_H

#define USE_SSE2 1

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 floats (sse1)

#ifdef USE_SSE2
#include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (sse2)
#else
typedef __m64 v2si; // vector of 2 ints (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
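
/* For illustration, _PS_CONST(0p5, 0.5f) expands (with the gcc/icc macros
   above) to:

     static const float _ps_0p5[4] __attribute__((aligned(16))) = {0.5f, 0.5f, 0.5f, 0.5f};

   i.e. a 16-byte-aligned scalar array that the functions below reload as a
   vector via *(v4sf*)_ps_0p5. */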

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS_CONST(cephes_log_p1, -1.1514610310E-1f);
_PS_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS_CONST(cephes_log_p3, -1.2420140846E-1f);
_PS_CONST(cephes_log_p4, +1.4249322787E-1f);
_PS_CONST(cephes_log_p5, -1.6668057665E-1f);
_PS_CONST(cephes_log_p6, +2.0000714765E-1f);
_PS_CONST(cephes_log_p7, -2.4999993993E-1f);
_PS_CONST(cephes_log_p8, +3.3333331174E-1f);
_PS_CONST(cephes_log_q1, -2.12194440e-4f);
_PS_CONST(cephes_log_q2, 0.693359375f);
#ifndef USE_SSE2
typedef union xmm_mm_union
{
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) \
  {                                      \
    xmm_mm_union u;                      \
    u.xmm = xmm_;                        \
    mm0_ = u.mm[0];                      \
    mm1_ = u.mm[1];                      \
  }

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) \
  {                                      \
    xmm_mm_union u;                      \
    u.mm[0] = mm0_;                      \
    u.mm[1] = mm1_;                      \
    xmm_ = u.xmm;                        \
  }
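
/* These helpers are only needed on the SSE1 path: they reinterpret one
   128-bit SSE register as two 64-bit MMX registers and back. For example,
   COPY_XMM_TO_MM(x, lo, hi) puts elements 0-1 of x into lo and elements
   2-3 into hi, without any explicit store/load in the source. */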

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
static inline v4sf log_ps(v4sf x)
{
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
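
/* Usage sketch (my illustration, not part of the original library;
   outputs rounded):

     ALIGN16_BEG float in[4] ALIGN16_END = { 1.f, 2.f, 4.f, 8.f };
     ALIGN16_BEG float out[4] ALIGN16_END;
     _mm_store_ps(out, log_ps(_mm_load_ps(in)));
     // out ~ { 0.f, 0.693147f, 1.386294f, 2.079442f }
*/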

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS_CONST(cephes_exp_C1, 0.693359375f);
_PS_CONST(cephes_exp_C2, -2.12194440e-4f);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);

static inline v4sf exp_ps(v4sf x)
{
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);
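
  /* fx now holds floorf(x * log2(e) + 0.5), i.e. x/ln(2) rounded to the
     nearest integer (half-up); the remainder g = x - n*ln(2) is computed
     just below, with ln(2) split into C1 + C2 for extra precision */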

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x, x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
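  /* (n + 127) << 23 writes n plus the IEEE-754 exponent bias into the
     exponent field of a float whose mantissa bits are zero, giving
     exactly 2^n without a multiply */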
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
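
/* Usage sketch (my illustration, not part of the original library;
   outputs rounded):

     ALIGN16_BEG float in[4] ALIGN16_END = { 0.f, 1.f, -1.f, 10.f };
     ALIGN16_BEG float out[4] ALIGN16_END;
     _mm_store_ps(out, exp_ps(_mm_load_ps(in)));
     // out ~ { 1.f, 2.718282f, 0.367879f, 22026.47f }
*/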

_PS_CONST(minus_cephes_DP1, -0.78515625f);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
_PS_CONST(sincof_p0, -1.9515295891E-4f);
_PS_CONST(sincof_p1, 8.3321608736E-3f);
_PS_CONST(sincof_p2, -1.6666654611E-1f);
_PS_CONST(coscof_p0, 2.443315711809948E-005f);
_PS_CONST(coscof_p1, -1.388731625493765E-003f);
_PS_CONST(coscof_p2, 4.166664568298827E-002f);
_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments above 8192, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is a
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   MacOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of
   NaN, Inf, denormalized and other special arguments...

   On my Core 1 Duo, the execution of this function takes approximately
   95 cycles.

   From what I have observed in experiments with the Intel AMath lib,
   switching to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
static inline v4sf sin_ps(v4sf x)
{ // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
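
  /* Here y holds the (float) octant index, and DP1+DP2+DP3 == -Pi/4 split
     into chunks of decreasing magnitude (Cody-Waite style), so each
     product y*DPi is computed almost exactly and little precision is lost
     in the reduction. */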

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
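
/* Usage sketch (my illustration, not part of the original library;
   outputs rounded):

     ALIGN16_BEG float in[4] ALIGN16_END = { 0.f, 0.5235988f, 1.5707964f, -1.5707964f };
     ALIGN16_BEG float out[4] ALIGN16_END;
     _mm_store_ps(out, sin_ps(_mm_load_ps(in)));
     // out ~ { 0.f, 0.5f, 1.f, -1.f }  (inputs: 0, Pi/6, Pi/2, -Pi/2)
*/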

/* almost the same as sin_ps */
static inline v4sf cos_ps(v4sf x)
{ // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both
   of them... it is almost as fast, and gives you a free cosine with your sine */
static inline void sincos_ps(v4sf x, v4sf* s, v4sf* c)
{
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x, x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
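
/* Usage sketch (my illustration, not part of the original library;
   outputs rounded):

     ALIGN16_BEG float in[4] ALIGN16_END = { 0.f, 0.7853982f, 1.5707964f, 3.1415927f };
     v4sf s, c;
     sincos_ps(_mm_load_ps(in), &s, &c);
     // s ~ { 0.f, 0.7071f, 1.f, ~0.f }
     // c ~ { 1.f, 0.7071f, ~0.f, -1.f }  (inputs: 0, Pi/4, Pi/2, Pi)
*/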

#endif // SSE_MATHFUN_H