1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4
5 // MLDSPMathSSE.h
6 // SSE implementations of madronalib SIMD primitives
7
8 // cephes-derived approximate math functions adapted from code by Julien
9 // Pommier, licensed as follows:
10 /*
11 Copyright (C) 2007 Julien Pommier
12
13 This software is provided 'as-is', without any express or implied
14 warranty. In no event will the authors be held liable for any damages
15 arising from the use of this software.
16
17 Permission is granted to anyone to use this software for any purpose,
18 including commercial applications, and to alter it and redistribute it
19 freely, subject to the following restrictions:
20
21 1. The origin of this software must not be misrepresented; you must not
22 claim that you wrote the original software. If you use this software
23 in a product, an acknowledgment in the product documentation would be
24 appreciated but is not required.
25 2. Altered source versions must be plainly marked as such, and must not be
26 misrepresented as being the original software.
27 3. This notice may not be removed or altered from any source distribution.
28
29 (this is the zlib license)
30 */
31
32 #include <emmintrin.h>
33 #include <float.h>
34
35 #pragma once
36
// Portable 16-byte alignment attribute pair: wrap a declaration as
// ALIGN16_BEG <decl> ALIGN16_END.
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif

// SSE types
typedef __m128 SIMDVectorFloat;
typedef __m128i SIMDVectorInt;

// SSE casts (bitwise reinterpretation -- no conversion instruction is emitted)
#define VecF2I _mm_castps_si128
#define VecI2F _mm_castsi128_ps

// TODO int or uintptr_t?
constexpr int kFloatsPerSIMDVectorBits = 2;
constexpr int kFloatsPerSIMDVector = 1 << kFloatsPerSIMDVectorBits;  // 4 floats / __m128
// kFloatsPerDSPVector is declared elsewhere in the library.
constexpr int kSIMDVectorsPerDSPVector = kFloatsPerDSPVector / kFloatsPerSIMDVector;
constexpr int kBytesPerSIMDVector = kFloatsPerSIMDVector * sizeof(float);  // 16 bytes
// NOTE(review): ~(size - 1) keeps the HIGH address bits. An alignment test
// needs the low bits, i.e. (p & (kBytesPerSIMDVector - 1)) == 0 -- verify
// every use of this mask.
constexpr int kSIMDVectorMask = ~(kBytesPerSIMDVector - 1);

constexpr int kIntsPerSIMDVectorBits = 2;
constexpr int kIntsPerSIMDVector = 1 << kIntsPerSIMDVectorBits;  // 4 ints / __m128i
62
// Returns true if p lies on a 16-byte (SIMD vector) boundary.
inline bool isSIMDAligned(float* p)
{
  uintptr_t pM = (uintptr_t)p;
  // An address is vector-aligned when its low log2(sizeof(__m128)) bits are
  // all zero. The previous test masked with ~(size - 1), which keeps the HIGH
  // address bits and therefore only returned true for near-null pointers.
  return ((pM & (sizeof(__m128) - 1)) == 0);
}
68
// primitive SSE operations
#define vecAdd _mm_add_ps
#define vecSub _mm_sub_ps
#define vecMul _mm_mul_ps
#define vecDiv _mm_div_ps
// fast approximate divide: multiply by the ~12-bit reciprocal estimate
#define vecDivApprox(x1, x2) (_mm_mul_ps(x1, _mm_rcp_ps(x2)))
#define vecMin _mm_min_ps
#define vecMax _mm_max_ps

#define vecSqrt _mm_sqrt_ps
// fast approximate sqrt: x * (1/sqrt(x)) using the reciprocal-sqrt estimate
#define vecSqrtApprox(x) (vecMul(x, vecRSqrt(x)))
#define vecRSqrt _mm_rsqrt_ps
// clear the sign bit of every lane
#define vecAbs(x) (_mm_andnot_ps(_mm_set_ps1(-0.0f), x))

// per-lane signum: copies x's sign onto 1.0f, then zeroes lanes where x == 0
// (the compare treats +0 and -0 as equal), yielding -1, 0 or +1
#define vecSign(x) \
  (_mm_and_ps(_mm_or_ps(_mm_and_ps(_mm_set_ps1(-0.0f), x), _mm_set_ps1(1.0f)), \
              _mm_cmpneq_ps(_mm_set_ps1(-0.0f), x)))

// per-lane sign as +/-1.0f (zero lanes yield +1)
#define vecSignBit(x) (_mm_or_ps(_mm_and_ps(_mm_set_ps1(-0.0f), x), _mm_set_ps1(1.0f)))
// clamp x1 to [x2, x3]
#define vecClamp(x1, x2, x3) _mm_min_ps(_mm_max_ps(x1, x2), x3)
// all-ones mask in lanes where x2 <= x1 < x3
#define vecWithin(x1, x2, x3) _mm_and_ps(_mm_cmpge_ps(x1, x2), _mm_cmplt_ps(x1, x3))

// lane-wise comparisons; each produces an all-ones / all-zeros mask per lane
#define vecEqual _mm_cmpeq_ps
#define vecNotEqual _mm_cmpneq_ps
#define vecGreaterThan _mm_cmpgt_ps
#define vecGreaterThanOrEqual _mm_cmpge_ps
#define vecLessThan _mm_cmplt_ps
#define vecLessThanOrEqual _mm_cmple_ps

#define vecSet1 _mm_set1_ps

// low-level store and load a vector to/from a float*.
// the pointer must be aligned or the program will crash!
// void vecStore(float* pDest, DSPVector v);
// DSPVector vecLoad(float* pSrc);
#define vecStore _mm_store_ps
#define vecLoad _mm_load_ps

#define vecStoreUnaligned _mm_storeu_ps
#define vecLoadUnaligned _mm_loadu_ps

#define vecAnd _mm_and_ps
#define vecOr _mm_or_ps

#define vecZeros _mm_setzero_ps
// all-ones bit pattern: any lane compared equal to itself sets every bit
#define vecOnes vecEqual(vecZeros, vecZeros)

// whole-register byte shifts (count is in bytes, must be an immediate)
#define vecShiftLeft _mm_slli_si128
#define vecShiftRight _mm_srli_si128

// float <-> int lane conversions
#define vecFloatToIntRound _mm_cvtps_epi32
#define vecFloatToIntTruncate _mm_cvttps_epi32
#define vecIntToFloat _mm_cvtepi32_ps

#define vecAddInt _mm_add_epi32
#define vecSubInt _mm_sub_epi32
#define vecSet1Int _mm_set1_epi32
126
// Unions for moving vector lanes to/from scalar arrays (printing, debugging).
typedef union
{
  SIMDVectorFloat v;
  float f[4];
} SIMDVectorFloatUnion;

typedef union
{
  SIMDVectorInt v;
  uint32_t i[4];
} SIMDVectorIntUnion;
138
vecSetInt1(uint32_t a)139 inline SIMDVectorInt vecSetInt1(uint32_t a) { return _mm_set1_epi32(a); }
140
vecSetInt4(uint32_t a,uint32_t b,uint32_t c,uint32_t d)141 inline SIMDVectorInt vecSetInt4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
142 {
143 return _mm_set_epi32(d, c, b, a);
144 }
145
// X is the all-ones bit pattern viewed as a float.
// NOTE(review): reading an int through a reinterpret_cast'ed float* violates
// strict aliasing; a memcpy-based pun would be safer -- confirm the compilers
// in use tolerate this.
static const int XI = 0xFFFFFFFF;
static const float X = *(reinterpret_cast<const float*>(&XI));

// Per-lane write masks: the hex digit in the name gives the lane pattern,
// with its least significant bit in the LAST initializer position.
// NOTE(review): brace-initializing __m128 is a gcc/clang extension -- verify
// on other toolchains.
const SIMDVectorFloat vecMask0 = {0, 0, 0, 0};
const SIMDVectorFloat vecMask1 = {0, 0, 0, X};
const SIMDVectorFloat vecMask2 = {0, 0, X, 0};
const SIMDVectorFloat vecMask3 = {0, 0, X, X};
const SIMDVectorFloat vecMask4 = {0, X, 0, 0};
const SIMDVectorFloat vecMask5 = {0, X, 0, X};
const SIMDVectorFloat vecMask6 = {0, X, X, 0};
const SIMDVectorFloat vecMask7 = {0, X, X, X};
const SIMDVectorFloat vecMask8 = {X, 0, 0, 0};
const SIMDVectorFloat vecMask9 = {X, 0, 0, X};
const SIMDVectorFloat vecMaskA = {X, 0, X, 0};
const SIMDVectorFloat vecMaskB = {X, 0, X, X};
const SIMDVectorFloat vecMaskC = {X, X, 0, 0};
const SIMDVectorFloat vecMaskD = {X, X, 0, X};
const SIMDVectorFloat vecMaskE = {X, X, X, 0};
const SIMDVectorFloat vecMaskF = {X, X, X, X};
165
// Build an _MM_SHUFFLE-style immediate from four 2-bit lane selectors.
// Arguments are parenthesized so that expression arguments
// (e.g. cond ? 1 : 0) combine correctly.
#define SHUFFLE(a, b, c, d) (((a) << 6) | ((b) << 4) | ((c) << 2) | (d))
// Broadcast lane 3 of x1 to all four lanes.
#define vecBroadcast3(x1) _mm_shuffle_ps(x1, x1, SHUFFLE(3, 3, 3, 3))

// Shift a whole vector by i 32-bit elements (i must be a compile-time
// constant). No trailing semicolons: these must remain usable as
// expressions and as arguments to other calls.
#define vecShiftElementsLeft(x1, i) _mm_slli_si128(x1, 4 * (i))
#define vecShiftElementsRight(x1, i) _mm_srli_si128(x1, 4 * (i))
171
172 inline std::ostream& operator<<(std::ostream& out, SIMDVectorFloat v)
173 {
174 SIMDVectorFloatUnion u;
175 u.v = v;
176 out << "[";
177 out << u.f[0];
178 out << ", ";
179 out << u.f[1];
180 out << ", ";
181 out << u.f[2];
182 out << ", ";
183 out << u.f[3];
184 out << "]";
185 return out;
186 }
187
188 inline std::ostream& operator<<(std::ostream& out, SIMDVectorInt v)
189 {
190 SIMDVectorIntUnion u;
191 u.v = v;
192 out << "[";
193 out << u.i[0];
194 out << ", ";
195 out << u.i[1];
196 out << ", ";
197 out << u.i[2];
198 out << ", ";
199 out << u.i[3];
200 out << "]";
201 return out;
202 }
203
204 // ----------------------------------------------------------------
205 #pragma mark select
206
vecSelect(SIMDVectorFloat a,SIMDVectorFloat b,SIMDVectorInt conditionMask)207 inline SIMDVectorFloat vecSelect(SIMDVectorFloat a, SIMDVectorFloat b, SIMDVectorInt conditionMask)
208 {
209 __m128i ones = _mm_set1_epi32(-1);
210 return _mm_or_ps(_mm_and_ps(VecI2F(conditionMask), a),
211 _mm_and_ps(_mm_xor_ps(VecI2F(conditionMask), VecI2F(ones)), b));
212 }
213
vecSelect(SIMDVectorInt a,SIMDVectorInt b,SIMDVectorInt conditionMask)214 inline SIMDVectorInt vecSelect(SIMDVectorInt a, SIMDVectorInt b, SIMDVectorInt conditionMask)
215 {
216 __m128i ones = _mm_set1_epi32(-1);
217 return _mm_or_si128(_mm_and_si128(conditionMask, a),
218 _mm_and_si128(_mm_xor_si128(conditionMask, ones), b));
219 }
220
221 // ----------------------------------------------------------------
222 // horizontal operations returning float
223
vecSumH(SIMDVectorFloat v)224 inline float vecSumH(SIMDVectorFloat v)
225 {
226 SIMDVectorFloat tmp0 = _mm_add_ps(v, _mm_movehl_ps(v, v));
227 SIMDVectorFloat tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
228 return _mm_cvtss_f32(tmp1);
229 }
230
vecMaxH(SIMDVectorFloat v)231 inline float vecMaxH(SIMDVectorFloat v)
232 {
233 SIMDVectorFloat tmp0 = _mm_max_ps(v, _mm_movehl_ps(v, v));
234 SIMDVectorFloat tmp1 = _mm_max_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
235 return _mm_cvtss_f32(tmp1);
236 }
237
vecMinH(SIMDVectorFloat v)238 inline float vecMinH(SIMDVectorFloat v)
239 {
240 SIMDVectorFloat tmp0 = _mm_min_ps(v, _mm_movehl_ps(v, v));
241 SIMDVectorFloat tmp1 = _mm_min_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
242 return _mm_cvtss_f32(tmp1);
243 }
244
/* declare some SSE constants -- why can't I figure a better way to do that? */
// Each macro declares a 16-byte-aligned array of four identical values;
// load with *(SIMDVectorFloat*)_ps_Name or *(SIMDVectorInt*)_pi32_Name.
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
// for constants whose exact bit pattern matters (masks), stored via an int type
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);

/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);       // float exponent field
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);  // sign + mantissa fields

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);  // float exponent bias (127)

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);  // sqrt(1/2)
// log() minimax polynomial coefficients (from cephes logf)
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
// log(2) split into low and high parts for extra precision
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
282
/* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
*/
inline SIMDVectorFloat vecLog(SIMDVectorFloat x)
{
  SIMDVectorInt emm0;
  SIMDVectorFloat one = *(SIMDVectorFloat*)_ps_1;
  // remember which lanes were <= 0; they are forced to NaN at the end
  SIMDVectorFloat invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(SIMDVectorFloat*)_ps_min_norm_pos); /* cut off denormalized stuff */

  // extract the biased exponent field
  emm0 = _mm_srli_epi32(VecF2I(x), 23);

  /* keep only the fractional part */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(SIMDVectorFloat*)_ps_0p5);  // mantissa now in [0.5, 1)

  // remove the exponent bias (127) and convert to float
  emm0 = _mm_sub_epi32(emm0, *(SIMDVectorInt*)_pi32_0x7f);
  SIMDVectorFloat e = _mm_cvtepi32_ps(emm0);

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  SIMDVectorFloat mask = _mm_cmplt_ps(x, *(SIMDVectorFloat*)_ps_cephes_SQRTHF);
  SIMDVectorFloat tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  SIMDVectorFloat z = _mm_mul_ps(x, x);

  // polynomial approximation of log(1 + x), Horner form
  SIMDVectorFloat y = *(SIMDVectorFloat*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  // add the exponent contribution e*log(2), split into two parts for precision
  tmp = _mm_mul_ps(e, *(SIMDVectorFloat*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(SIMDVectorFloat*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
352
// input clamp: exp() over/underflows a float outside roughly +/-88.38
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);  // log2(e)
// log(2) split into high and low parts for extended-precision reduction
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

// exp() minimax polynomial coefficients (from cephes expf)
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

// exponential computed for 4 simultaneous floats (cephes-derived)
inline SIMDVectorFloat vecExp(SIMDVectorFloat x)
{
  SIMDVectorFloat tmp = _mm_setzero_ps(), fx;
  SIMDVectorInt emm0;
  SIMDVectorFloat one = *(SIMDVectorFloat*)_ps_1;

  // clamp the argument to the representable range
  x = _mm_min_ps(x, *(SIMDVectorFloat*)_ps_exp_hi);
  x = _mm_max_ps(x, *(SIMDVectorFloat*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(SIMDVectorFloat*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);

  /* if greater, subtract 1 */
  SIMDVectorFloat mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  // reduce: x -= fx * log(2), using the split high/low parts
  tmp = _mm_mul_ps(fx, *(SIMDVectorFloat*)_ps_cephes_exp_C1);
  SIMDVectorFloat z = _mm_mul_ps(fx, *(SIMDVectorFloat*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);
  z = _mm_mul_ps(x, x);

  // polynomial approximation of exp(x) on the reduced interval
  SIMDVectorFloat y = *(SIMDVectorFloat*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(SIMDVectorInt*)_pi32_0x7f);  // re-bias exponent
  emm0 = _mm_slli_epi32(emm0, 23);                          // into exponent field
  SIMDVectorFloat pow2n = VecI2F(emm0);

  y = _mm_mul_ps(y, pow2n);
  return y;
}
419
// -pi/4 split into three parts for extended-precision range reduction
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
// sine and cosine minimax polynomial coefficients (from cephes)
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/*
Julien Pommier:
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).

Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.

Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..

On my core 1 duo, the execution of this function takes approximately 95 cycles.

From what I have observed on the experiments with Intel AMath lib, switching to
an SSE2 version would improve the perf by only 10%.

Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
inline SIMDVectorFloat vecSin(SIMDVectorFloat x)
{
  SIMDVectorFloat xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
  SIMDVectorInt emm0, emm2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(SIMDVectorFloat*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2
     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  SIMDVectorFloat swap_sign_bit = VecI2F(emm0);
  SIMDVectorFloat poly_mask = VecI2F(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(SIMDVectorFloat*)_ps_coscof_p0;
  SIMDVectorFloat z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
538
/* almost the same as sin_ps */
inline SIMDVectorFloat vecCos(SIMDVectorFloat x)
{
  SIMDVectorFloat xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
  SIMDVectorInt emm0, emm2;

  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  // cos(x) = sin(x + pi/2): shift the octant index by 2
  emm2 = _mm_sub_epi32(emm2, *(SIMDVectorInt*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  SIMDVectorFloat sign_bit = VecI2F(emm0);
  SIMDVectorFloat poly_mask = VecI2F(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(SIMDVectorFloat*)_ps_coscof_p0;
  SIMDVectorFloat z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
615
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them.. it is almost as fast, and gives you a free cosine with your sine */
// Writes sin(x) to *s and cos(x) to *c, sharing the range reduction and
// both polynomial evaluations between the two results.
inline void vecSinCos(SIMDVectorFloat x, SIMDVectorFloat* s, SIMDVectorFloat* c)
{
  SIMDVectorFloat xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  SIMDVectorInt emm0, emm2, emm4;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(SIMDVectorFloat*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  // keep a copy of the octant index for the cosine sign below
  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  SIMDVectorFloat swap_sign_bit_sin = VecI2F(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  SIMDVectorFloat poly_mask = VecI2F(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  // derive the cosine's sign flag from the shifted octant index
  emm4 = _mm_sub_epi32(emm4, *(SIMDVectorInt*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(SIMDVectorInt*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  SIMDVectorFloat sign_bit_cos = VecI2F(emm4);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  SIMDVectorFloat z = _mm_mul_ps(x, x);
  y = *(SIMDVectorFloat*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  SIMDVectorFloat ysin2 = _mm_and_ps(xmm3, y2);
  SIMDVectorFloat ysin1 = _mm_andnot_ps(xmm3, y);
  // the complementary selections form the cosine result
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
709
// Declares a compile-time __m128 constant with all four lanes equal.
// NOTE(review): brace-initializing constexpr __m128 relies on a gcc/clang
// extension -- confirm on other toolchains.
#define STATIC_M128_CONST(name, val) static constexpr __m128 name = {val, val, val, val};

// fast polynomial approximations
// from scalar code by Jacques-Henri Jourdan <jourgun@gmail.com>
// sin and cos valid from -pi to pi
// exp and log polynomials generated using Sollya http://sollya.gforge.inria.fr/
// exp Generated in Sollya with:
// > f=remez(1-x*exp(-(x-1)*log(2)),
// [|1,(x-1)*(x-2), (x-1)*(x-2)*x, (x-1)*(x-2)*x*x|],
// [1,2], exp(-(x-1)*log(2)));
// > plot(exp((x-1)*log(2))/(f+x)-1, [1,2]);
// > f+x;
//
// log Generated in Sollya using :
// f = remez(log(x)-(x-1)*log(2),
// [|1,(x-1)*(x-2), (x-1)*(x-2)*x, (x-1)*(x-2)*x*x,
// (x-1)*(x-2)*x*x*x|], [1,2], 1, 1e-8);
// > plot(f+(x-1)*log(2)-log(x), [1,2]);
// > f+(x-1)*log(2)

// odd-polynomial sine coefficients (in powers of x^2)
STATIC_M128_CONST(kSinC1Vec, 0.99997937679290771484375f);
STATIC_M128_CONST(kSinC2Vec, -0.166624367237091064453125f);
STATIC_M128_CONST(kSinC3Vec, 8.30897875130176544189453125e-3f);
STATIC_M128_CONST(kSinC4Vec, -1.92649182281456887722015380859375e-4f);
STATIC_M128_CONST(kSinC5Vec, 2.147840177713078446686267852783203125e-6f);
735
vecSinApprox(__m128 x)736 inline __m128 vecSinApprox(__m128 x)
737 {
738 __m128 x2 = _mm_mul_ps(x, x);
739 return _mm_mul_ps(
740 x, _mm_add_ps(
741 kSinC1Vec,
742 _mm_mul_ps(
743 x2,
744 _mm_add_ps(
745 kSinC2Vec,
746 _mm_mul_ps(
747 x2, _mm_add_ps(kSinC3Vec,
748 _mm_mul_ps(x2, _mm_add_ps(kSinC4Vec,
749 _mm_mul_ps(x2, kSinC5Vec)))))))));
750 }
751
// even-polynomial cosine coefficients (in powers of x^2)
STATIC_M128_CONST(kCosC1Vec, 0.999959766864776611328125f);
STATIC_M128_CONST(kCosC2Vec, -0.4997930824756622314453125f);
STATIC_M128_CONST(kCosC3Vec, 4.1496001183986663818359375e-2f);
STATIC_M128_CONST(kCosC4Vec, -1.33926304988563060760498046875e-3f);
STATIC_M128_CONST(kCosC5Vec, 1.8791708498611114919185638427734375e-5f);
757
vecCosApprox(SIMDVectorFloat x)758 inline SIMDVectorFloat vecCosApprox(SIMDVectorFloat x)
759 {
760 SIMDVectorFloat x2 = _mm_mul_ps(x, x);
761 return _mm_add_ps(
762 kCosC1Vec,
763 _mm_mul_ps(
764 x2,
765 _mm_add_ps(
766 kCosC2Vec,
767 _mm_mul_ps(x2, _mm_add_ps(kCosC3Vec,
768 _mm_mul_ps(x2, _mm_add_ps(kCosC4Vec,
769 _mm_mul_ps(x2, kCosC5Vec))))))));
770 }
// exp approximation constants:
// kExpC1Vec = 0x7F800000 as float count (clamp ceiling for the exponent bits),
// kExpC2Vec = 2^23 / log(2) (scales x into the float exponent field),
// kExpC3Vec = 0x3F800000 as float (bit pattern of 1.0f).
STATIC_M128_CONST(kExpC1Vec, 2139095040.f);
STATIC_M128_CONST(kExpC2Vec, 12102203.1615614f);
STATIC_M128_CONST(kExpC3Vec, 1065353216.f);
// Sollya-generated correction polynomial coefficients (see comment above)
STATIC_M128_CONST(kExpC4Vec, 0.510397365625862338668154f);
STATIC_M128_CONST(kExpC5Vec, 0.310670891004095530771135f);
STATIC_M128_CONST(kExpC6Vec, 0.168143436463395944830000f);
STATIC_M128_CONST(kExpC7Vec, -2.88093587581985443087955e-3f);
STATIC_M128_CONST(kExpC8Vec, 1.3671023382430374383648148e-2f);
779
vecExpApprox(SIMDVectorFloat x)780 inline SIMDVectorFloat vecExpApprox(SIMDVectorFloat x)
781 {
782 const SIMDVectorFloat kZeroVec = _mm_setzero_ps();
783
784 SIMDVectorFloat val2, val3, val4;
785 SIMDVectorInt val4i;
786
787 val2 = _mm_add_ps(_mm_mul_ps(x, kExpC2Vec), kExpC3Vec);
788 val3 = _mm_min_ps(val2, kExpC1Vec);
789 val4 = _mm_max_ps(val3, kZeroVec);
790 val4i = _mm_cvttps_epi32(val4);
791
792 SIMDVectorFloat xu = _mm_and_ps(VecI2F(val4i), VecI2F(_mm_set1_epi32(0x7F800000)));
793 SIMDVectorFloat b = _mm_or_ps(_mm_and_ps(VecI2F(val4i), VecI2F(_mm_set1_epi32(0x7FFFFF))),
794 VecI2F(_mm_set1_epi32(0x3F800000)));
795
796 return _mm_mul_ps(
797 xu,
798 (_mm_add_ps(
799 kExpC4Vec,
800 _mm_mul_ps(
801 b, _mm_add_ps(
802 kExpC5Vec,
803 _mm_mul_ps(
804 b, _mm_add_ps(kExpC6Vec,
805 _mm_mul_ps(b, _mm_add_ps(kExpC7Vec,
806 _mm_mul_ps(b, kExpC8Vec))))))))));
807 }
808
// log approximation constants: Sollya-generated polynomial coefficients
// (see comment above); kLogC7Vec = log(2).
STATIC_M128_CONST(kLogC1Vec, -89.970756366f);
STATIC_M128_CONST(kLogC2Vec, 3.529304993f);
STATIC_M128_CONST(kLogC3Vec, -2.461222105f);
STATIC_M128_CONST(kLogC4Vec, 1.130626167f);
STATIC_M128_CONST(kLogC5Vec, -0.288739945f);
STATIC_M128_CONST(kLogC6Vec, 3.110401639e-2f);
STATIC_M128_CONST(kLogC7Vec, 0.69314718055995f);
816
// Fast approximate natural log: polynomial in the mantissa plus
// exponent * log(2).
inline SIMDVectorFloat vecLogApprox(SIMDVectorFloat val)
{
  SIMDVectorInt vZero = _mm_setzero_si128();
  SIMDVectorInt valAsInt = VecF2I(val);
  // biased exponent field of val
  SIMDVectorInt expi = _mm_srli_epi32(valAsInt, 23);
  // bias constant for positive lanes; FLT_MIN where val <= 0
  // NOTE(review): the scalar original uses -infinity for the non-positive
  // case; FLT_MIN here yields a finite (meaningless) result -- confirm
  // intended behavior for val <= 0.
  SIMDVectorFloat addcst =
      vecSelect(kLogC1Vec, _mm_set1_ps(FLT_MIN), VecF2I(_mm_cmpgt_ps(val, VecI2F(vZero))));
  // rebuild the mantissa as a float in [1, 2)
  SIMDVectorInt valAsIntMasked =
      VecF2I(_mm_or_ps(_mm_and_ps(VecI2F(valAsInt), VecI2F(_mm_set1_epi32(0x7FFFFF))),
                       VecI2F(_mm_set1_epi32(0x3F800000))));
  SIMDVectorFloat x = VecI2F(valAsIntMasked);

  // Sollya-generated polynomial in x (Horner form)
  SIMDVectorFloat poly = _mm_mul_ps(
      x, _mm_add_ps(
             kLogC2Vec,
             _mm_mul_ps(
                 x, _mm_add_ps(
                        kLogC3Vec,
                        _mm_mul_ps(
                            x, _mm_add_ps(kLogC4Vec,
                                          _mm_mul_ps(x, _mm_add_ps(kLogC5Vec,
                                                                   _mm_mul_ps(x, kLogC6Vec)))))))));

  // exponent contribution: expi * log(2) plus the bias constant
  SIMDVectorFloat addCstResult = _mm_add_ps(addcst, _mm_mul_ps(kLogC7Vec, _mm_cvtepi32_ps(expi)));
  return _mm_add_ps(poly, addCstResult);
}
843
vecIntPart(SIMDVectorFloat val)844 inline SIMDVectorFloat vecIntPart(SIMDVectorFloat val)
845 {
846 SIMDVectorInt vi = _mm_cvttps_epi32(val); // convert with truncate
847 return (_mm_cvtepi32_ps(vi));
848 }
849
vecFracPart(SIMDVectorFloat val)850 inline SIMDVectorFloat vecFracPart(SIMDVectorFloat val)
851 {
852 SIMDVectorInt vi = _mm_cvttps_epi32(val); // convert with truncate
853 SIMDVectorFloat intPart = _mm_cvtepi32_ps(vi);
854 return _mm_sub_ps(val, intPart);
855 }
856