// madronalib: a C++ framework for DSP applications.
// Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
// Distributed under the MIT license: http://madrona-labs.mit-license.org/

// MLDSPMathSSE.h
// SSE implementations of madronalib SIMD primitives

// cephes-derived approximate math functions adapted from code by Julien
// Pommier, licensed as follows:
/*
 Copyright (C) 2007  Julien Pommier

 This software is provided 'as-is', without any express or implied
 warranty.  In no event will the authors be held liable for any damages
 arising from the use of this software.

 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:

 1. The origin of this software must not be misrepresented; you must not
 claim that you wrote the original software. If you use this software
 in a product, an acknowledgment in the product documentation would be
 appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be
 misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.

 (this is the zlib license)
 */

#pragma once

#include <emmintrin.h>
#include <float.h>

#include <cstdint>   // uint32_t
#include <iostream>  // std::ostream, used by the debug printers below

#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif

// SSE types
typedef __m128 SIMDVectorFloat;
typedef __m128i SIMDVectorInt;

// SSE casts
#define VecF2I _mm_castps_si128
#define VecI2F _mm_castsi128_ps

// TODO int or uintptr_t?
constexpr int kFloatsPerSIMDVectorBits = 2;
constexpr int kFloatsPerSIMDVector = 1 << kFloatsPerSIMDVectorBits;
constexpr int kSIMDVectorsPerDSPVector = kFloatsPerDSPVector / kFloatsPerSIMDVector;
constexpr int kBytesPerSIMDVector = kFloatsPerSIMDVector * sizeof(float);
constexpr int kSIMDVectorMask = ~(kBytesPerSIMDVector - 1);

constexpr int kIntsPerSIMDVectorBits = 2;
constexpr int kIntsPerSIMDVector = 1 << kIntsPerSIMDVectorBits;

inline bool isSIMDAligned(float* p)
{
  // a pointer is 16-byte aligned when its low address bits are all zero
  uintptr_t pM = (uintptr_t)p;
  return ((pM & (kBytesPerSIMDVector - 1)) == 0);
}

// primitive SSE operations
#define vecAdd _mm_add_ps
#define vecSub _mm_sub_ps
#define vecMul _mm_mul_ps
#define vecDiv _mm_div_ps
#define vecDivApprox(x1, x2) (_mm_mul_ps(x1, _mm_rcp_ps(x2)))
#define vecMin _mm_min_ps
#define vecMax _mm_max_ps

#define vecSqrt _mm_sqrt_ps
#define vecSqrtApprox(x) (vecMul(x, vecRSqrt(x)))
#define vecRSqrt _mm_rsqrt_ps
#define vecAbs(x) (_mm_andnot_ps(_mm_set_ps1(-0.0f), x))

#define vecSign(x)                                                             \
  (_mm_and_ps(_mm_or_ps(_mm_and_ps(_mm_set_ps1(-0.0f), x), _mm_set_ps1(1.0f)), \
              _mm_cmpneq_ps(_mm_set_ps1(-0.0f), x)))

#define vecSignBit(x) (_mm_or_ps(_mm_and_ps(_mm_set_ps1(-0.0f), x), _mm_set_ps1(1.0f)))
#define vecClamp(x1, x2, x3) _mm_min_ps(_mm_max_ps(x1, x2), x3)
#define vecWithin(x1, x2, x3) _mm_and_ps(_mm_cmpge_ps(x1, x2), _mm_cmplt_ps(x1, x3))
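
// notes on the bit tricks above: vecAbs clears each lane's sign bit.
// vecSign returns -1, 0 or 1 per lane: (sign bit of x | 1.0f) gives +/-1,
// then the cmpneq against -0.0f zeroes lanes that compare equal to zero
// (-0.0f == 0.0f in IEEE 754, so both zeros map to 0).
// vecSignBit returns +/-1 per lane, treating both zeros as positive.
// vecWithin returns an all-ones mask in each lane where x2 <= x1 < x3.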

#define vecEqual _mm_cmpeq_ps
#define vecNotEqual _mm_cmpneq_ps
#define vecGreaterThan _mm_cmpgt_ps
#define vecGreaterThanOrEqual _mm_cmpge_ps
#define vecLessThan _mm_cmplt_ps
#define vecLessThanOrEqual _mm_cmple_ps

#define vecSet1 _mm_set1_ps

// low-level store and load a vector to/from a float*.
// the pointer must be aligned or the program will crash!
// void vecStore(float* pDest, DSPVector v);
// DSPVector vecLoad(float* pSrc);
#define vecStore _mm_store_ps
#define vecLoad _mm_load_ps

#define vecStoreUnaligned _mm_storeu_ps
#define vecLoadUnaligned _mm_loadu_ps
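
// A minimal usage sketch (illustrative, not part of the library API): copy one
// SIMD vector of floats, checking alignment with isSIMDAligned() to choose
// between the aligned and unaligned load/store paths.
inline void exampleCopySIMDVector(float* pSrc, float* pDest)
{
  SIMDVectorFloat v = isSIMDAligned(pSrc) ? vecLoad(pSrc) : vecLoadUnaligned(pSrc);
  if (isSIMDAligned(pDest))
    vecStore(pDest, v);
  else
    vecStoreUnaligned(pDest, v);
}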

#define vecAnd _mm_and_ps
#define vecOr _mm_or_ps

#define vecZeros _mm_setzero_ps
#define vecOnes vecEqual(vecZeros(), vecZeros())

#define vecShiftLeft _mm_slli_si128
#define vecShiftRight _mm_srli_si128

#define vecFloatToIntRound _mm_cvtps_epi32
#define vecFloatToIntTruncate _mm_cvttps_epi32
#define vecIntToFloat _mm_cvtepi32_ps

#define vecAddInt _mm_add_epi32
#define vecSubInt _mm_sub_epi32
#define vecSet1Int _mm_set1_epi32

typedef union
{
  SIMDVectorFloat v;
  float f[4];
} SIMDVectorFloatUnion;

typedef union
{
  SIMDVectorInt v;
  uint32_t i[4];
} SIMDVectorIntUnion;

inline SIMDVectorInt vecSetInt1(uint32_t a) { return _mm_set1_epi32(a); }

inline SIMDVectorInt vecSetInt4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  return _mm_set_epi32(d, c, b, a);
}
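
// note: _mm_set_epi32 takes its arguments from the highest lane down, so
// vecSetInt4 reverses them to keep a..d in ascending lane order (a = lane 0).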

static const int XI = 0xFFFFFFFF;
static const float X = *(reinterpret_cast<const float*>(&XI));

const SIMDVectorFloat vecMask0 = {0, 0, 0, 0};
const SIMDVectorFloat vecMask1 = {0, 0, 0, X};
const SIMDVectorFloat vecMask2 = {0, 0, X, 0};
const SIMDVectorFloat vecMask3 = {0, 0, X, X};
const SIMDVectorFloat vecMask4 = {0, X, 0, 0};
const SIMDVectorFloat vecMask5 = {0, X, 0, X};
const SIMDVectorFloat vecMask6 = {0, X, X, 0};
const SIMDVectorFloat vecMask7 = {0, X, X, X};
const SIMDVectorFloat vecMask8 = {X, 0, 0, 0};
const SIMDVectorFloat vecMask9 = {X, 0, 0, X};
const SIMDVectorFloat vecMaskA = {X, 0, X, 0};
const SIMDVectorFloat vecMaskB = {X, 0, X, X};
const SIMDVectorFloat vecMaskC = {X, X, 0, 0};
const SIMDVectorFloat vecMaskD = {X, X, 0, X};
const SIMDVectorFloat vecMaskE = {X, X, X, 0};
const SIMDVectorFloat vecMaskF = {X, X, X, X};
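
// each mask name's hex digit gives the all-ones lanes, reading the digit's
// bits from most significant (lane 0) down: e.g. vecMask8 = {X, 0, 0, 0}
// sets lane 0 only, and vecMask5 (0101b) sets lanes 1 and 3.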

#define SHUFFLE(a, b, c, d) ((a << 6) | (b << 4) | (c << 2) | (d))
#define vecBroadcast3(x1) _mm_shuffle_ps(x1, x1, SHUFFLE(3, 3, 3, 3))

#define vecShiftElementsLeft(x1, i) _mm_slli_si128(x1, 4 * i)
#define vecShiftElementsRight(x1, i) _mm_srli_si128(x1, 4 * i)
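
// note: _mm_slli_si128 / _mm_srli_si128 shift the whole 128-bit register by
// bytes (4 * i bytes = i float lanes), shifting in zeros; i must be a
// compile-time constant.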

inline std::ostream& operator<<(std::ostream& out, SIMDVectorFloat v)
{
  SIMDVectorFloatUnion u;
  u.v = v;
  out << "[" << u.f[0] << ", " << u.f[1] << ", " << u.f[2] << ", " << u.f[3] << "]";
  return out;
}

inline std::ostream& operator<<(std::ostream& out, SIMDVectorInt v)
{
  SIMDVectorIntUnion u;
  u.v = v;
  out << "[" << u.i[0] << ", " << u.i[1] << ", " << u.i[2] << ", " << u.i[3] << "]";
  return out;
}

// ----------------------------------------------------------------
#pragma mark select

inline SIMDVectorFloat vecSelect(SIMDVectorFloat a, SIMDVectorFloat b, SIMDVectorInt conditionMask)
{
  __m128i ones = _mm_set1_epi32(-1);
  return _mm_or_ps(_mm_and_ps(VecI2F(conditionMask), a),
                   _mm_and_ps(_mm_xor_ps(VecI2F(conditionMask), VecI2F(ones)), b));
}

inline SIMDVectorInt vecSelect(SIMDVectorInt a, SIMDVectorInt b, SIMDVectorInt conditionMask)
{
  __m128i ones = _mm_set1_epi32(-1);
  return _mm_or_si128(_mm_and_si128(conditionMask, a),
                      _mm_and_si128(_mm_xor_si128(conditionMask, ones), b));
}
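
// A small usage sketch (illustrative, not part of the library API): vecSelect
// returns a in each lane where conditionMask is all ones and b where it is all
// zeros, so feeding it a comparison result gives a branchless per-lane choice.
// Here, negative lanes are replaced with zero (a half-wave rectifier).
inline SIMDVectorFloat exampleRectify(SIMDVectorFloat x)
{
  SIMDVectorInt positiveMask = VecF2I(vecGreaterThan(x, vecZeros()));
  return vecSelect(x, vecZeros(), positiveMask);
}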

// ----------------------------------------------------------------
// horizontal operations returning float

inline float vecSumH(SIMDVectorFloat v)
{
  SIMDVectorFloat tmp0 = _mm_add_ps(v, _mm_movehl_ps(v, v));
  SIMDVectorFloat tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
  return _mm_cvtss_f32(tmp1);
}

inline float vecMaxH(SIMDVectorFloat v)
{
  SIMDVectorFloat tmp0 = _mm_max_ps(v, _mm_movehl_ps(v, v));
  SIMDVectorFloat tmp1 = _mm_max_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
  return _mm_cvtss_f32(tmp1);
}

inline float vecMinH(SIMDVectorFloat v)
{
  SIMDVectorFloat tmp0 = _mm_min_ps(v, _mm_movehl_ps(v, v));
  SIMDVectorFloat tmp1 = _mm_min_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
  return _mm_cvtss_f32(tmp1);
}
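
// Usage sketch (illustrative, not part of the library API): the horizontal sum
// turns an elementwise product into a 4-float dot product.
inline float exampleDot4(SIMDVectorFloat a, SIMDVectorFloat b)
{
  return vecSumH(vecMul(a, b));
}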

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);

/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

/* natural logarithm computed for 4 simultaneous floats;
 returns NaN for x <= 0
 */
inline SIMDVectorFloat vecLog(SIMDVectorFloat x)
{
  SIMDVectorInt emm0;
  SIMDVectorFloat one = *(SIMDVectorFloat*)_ps_1;
  SIMDVectorFloat invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(SIMDVectorFloat*)_ps_min_norm_pos); /* cut off denormalized stuff */

  emm0 = _mm_srli_epi32(VecF2I(x), 23);

  /* keep only the fractional part */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(SIMDVectorFloat*)_ps_0p5);

  emm0 = _mm_sub_epi32(emm0, *(SIMDVectorInt*)_pi32_0x7f);
  SIMDVectorFloat e = _mm_cvtepi32_ps(emm0);

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  SIMDVectorFloat mask = _mm_cmplt_ps(x, *(SIMDVectorFloat*)_ps_cephes_SQRTHF);
  SIMDVectorFloat tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  SIMDVectorFloat z = _mm_mul_ps(x, x);

  SIMDVectorFloat y = *(SIMDVectorFloat*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(SIMDVectorFloat*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(SIMDVectorFloat*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  // negative arg will be NAN
  return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

inline SIMDVectorFloat vecExp(SIMDVectorFloat x)
{
  SIMDVectorFloat tmp = _mm_setzero_ps(), fx;
  SIMDVectorInt emm0;
  SIMDVectorFloat one = *(SIMDVectorFloat*)_ps_1;

  x = _mm_min_ps(x, *(SIMDVectorFloat*)_ps_exp_hi);
  x = _mm_max_ps(x, *(SIMDVectorFloat*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(SIMDVectorFloat*)_ps_0p5);

  /* perform a floorf with SSE: truncate, then correct just below */
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);

  /* if greater, subtract 1 */
  SIMDVectorFloat mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(SIMDVectorFloat*)_ps_cephes_exp_C1);
  SIMDVectorFloat z = _mm_mul_ps(fx, *(SIMDVectorFloat*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);
  z = _mm_mul_ps(x, x);

  SIMDVectorFloat y = *(SIMDVectorFloat*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(SIMDVectorInt*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  SIMDVectorFloat pow2n = VecI2F(emm0);

  y = _mm_mul_ps(y, pow2n);
  return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI

/*
 Julien Pommier:
 The code is an exact rewriting of the cephes sinf function.
 Precision is excellent as long as x < 8192 (I did not bother to
 take into account the special handling they have for greater values
 -- it does not return garbage for arguments over 8192, though, but
 the extra precision is missing).

 Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 surprising but correct result.

 Performance is also surprisingly good: 1.33 times faster than the
 macos vsinf SSE2 function, and 1.5 times faster than the
 __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
 too bad for an SSE1 function (with no special tuning)!
 However, the latter libraries probably have much better handling of NaN,
 Inf, denormalized and other special arguments.

 On my core 1 duo, the execution of this function takes approximately 95 cycles.

 From what I have observed in experiments with the Intel AMath lib, switching to
 an SSE2 version would improve the perf by only 10%.

 Since it is based on SSE intrinsics, it has to be compiled at -O2 to
 deliver full speed.
 */
inline SIMDVectorFloat vecSin(SIMDVectorFloat x)
{
  SIMDVectorFloat xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
  SIMDVectorInt emm0, emm2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(SIMDVectorFloat*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
   there is one polynomial for 0 <= x <= Pi/4
   and another one for Pi/4 < x <= Pi/2.
   Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  SIMDVectorFloat swap_sign_bit = VecI2F(emm0);
  SIMDVectorFloat poly_mask = VecI2F(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (the cosine approximation) */
  y = *(SIMDVectorFloat*)_ps_coscof_p0;
  SIMDVectorFloat z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}

/* almost the same as vecSin */
inline SIMDVectorFloat vecCos(SIMDVectorFloat x)
{
  SIMDVectorFloat xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
  SIMDVectorInt emm0, emm2;

  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm2 = _mm_sub_epi32(emm2, *(SIMDVectorInt*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  SIMDVectorFloat sign_bit = VecI2F(emm0);
  SIMDVectorFloat poly_mask = VecI2F(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (the cosine approximation) */
  y = *(SIMDVectorFloat*)_ps_coscof_p0;
  SIMDVectorFloat z = _mm_mul_ps(x, x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since vecSin and vecCos are almost identical, vecSinCos could replace both:
 it is almost as fast, and gives you a free cosine with your sine */
inline void vecSinCos(SIMDVectorFloat x, SIMDVectorFloat* s, SIMDVectorFloat* c)
{
  SIMDVectorFloat xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  SIMDVectorInt emm0, emm2, emm4;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(SIMDVectorFloat*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(SIMDVectorFloat*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(SIMDVectorFloat*)_ps_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(SIMDVectorInt*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  SIMDVectorFloat swap_sign_bit_sin = VecI2F(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(SIMDVectorInt*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  SIMDVectorFloat poly_mask = VecI2F(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(SIMDVectorFloat*)_ps_minus_cephes_DP1;
  xmm2 = *(SIMDVectorFloat*)_ps_minus_cephes_DP2;
  xmm3 = *(SIMDVectorFloat*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  emm4 = _mm_sub_epi32(emm4, *(SIMDVectorInt*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(SIMDVectorInt*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  SIMDVectorFloat sign_bit_cos = VecI2F(emm4);

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (the cosine approximation) */
  SIMDVectorFloat z = _mm_mul_ps(x, x);
  y = *(SIMDVectorFloat*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  SIMDVectorFloat tmp = _mm_mul_ps(z, *(SIMDVectorFloat*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(SIMDVectorFloat*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  SIMDVectorFloat y2 = *(SIMDVectorFloat*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(SIMDVectorFloat*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  SIMDVectorFloat ysin2 = _mm_and_ps(xmm3, y2);
  SIMDVectorFloat ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
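
// Usage sketch (illustrative, not part of the library API): one call yields
// matched sine and cosine vectors, e.g. for a quadrature oscillator:
// SIMDVectorFloat s, c;
// vecSinCos(phases, &s, &c);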

#define STATIC_M128_CONST(name, val) static constexpr __m128 name = {val, val, val, val};

// fast polynomial approximations
// from scalar code by Jacques-Henri Jourdan <jourgun@gmail.com>
// sin and cos valid from -pi to pi
// exp and log polynomials generated using Sollya http://sollya.gforge.inria.fr/
// exp generated in Sollya with:
// > f=remez(1-x*exp(-(x-1)*log(2)),
// [|1,(x-1)*(x-2), (x-1)*(x-2)*x, (x-1)*(x-2)*x*x|],
// [1,2], exp(-(x-1)*log(2)));
// > plot(exp((x-1)*log(2))/(f+x)-1, [1,2]);
// > f+x;
//
// log generated in Sollya using:
// f = remez(log(x)-(x-1)*log(2),
// [|1,(x-1)*(x-2), (x-1)*(x-2)*x, (x-1)*(x-2)*x*x,
// (x-1)*(x-2)*x*x*x|], [1,2], 1, 1e-8);
// > plot(f+(x-1)*log(2)-log(x), [1,2]);
// > f+(x-1)*log(2)

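// The approximations below evaluate these polynomials in Horner form: each
// nested _mm_add_ps / _mm_mul_ps pair computes one step of
// c1 + u*(c2 + u*(c3 + u*(c4 + u*c5))), where u is x*x for sin and cos.
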
STATIC_M128_CONST(kSinC1Vec, 0.99997937679290771484375f);
STATIC_M128_CONST(kSinC2Vec, -0.166624367237091064453125f);
STATIC_M128_CONST(kSinC3Vec, 8.30897875130176544189453125e-3f);
STATIC_M128_CONST(kSinC4Vec, -1.92649182281456887722015380859375e-4f);
STATIC_M128_CONST(kSinC5Vec, 2.147840177713078446686267852783203125e-6f);

inline __m128 vecSinApprox(__m128 x)
{
  __m128 x2 = _mm_mul_ps(x, x);
  return _mm_mul_ps(
      x, _mm_add_ps(
             kSinC1Vec,
             _mm_mul_ps(
                 x2,
                 _mm_add_ps(
                     kSinC2Vec,
                     _mm_mul_ps(
                         x2, _mm_add_ps(kSinC3Vec,
                                        _mm_mul_ps(x2, _mm_add_ps(kSinC4Vec,
                                                                  _mm_mul_ps(x2, kSinC5Vec)))))))));
}
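
// A range-reduction sketch (illustrative, not part of the library API): the
// polynomial is only valid on [-pi, pi], so arguments of moderate size (within
// the int conversion range) can be wrapped into that interval first by
// subtracting a whole number of turns.
inline SIMDVectorFloat exampleSinAnyPhase(SIMDVectorFloat x)
{
  const SIMDVectorFloat kInvTwoPi = vecSet1(0.159154943091895336f);  // 1/(2*pi)
  const SIMDVectorFloat kTwoPi = vecSet1(6.283185307179586f);        // 2*pi
  // round x/(2*pi) to the nearest integer, then remove that many turns
  SIMDVectorFloat turns = vecIntToFloat(vecFloatToIntRound(vecMul(x, kInvTwoPi)));
  return vecSinApprox(vecSub(x, vecMul(turns, kTwoPi)));
}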

STATIC_M128_CONST(kCosC1Vec, 0.999959766864776611328125f);
STATIC_M128_CONST(kCosC2Vec, -0.4997930824756622314453125f);
STATIC_M128_CONST(kCosC3Vec, 4.1496001183986663818359375e-2f);
STATIC_M128_CONST(kCosC4Vec, -1.33926304988563060760498046875e-3f);
STATIC_M128_CONST(kCosC5Vec, 1.8791708498611114919185638427734375e-5f);

inline SIMDVectorFloat vecCosApprox(SIMDVectorFloat x)
{
  SIMDVectorFloat x2 = _mm_mul_ps(x, x);
  return _mm_add_ps(
      kCosC1Vec,
      _mm_mul_ps(
          x2,
          _mm_add_ps(
              kCosC2Vec,
              _mm_mul_ps(x2, _mm_add_ps(kCosC3Vec,
                                        _mm_mul_ps(x2, _mm_add_ps(kCosC4Vec,
                                                                  _mm_mul_ps(x2, kCosC5Vec))))))));
}

STATIC_M128_CONST(kExpC1Vec, 2139095040.f);
STATIC_M128_CONST(kExpC2Vec, 12102203.1615614f);
STATIC_M128_CONST(kExpC3Vec, 1065353216.f);
STATIC_M128_CONST(kExpC4Vec, 0.510397365625862338668154f);
STATIC_M128_CONST(kExpC5Vec, 0.310670891004095530771135f);
STATIC_M128_CONST(kExpC6Vec, 0.168143436463395944830000f);
STATIC_M128_CONST(kExpC7Vec, -2.88093587581985443087955e-3f);
STATIC_M128_CONST(kExpC8Vec, 1.3671023382430374383648148e-2f);

inline SIMDVectorFloat vecExpApprox(SIMDVectorFloat x)
{
  const SIMDVectorFloat kZeroVec = _mm_setzero_ps();

  SIMDVectorFloat val2, val3, val4;
  SIMDVectorInt val4i;

  val2 = _mm_add_ps(_mm_mul_ps(x, kExpC2Vec), kExpC3Vec);
  val3 = _mm_min_ps(val2, kExpC1Vec);
  val4 = _mm_max_ps(val3, kZeroVec);
  val4i = _mm_cvttps_epi32(val4);

  SIMDVectorFloat xu = _mm_and_ps(VecI2F(val4i), VecI2F(_mm_set1_epi32(0x7F800000)));
  SIMDVectorFloat b = _mm_or_ps(_mm_and_ps(VecI2F(val4i), VecI2F(_mm_set1_epi32(0x7FFFFF))),
                                VecI2F(_mm_set1_epi32(0x3F800000)));
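
  // xu keeps only the exponent bits of the scaled value, giving a power of two;
  // b pastes the exponent of 1.0f onto the remaining mantissa bits, mapping
  // them onto [1, 2) for the polynomial correction below.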
  return _mm_mul_ps(
      xu,
      (_mm_add_ps(
          kExpC4Vec,
          _mm_mul_ps(
              b, _mm_add_ps(
                     kExpC5Vec,
                     _mm_mul_ps(
                         b, _mm_add_ps(kExpC6Vec,
                                       _mm_mul_ps(b, _mm_add_ps(kExpC7Vec,
                                                                _mm_mul_ps(b, kExpC8Vec))))))))));
}

STATIC_M128_CONST(kLogC1Vec, -89.970756366f);
STATIC_M128_CONST(kLogC2Vec, 3.529304993f);
STATIC_M128_CONST(kLogC3Vec, -2.461222105f);
STATIC_M128_CONST(kLogC4Vec, 1.130626167f);
STATIC_M128_CONST(kLogC5Vec, -0.288739945f);
STATIC_M128_CONST(kLogC6Vec, 3.110401639e-2f);
STATIC_M128_CONST(kLogC7Vec, 0.69314718055995f);

inline SIMDVectorFloat vecLogApprox(SIMDVectorFloat val)
{
  SIMDVectorInt vZero = _mm_setzero_si128();
  SIMDVectorInt valAsInt = VecF2I(val);
  SIMDVectorInt expi = _mm_srli_epi32(valAsInt, 23);
  SIMDVectorFloat addcst =
      vecSelect(kLogC1Vec, _mm_set1_ps(FLT_MIN), VecF2I(_mm_cmpgt_ps(val, VecI2F(vZero))));
  SIMDVectorInt valAsIntMasked =
      VecF2I(_mm_or_ps(_mm_and_ps(VecI2F(valAsInt), VecI2F(_mm_set1_epi32(0x7FFFFF))),
                       VecI2F(_mm_set1_epi32(0x3F800000))));
  SIMDVectorFloat x = VecI2F(valAsIntMasked);

  SIMDVectorFloat poly = _mm_mul_ps(
      x, _mm_add_ps(
             kLogC2Vec,
             _mm_mul_ps(
                 x, _mm_add_ps(
                        kLogC3Vec,
                        _mm_mul_ps(
                            x, _mm_add_ps(kLogC4Vec,
                                          _mm_mul_ps(x, _mm_add_ps(kLogC5Vec,
                                                                   _mm_mul_ps(x, kLogC6Vec)))))))));

  SIMDVectorFloat addCstResult = _mm_add_ps(addcst, _mm_mul_ps(kLogC7Vec, _mm_cvtepi32_ps(expi)));
  return _mm_add_ps(poly, addCstResult);
}
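
// note: vecLogApprox does no special-case handling; the addcst selection above
// only keeps results finite for inputs <= 0, whose values are meaningless.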

inline SIMDVectorFloat vecIntPart(SIMDVectorFloat val)
{
  SIMDVectorInt vi = _mm_cvttps_epi32(val);  // convert with truncate
  return (_mm_cvtepi32_ps(vi));
}

inline SIMDVectorFloat vecFracPart(SIMDVectorFloat val)
{
  SIMDVectorInt vi = _mm_cvttps_epi32(val);  // convert with truncate
  SIMDVectorFloat intPart = _mm_cvtepi32_ps(vi);
  return _mm_sub_ps(val, intPart);
}
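
// note: both helpers truncate toward zero, so for negative inputs the integer
// part rounds up and the fractional part is negative:
// vecIntPart(-1.25) = -1.0 and vecFracPart(-1.25) = -0.25.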