// =============================================================================
// PROJECT CHRONO - http://projectchrono.org
//
// Copyright (c) 2021 projectchrono.org
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be found
// in the LICENSE file at the top level of the distribution and at
// http://projectchrono.org/license-chrono.txt.
//
// =============================================================================
// Authors: Hammad Mazhar, Radu Serban
// =============================================================================

#include "chrono/multicore_math/simd.h"
#include "chrono/multicore_math/real.h"
#include "chrono/multicore_math/real3.h"
#include "chrono/multicore_math/real4.h"

using namespace chrono;

namespace simd {

// http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html
static const __m128 NEGATEMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));  // sign bit set in every lane
static const __m128 ABSMASK = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));     // all bits except the sign bit
static const __m128 REAL3MASK = _mm_castsi128_ps(_mm_setr_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000));  // keep x, y, z; zero w

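// Flip the sign of the lanes selected by the compile-time flags i0..i3 by XOR-ing in the
// IEEE-754 sign bit; lanes whose flag is zero pass through unchanged.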
template <int i0, int i1, int i2, int i3>
static __m128 change_sign(__m128 a) {
    if ((i0 | i1 | i2 | i3) == 0) {
        return a;
    }
    __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0,
                                                  i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0));

    __m128 res = _mm_xor_ps(a, mask);

    return res;
}

inline __m128 Add(__m128 a, __m128 b) {
    return _mm_add_ps(a, b);
}

inline __m128 Sub(__m128 a, __m128 b) {
    return _mm_sub_ps(a, b);
}

inline __m128 Mul(__m128 a, __m128 b) {
    return _mm_mul_ps(a, b);
}

inline __m128 Div(__m128 a, __m128 b) {
    return _mm_div_ps(a, b);
}

inline __m128 Div3(__m128 a, __m128 b) {
    return _mm_and_ps(_mm_div_ps(a, b), REAL3MASK);
}

inline __m128 Negate(__m128 a) {
    return _mm_xor_ps(a, NEGATEMASK);
}

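// Dot products via _mm_dp_ps: the high nibble of the mask selects the lanes that enter the
// product (0x7 -> lanes 0-2, 0xF -> all four) and the low nibble (0x1) writes the sum to
// lane 0 only, which _mm_cvtss_f32 then extracts.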
inline real Dot3(__m128 a) {
    return _mm_cvtss_f32(_mm_dp_ps(a, a, 0x71));
}

inline real Dot3(__m128 a, __m128 b) {
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}

inline real Dot4(__m128 a) {
    return _mm_cvtss_f32(_mm_dp_ps(a, a, 0xF1));
}

inline real Dot4(__m128 a, __m128 b) {
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}

inline __m128 SquareRoot(__m128 v) {
    return _mm_sqrt_ps(v);
}

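// Cross product via lane shuffles: a.yzx * b.zxy - a.zxy * b.yzx. Lane 3 evaluates to
// a.w * b.w - a.w * b.w = 0; Cross3 additionally masks it to guarantee a clean w component.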
inline __m128 Cross(__m128 a, __m128 b) {
    return _mm_sub_ps(
        _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2))),
        _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))));
}

inline __m128 Cross3(__m128 a, __m128 b) {
    __m128 tmp = _mm_sub_ps(
        _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2))),
        _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))));
    return _mm_and_ps(tmp, REAL3MASK);
}

inline __m128 Normalize3(__m128 v) {
    real t = Dot3(v);
    real dp = InvSqrt(t);
    __m128 tmp = _mm_mul_ps(v, _mm_set1_ps(dp));
    return _mm_and_ps(tmp, REAL3MASK);
}

inline __m128 Normalize(__m128 v) {
    real t = Dot4(v);
    real dp = InvSqrt(t);
    return _mm_mul_ps(v, _mm_set1_ps(dp));
}

inline __m128 Max(__m128 v1, __m128 v2) {
    return _mm_max_ps(v1, v2);
}

inline __m128 Min(__m128 v1, __m128 v2) {
    return _mm_min_ps(v1, v2);
}

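// Horizontal reductions: _mm_movehl_ps copies lanes {2,3} down into lanes {0,1}, a packed
// min/max folds the two halves together, and a final scalar min/max against lane 1 leaves
// the reduction of all four lanes in lane 0.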
inline real Min(__m128 x) {
    __m128 low = _mm_movehl_ps(x, x);                                             /* [2, 3, 2, 3] */
    __m128 low_accum = _mm_min_ps(low, x);                                        /* [0|2, 1|3, 2|2, 3|3] */
    __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
    __m128 accum = _mm_min_ss(low_accum, elem1);
    return _mm_cvtss_f32(accum);
}

inline real Max(__m128 x) {
    __m128 low = _mm_movehl_ps(x, x);                                             /* [2, 3, 2, 3] */
    __m128 low_accum = _mm_max_ps(low, x);                                        /* [0|2, 1|3, 2|2, 3|3] */
    __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
    __m128 accum = _mm_max_ss(low_accum, elem1);
    return _mm_cvtss_f32(accum);
}

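// Three-lane reductions: _mm_permute_ps(a, 6) rearranges the register to [a2, a1, a0, a0],
// overwriting the 4th lane with a copy of lane 0 so the 4-wide reduction ignores the
// original w component.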
inline real Min3(__m128 a) {
    __m128 x = _mm_permute_ps(a, 6);                                              // overwrite the 4th lane with a copy of lane 0
    __m128 low = _mm_movehl_ps(x, x);                                             /* [2, 3, 2, 3] */
    __m128 low_accum = _mm_min_ps(low, x);                                        /* [0|2, 1|3, 2|2, 3|3] */
    __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
    __m128 accum = _mm_min_ss(low_accum, elem1);
    return _mm_cvtss_f32(accum);
}

inline real Max3(__m128 a) {
    __m128 x = _mm_permute_ps(a, 6);                                              // overwrite the 4th lane with a copy of lane 0
    __m128 low = _mm_movehl_ps(x, x);                                             /* [2, 3, 2, 3] */
    __m128 low_accum = _mm_max_ps(low, x);                                        /* [0|2, 1|3, 2|2, 3|3] */
    __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
    __m128 accum = _mm_max_ss(low_accum, elem1);
    return _mm_cvtss_f32(accum);
}

inline __m128 Abs(__m128 v) {
    return _mm_and_ps(v, ABSMASK);
}

inline __m128 Round(__m128 a) {
    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
}

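// Hamilton quaternion product, with the scalar part in lane 0 and the vector part in
// lanes 1-3. The shuffled partial products accumulate the standard terms; change_sign<1,0,0,0>
// negates only the lane-0 contribution so the scalar lane becomes a0*b0 - a1*b1 - a2*b2 - a3*b3.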
inline __m128 QuatMult(__m128 a, __m128 b) {
    __m128 a1123 = _mm_shuffle_ps(a, a, 0xE5);
    __m128 a2231 = _mm_shuffle_ps(a, a, 0x7A);
    __m128 b1000 = _mm_shuffle_ps(b, b, 0x01);
    __m128 b2312 = _mm_shuffle_ps(b, b, 0x9E);
    __m128 t1 = _mm_mul_ps(a1123, b1000);
    __m128 t2 = _mm_mul_ps(a2231, b2312);
    __m128 t12 = _mm_add_ps(t1, t2);
    __m128 t12m = change_sign<1, 0, 0, 0>(t12);
    __m128 a3312 = _mm_shuffle_ps(a, a, 0x9F);
    __m128 b3231 = _mm_shuffle_ps(b, b, 0x7B);
    __m128 a0000 = _mm_shuffle_ps(a, a, 0x00);
    __m128 t3 = _mm_mul_ps(a3312, b3231);
    __m128 t0 = _mm_mul_ps(a0000, b);
    __m128 t03 = _mm_sub_ps(t0, t3);
    return _mm_add_ps(t03, t12m);
}

// inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c) {
//    __m128 u1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
//    __m128 u2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
//    __m128 u3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
//
//    __m128 prod1 = _mm_mul_ps(u1, a);
//    __m128 prod2 = _mm_mul_ps(u2, b);
//    __m128 prod3 = _mm_mul_ps(u3, c);
//
//    return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, _mm_set1_ps(0)));
//}

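// Batched dot products: each _mm_hadd_ps sums adjacent lane pairs, so two rounds collapse
// the element-wise products and pack dot(v,a), dot(v,b), dot(v,c) (and dot(v,d) or 0)
// into lanes 0-3 of a single register.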
inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c) {
    __m128 prod1 = _mm_mul_ps(a, v);
    __m128 prod2 = _mm_mul_ps(b, v);
    __m128 prod3 = _mm_mul_ps(c, v);
    return _mm_hadd_ps(_mm_hadd_ps(prod1, prod2), _mm_hadd_ps(prod3, _mm_set1_ps(0)));
}

inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c, __m128 d) {
    __m128 prod1 = _mm_mul_ps(a, v);
    __m128 prod2 = _mm_mul_ps(b, v);
    __m128 prod3 = _mm_mul_ps(c, v);
    __m128 prod4 = _mm_mul_ps(d, v);
    return _mm_hadd_ps(_mm_hadd_ps(prod1, prod2), _mm_hadd_ps(prod3, prod4));
}

inline real HorizontalAdd(real4 a) {
    return a[0] + a[1] + a[2] + a[3];
}

inline real HorizontalAdd(real3 a) {
    return a[0] + a[1] + a[2];
}

inline bool IsZero(const real3& v, const real& a) {
    return chrono::Abs(v.x) < a && chrono::Abs(v.y) < a && chrono::Abs(v.z) < a;
}

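// Integer (__m128i) counterparts of the basic arithmetic and min/max helpers.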
inline __m128i Set(int x) {
    return _mm_set1_epi32(x);
}

inline __m128i Sub(__m128i a, __m128i b) {
    return _mm_sub_epi32(a, b);
}

inline __m128i Add(__m128i a, __m128i b) {
    return _mm_add_epi32(a, b);
}

inline __m128i Max(__m128i a, __m128i b) {
    return _mm_max_epi32(a, b);
}

inline __m128i Min(__m128i a, __m128i b) {
    return _mm_min_epi32(a, b);
}

}  // end namespace simd