1 // =============================================================================
2 // PROJECT CHRONO - http://projectchrono.org
3 //
4 // Copyright (c) 2021 projectchrono.org
5 // All rights reserved.
6 //
7 // Use of this source code is governed by a BSD-style license that can be found
8 // in the LICENSE file at the top level of the distribution and at
9 // http://projectchrono.org/license-chrono.txt.
10 //
11 // =============================================================================
12 // Authors: Hammad Mazhar, Radu Serban
13 // =============================================================================
14
15 #include "chrono/multicore_math/simd.h"
16 #include "chrono/multicore_math/real.h"
17 #include "chrono/multicore_math/real3.h"
18 #include "chrono/multicore_math/real4.h"
19
20 using namespace chrono;
21
22 namespace simd {
23
// Bit masks used for branch-free sign manipulation; see
// http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html
// The hex constants are explicitly cast to int: passing 0x80000000 / 0xffffffff
// (unsigned) to _mm_set1_epi32/_mm_setr_epi32 (which take int) is an
// implementation-defined conversion before C++20.
static const __m128 NEGATEMASK =
    _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));  // XOR flips every lane's sign bit
static const __m128 ABSMASK =
    _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x7FFFFFFF)));  // AND clears every lane's sign bit
static const __m128 REAL3MASK = _mm_castsi128_ps(_mm_setr_epi32(static_cast<int>(0xffffffff),
                                                                static_cast<int>(0xffffffff),
                                                                static_cast<int>(0xffffffff),
                                                                0x00000000));  // AND keeps lanes 0-2, zeros lane 3
28
/// Flip the sign of the lanes selected by a nonzero template argument.
/// With an all-zero selection the input is returned untouched.
template <int i0, int i1, int i2, int i3>
static __m128 change_sign(__m128 a) {
    if ((i0 | i1 | i2 | i3) == 0) {
        return a;  // nothing selected: avoid the XOR entirely
    }
    // Build a mask with the sign bit set only in the selected lanes.
    const __m128 sign_bits = _mm_castsi128_ps(_mm_setr_epi32(i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0,
                                                             i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0));
    return _mm_xor_ps(a, sign_bits);
}
41
/// Lane-wise sum of two 4-float vectors.
inline __m128 Add(__m128 a, __m128 b) {
    const __m128 sum = _mm_add_ps(a, b);
    return sum;
}
45
/// Lane-wise difference a - b of two 4-float vectors.
inline __m128 Sub(__m128 a, __m128 b) {
    const __m128 difference = _mm_sub_ps(a, b);
    return difference;
}
49
/// Lane-wise product of two 4-float vectors.
inline __m128 Mul(__m128 a, __m128 b) {
    const __m128 product = _mm_mul_ps(a, b);
    return product;
}
53
/// Lane-wise quotient a / b of two 4-float vectors.
inline __m128 Div(__m128 a, __m128 b) {
    const __m128 quotient = _mm_div_ps(a, b);
    return quotient;
}
57
Div3(__m128 a,__m128 b)58 inline __m128 Div3(__m128 a, __m128 b) {
59 return _mm_and_ps(_mm_div_ps(a, b), REAL3MASK);
60 }
61
Negate(__m128 a)62 inline __m128 Negate(__m128 a) {
63 return _mm_xor_ps(a, NEGATEMASK);
64 }
65
Dot3(__m128 a)66 inline real Dot3(__m128 a) {
67 return _mm_cvtss_f32(_mm_dp_ps(a, a, 0x71));
68 }
69
Dot3(__m128 a,__m128 b)70 inline real Dot3(__m128 a, __m128 b) {
71 return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
72 }
73
Dot4(__m128 a)74 inline real Dot4(__m128 a) {
75 return _mm_cvtss_f32(_mm_dp_ps(a, a, 0xF1));
76 }
77
Dot4(__m128 a,__m128 b)78 inline real Dot4(__m128 a, __m128 b) {
79 return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
80 }
81
/// Lane-wise square root.
inline __m128 SquareRoot(__m128 v) {
    const __m128 root = _mm_sqrt_ps(v);
    return root;
}
85
/// 3D cross product a x b, computed via the shuffle identity
/// (a.yzx * b.zxy) - (a.zxy * b.yzx); lane 3 carries a3*b3 - a3*b3 = 0
/// for finite inputs.
inline __m128 Cross(__m128 a, __m128 b) {
    const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
    const __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));
    return _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
}
91
Cross3(__m128 a,__m128 b)92 inline __m128 Cross3(__m128 a, __m128 b) {
93 __m128 tmp = _mm_sub_ps(
94 _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2))),
95 _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))));
96 return _mm_and_ps(tmp, REAL3MASK);
97 }
98
Normalize3(__m128 v)99 inline __m128 Normalize3(__m128 v) {
100 real t = Dot3(v);
101 real dp = InvSqrt(t);
102 __m128 tmp = _mm_mul_ps(v, _mm_set1_ps(dp));
103 return _mm_and_ps(tmp, REAL3MASK);
104 }
105
Normalize(__m128 v)106 inline __m128 Normalize(__m128 v) {
107 real t = Dot4(v);
108 real dp = InvSqrt(t);
109 return _mm_mul_ps(v, _mm_set1_ps(dp));
110 }
111
/// Lane-wise maximum of two 4-float vectors.
inline __m128 Max(__m128 v1, __m128 v2) {
    const __m128 maxima = _mm_max_ps(v1, v2);
    return maxima;
}
115
/// Lane-wise minimum of two 4-float vectors.
inline __m128 Min(__m128 v1, __m128 v2) {
    const __m128 minima = _mm_min_ps(v1, v2);
    return minima;
}
119
Min(__m128 x)120 inline real Min(__m128 x) {
121 __m128 low = _mm_movehl_ps(x, x); /* [2, 3, 2, 3] */
122 __m128 low_accum = _mm_min_ps(low, x); /* [0|2, 1|3, 2|2, 3|3] */
123 __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
124 __m128 accum = _mm_min_ss(low_accum, elem1);
125 return _mm_cvtss_f32(accum);
126 }
127
Max(__m128 x)128 inline real Max(__m128 x) {
129 __m128 low = _mm_movehl_ps(x, x); /* [2, 3, 2, 3] */
130 __m128 low_accum = _mm_max_ps(low, x); /* [0|2, 1|3, 2|2, 3|3] */
131 __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
132 __m128 accum = _mm_max_ss(low_accum, elem1);
133 return _mm_cvtss_f32(accum);
134 }
135
Min3(__m128 a)136 inline real Min3(__m128 a) {
137 __m128 x = _mm_permute_ps(a, 6); // copy over 4th value
138 __m128 low = _mm_movehl_ps(x, x); /* [2, 3, 2, 3] */
139 __m128 low_accum = _mm_min_ps(low, x); /* [0|2, 1|3, 2|2, 3|3] */
140 __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
141 __m128 accum = _mm_min_ss(low_accum, elem1);
142 return _mm_cvtss_f32(accum);
143 }
144
Max3(__m128 a)145 inline real Max3(__m128 a) {
146 __m128 x = _mm_permute_ps(a, 6); // copy over 4th value
147 __m128 low = _mm_movehl_ps(x, x); /* [2, 3, 2, 3] */
148 __m128 low_accum = _mm_max_ps(low, x); /* [0|2, 1|3, 2|2, 3|3] */
149 __m128 elem1 = _mm_shuffle_ps(low_accum, low_accum, _MM_SHUFFLE(1, 1, 1, 1)); /* [1|3, 1|3, 1|3, 1|3] */
150 __m128 accum = _mm_max_ss(low_accum, elem1);
151 return _mm_cvtss_f32(accum);
152 }
153
Abs(__m128 v)154 inline __m128 Abs(__m128 v) {
155 return _mm_and_ps(v, ABSMASK);
156 }
157
/// Lane-wise rounding to the nearest integer (round-half-to-even).
inline __m128 Round(__m128 a) {
    const __m128 rounded = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
    return rounded;
}
161
QuatMult(__m128 a,__m128 b)162 inline __m128 QuatMult(__m128 a, __m128 b) {
163 __m128 a1123 = _mm_shuffle_ps(a, a, 0xE5);
164 __m128 a2231 = _mm_shuffle_ps(a, a, 0x7A);
165 __m128 b1000 = _mm_shuffle_ps(b, b, 0x01);
166 __m128 b2312 = _mm_shuffle_ps(b, b, 0x9E);
167 __m128 t1 = _mm_mul_ps(a1123, b1000);
168 __m128 t2 = _mm_mul_ps(a2231, b2312);
169 __m128 t12 = _mm_add_ps(t1, t2);
170 __m128 t12m = change_sign<1, 0, 0, 0>(t12);
171 __m128 a3312 = _mm_shuffle_ps(a, a, 0x9F);
172 __m128 b3231 = _mm_shuffle_ps(b, b, 0x7B);
173 __m128 a0000 = _mm_shuffle_ps(a, a, 0x00);
174 __m128 t3 = _mm_mul_ps(a3312, b3231);
175 __m128 t0 = _mm_mul_ps(a0000, b);
176 __m128 t03 = _mm_sub_ps(t0, t3);
177 return _mm_add_ps(t03, t12m);
178 }
179
180 // inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c) {
181 // __m128 u1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
182 // __m128 u2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
183 // __m128 u3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
184 //
185 // __m128 prod1 = _mm_mul_ps(u1, a);
186 // __m128 prod2 = _mm_mul_ps(u2, b);
187 // __m128 prod3 = _mm_mul_ps(u3, c);
188 //
189 // return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, _mm_set1_ps(0)));
190 //}
191
/// Three simultaneous 4-lane dot products: returns (v.a, v.b, v.c, 0).
/// Used to multiply a 3x4 matrix (rows a, b, c) by the vector v.
inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c) {
    const __m128 va = _mm_mul_ps(a, v);
    const __m128 vb = _mm_mul_ps(b, v);
    const __m128 vc = _mm_mul_ps(c, v);
    // Two rounds of horizontal adds collapse each product into its own lane;
    // a zero vector pads the unused 4th slot.
    const __m128 ab = _mm_hadd_ps(va, vb);
    const __m128 c0 = _mm_hadd_ps(vc, _mm_setzero_ps());
    return _mm_hadd_ps(ab, c0);
}
198
/// Four simultaneous 4-lane dot products: returns (v.a, v.b, v.c, v.d).
/// Used to multiply a 4x4 matrix (rows a, b, c, d) by the vector v.
inline __m128 Dot4(__m128 v, __m128 a, __m128 b, __m128 c, __m128 d) {
    const __m128 va = _mm_mul_ps(a, v);
    const __m128 vb = _mm_mul_ps(b, v);
    const __m128 vc = _mm_mul_ps(c, v);
    const __m128 vd = _mm_mul_ps(d, v);
    // Two rounds of horizontal adds collapse each product into its own lane.
    return _mm_hadd_ps(_mm_hadd_ps(va, vb), _mm_hadd_ps(vc, vd));
}
206
HorizontalAdd(real4 a)207 inline real HorizontalAdd(real4 a) {
208 return a[0] + a[1] + a[2] + a[3];
209 }
210
HorizontalAdd(real3 a)211 inline real HorizontalAdd(real3 a) {
212 return a[0] + a[1] + a[2];
213 }
214
IsZero(const real3 & v,const real & a)215 inline bool IsZero(const real3& v, const real& a) {
216 return chrono::Abs(v.x) < a && chrono::Abs(v.y) < a && chrono::Abs(v.z) < a;
217 }
218
/// Broadcast the integer x into all four lanes.
inline __m128i Set(int x) {
    const __m128i broadcast = _mm_set1_epi32(x);
    return broadcast;
}
222
/// Lane-wise difference a - b of two 4-int vectors.
inline __m128i Sub(__m128i a, __m128i b) {
    const __m128i difference = _mm_sub_epi32(a, b);
    return difference;
}
226
/// Lane-wise sum of two 4-int vectors.
inline __m128i Add(__m128i a, __m128i b) {
    const __m128i sum = _mm_add_epi32(a, b);
    return sum;
}
230
/// Lane-wise signed maximum of two 4-int vectors (SSE4.1).
inline __m128i Max(__m128i a, __m128i b) {
    const __m128i maxima = _mm_max_epi32(a, b);
    return maxima;
}
234
/// Lane-wise signed minimum of two 4-int vectors (SSE4.1).
inline __m128i Min(__m128i a, __m128i b) {
    const __m128i minima = _mm_min_epi32(a, b);
    return minima;
}
238
239 } // end namespace simd
240