1 /*
2 * Copyright (c) 2007-2012 Hypertriton, Inc. <http://hypertriton.com/>
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
17 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
22 * USE OF THIS SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 */
24
25 /*
26 * Operations on 4x4 matrices using Streaming SIMD Extensions.
27 */
28
29 #include <agar/config/have_sse.h>
30 #ifdef HAVE_SSE
31
32 #include <agar/core/core.h>
33 #include <agar/math/m.h>
34
35 const M_MatrixOps44 mMatOps44_SSE = {
36 "sse",
37 M_MatrixZero44_SSE, /* -34 clks */
38 M_MatrixZero44v_SSE, /* +25 clks */
39 M_MatrixIdentity44_SSE, /* -13 clks */
40 M_MatrixIdentity44v_SSE, /* -8 clks */
41 M_MatrixTranspose44_SSE, /* +4 clks */
42 M_MatrixTranspose44p_SSE, /* -22 clks */
43 M_MatrixTranspose44v_SSE, /* -28 clks */
44 M_MatrixInvert44_SSE, /* -295 clks */
45 M_MatrixInvertElim44_FPU,
46 M_MatrixMult44_SSE, /* -61 clks */
47 M_MatrixMult44v_SSE, /* -59 clks */
48 M_MatrixMultVector44_SSE, /* -5 clks */
49 M_MatrixMultVector44p_SSE, /* -10 clks */
50 M_MatrixMultVector44v_SSE, /* -40 clks */
51 M_MatrixCopy44_SSE, /* -3 clks */
52 M_MatrixToFloats44_FPU,
53 M_MatrixToDoubles44_FPU,
54 M_MatrixFromFloats44_FPU,
55 M_MatrixFromDoubles44_FPU,
56 M_MatrixRotateAxis44_SSE, /* -1446 clks */
57 M_MatrixOrbitAxis44_FPU,
58 M_MatrixRotateEul44_FPU,
59 M_MatrixRotate44I_SSE, /* -1204 clks */
60 M_MatrixRotate44J_SSE, /* -1204 clks */
61 M_MatrixRotate44K_SSE, /* -1204 clks */
62 M_MatrixTranslatev44_SSE, /* -549 clks */
63 M_MatrixTranslate44_SSE,
64 M_MatrixTranslateX44_SSE, /* -608 clks */
65 M_MatrixTranslateY44_SSE, /* -608 clks */
66 M_MatrixTranslateZ44_SSE, /* -608 clks */
67 M_MatrixScale44_SSE, /* -522 clks */
68 M_MatrixUniScale44_SSE, /* -595 clks */
69 };
70
71 M_Matrix44
M_MatrixInvert44_SSE(M_Matrix44 A)72 M_MatrixInvert44_SSE(M_Matrix44 A)
73 {
74 M_Matrix44 Ainv;
75 float *src = &A.m[0][0];
76 float *dst = &Ainv.m[0][0];
77 __m128 minor0, minor1, minor2, minor3;
78 __m128 row0, row1, row2, row3;
79 __m128 det, tmp1;
80
81 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)),
82 (__m64 *)(src+4));
83 row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)),
84 (__m64 *)(src+12));
85 row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
86 row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
87 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)),
88 (__m64 *)(src+6));
89 row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)),
90 (__m64 *)(src+14));
91 row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
92 row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
93
94 tmp1 = _mm_mul_ps(row2, row3);
95 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
96 minor0 = _mm_mul_ps(row1, tmp1);
97 minor1 = _mm_mul_ps(row0, tmp1);
98 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
99 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
100 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
101 minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
102
103 tmp1 = _mm_mul_ps(row1, row2);
104 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
105 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
106 minor3 = _mm_mul_ps(row0, tmp1);
107 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
108 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
109 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
110 minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
111
112 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
113 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
114 row2 = _mm_shuffle_ps(row2, row2, 0x4E);
115 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
116 minor2 = _mm_mul_ps(row0, tmp1);
117 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
118 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
119 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
120 minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
121
122 tmp1 = _mm_mul_ps(row0, row1);
123 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
124 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
125 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
126 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
127 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
128 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
129
130 tmp1 = _mm_mul_ps(row0, row3);
131 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
132 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
133 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
134 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
135 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
136 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
137
138 tmp1 = _mm_mul_ps(row0, row2);
139 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
140 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
141 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
142 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
143 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
144 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
145
146 det = _mm_mul_ps(row0, minor0);
147 det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
148 det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
149 tmp1 = _mm_rcp_ss(det);
150 det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1),
151 _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1)));
152 det = _mm_shuffle_ps(det, det, 0x00);
153
154 minor0 = _mm_mul_ps(det, minor0);
155 _mm_storel_pi((__m64 *)(dst), minor0);
156 _mm_storeh_pi((__m64 *)(dst+2), minor0);
157
158 minor1 = _mm_mul_ps(det, minor1);
159 _mm_storel_pi((__m64 *)(dst+4), minor1);
160 _mm_storeh_pi((__m64 *)(dst+6), minor1);
161
162 minor2 = _mm_mul_ps(det, minor2);
163 _mm_storel_pi((__m64 *)(dst+8), minor2);
164 _mm_storeh_pi((__m64 *)(dst+10), minor2);
165
166 minor3 = _mm_mul_ps(det, minor3);
167 _mm_storel_pi((__m64 *)(dst+12), minor3);
168 _mm_storeh_pi((__m64 *)(dst+14), minor3);
169
170 return (Ainv);
171 }
172
173 void
M_MatrixRotateAxis44_SSE(M_Matrix44 * M,M_Real theta,M_Vector3 A)174 M_MatrixRotateAxis44_SSE(M_Matrix44 *M, M_Real theta, M_Vector3 A)
175 {
176 float s = sinf((float)theta);
177 float c = cosf((float)theta);
178 float t = 1.0f - c;
179 M_Matrix44 R;
180 __m128 a = A.m128, r1;
181 #ifdef HAVE_SSE3
182 __m128 rC1 = _mm_set_ps(-c, s*A.z, 0.0f, +c); /* 1,3 1,2 3 2 */
183 __m128 rC2 = _mm_set_ps(0.0f, -s*A.y, s*A.y, -s*A.x); /* 1,2 3 1 2,3 */
184 #endif
185
186 /* m1: [t*AxAx + c, t*AxAy + sAz, t*AxAz - sAy, 0] */
187 r1 = _mm_mul_ps(_mm_set1_ps(t), a);
188 r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,0,0,0)));
189 #ifdef HAVE_SSE3
190 R.m1 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,1,2,3)));
191 #else
192 R.m1 = _mm_add_ps(r1, _mm_set_ps(0.0f, -s*A.y, s*A.z, c));
193 #endif
194
195 /* m2: [t*AxAy - sAz, t*AyAy + c, t*AyAz + sAx, 0] */
196 r1 = _mm_mul_ps(_mm_set1_ps(t), _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,1,1,0)));
197 r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,2,1,1)));
198 #ifdef HAVE_SSE3
199 R.m2 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,0,0,2)));
200 #else
201 R.m2 = _mm_add_ps(r1, _mm_set_ps(0.0f, +s*A.x, c, -s*A.z));
202 #endif
203
204 /* m3: [t*AxAz + sAy, t*AyAz - sAx, t*AzAz + c, 0] */
205 r1 = _mm_mul_ps(_mm_set1_ps(t), a);
206 r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,2,2)));
207 #ifdef HAVE_SSE3
208 R.m3 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC2,rC1,_MM_SHUFFLE(1,3,0,2)));
209 #else
210 R.m3 = _mm_add_ps(r1, _mm_set_ps(0.0f, c, -s*A.x, +s*A.y));
211 #endif
212 /* m4: [0, 0, 0, 1] */
213 R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
214
215 M_MatrixMult44v_SSE(M, &R);
216 }
217
218 void
M_MatrixRotate44I_SSE(M_Matrix44 * M,M_Real theta)219 M_MatrixRotate44I_SSE(M_Matrix44 *M, M_Real theta)
220 {
221 float s = sinf((float)theta);
222 float c = cosf((float)theta);
223 M_Matrix44 R;
224
225 R.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
226 R.m2 = _mm_set_ps(0.0f, -s, c, 0.0f);
227 R.m3 = _mm_set_ps(0.0f, c, s, 0.0f);
228 R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
229 M_MatrixMult44v_SSE(M, &R);
230 }
231 void
M_MatrixRotate44J_SSE(M_Matrix44 * M,M_Real theta)232 M_MatrixRotate44J_SSE(M_Matrix44 *M, M_Real theta)
233 {
234 float s = sinf((float)theta);
235 float c = cosf((float)theta);
236 M_Matrix44 R;
237
238 R.m1 = _mm_set_ps(0.0f, s, 0.0f, c);
239 R.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
240 R.m3 = _mm_set_ps(0.0f, c, 0.0f, -s);
241 R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
242 M_MatrixMult44v_SSE(M, &R);
243 }
244 void
M_MatrixRotate44K_SSE(M_Matrix44 * M,M_Real theta)245 M_MatrixRotate44K_SSE(M_Matrix44 *M, M_Real theta)
246 {
247 float s = sinf((float)theta);
248 float c = cosf((float)theta);
249 M_Matrix44 R;
250
251 R.m1 = _mm_set_ps(0.0f, 0.0f, -s, c);
252 R.m2 = _mm_set_ps(0.0f, 0.0f, c, s);
253 R.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
254 R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
255 M_MatrixMult44v_SSE(M, &R);
256 }
257
258 void
M_MatrixTranslate44_SSE(M_Matrix44 * M,M_Real x,M_Real y,M_Real z)259 M_MatrixTranslate44_SSE(M_Matrix44 *M, M_Real x, M_Real y, M_Real z)
260 {
261 M_Matrix44 T;
262 float xf = (float)x;
263 float yf = (float)y;
264 float zf = (float)z;
265
266 T.m1 = _mm_set_ps(xf, 0.0f, 0.0f, 1.0f);
267 T.m2 = _mm_set_ps(yf, 0.0f, 1.0f, 0.0f);
268 T.m3 = _mm_set_ps(zf, 1.0f, 0.0f, 0.0f);
269 T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
270 M_MatrixMult44v_SSE(M, &T);
271 }
272
273 void
M_MatrixTranslatev44_SSE(M_Matrix44 * M,M_Vector3 v)274 M_MatrixTranslatev44_SSE(M_Matrix44 *M, M_Vector3 v)
275 {
276 M_Matrix44 T;
277
278 T.m1 = _mm_set_ps(v.x, 0.0f, 0.0f, 1.0f);
279 T.m2 = _mm_set_ps(v.y, 0.0f, 1.0f, 0.0f);
280 T.m3 = _mm_set_ps(v.z, 1.0f, 0.0f, 0.0f);
281 T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
282 M_MatrixMult44v_SSE(M, &T);
283 }
284 void
M_MatrixTranslateX44_SSE(M_Matrix44 * M,M_Real x)285 M_MatrixTranslateX44_SSE(M_Matrix44 *M, M_Real x)
286 {
287 float xf = (float)x;
288 M_Matrix44 T;
289
290 T.m1 = _mm_set_ps(xf, 0.0f, 0.0f, 1.0f);
291 T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
292 T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
293 T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
294 M_MatrixMult44v_SSE(M, &T);
295 }
296 void
M_MatrixTranslateY44_SSE(M_Matrix44 * M,M_Real y)297 M_MatrixTranslateY44_SSE(M_Matrix44 *M, M_Real y)
298 {
299 float yf = (float)y;
300 M_Matrix44 T;
301
302 T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
303 T.m2 = _mm_set_ps(yf, 0.0f, 1.0f, 0.0f);
304 T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
305 T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
306 M_MatrixMult44v_SSE(M, &T);
307 }
308 void
M_MatrixTranslateZ44_SSE(M_Matrix44 * M,M_Real z)309 M_MatrixTranslateZ44_SSE(M_Matrix44 *M, M_Real z)
310 {
311 float zf = (float)z;
312 M_Matrix44 T;
313
314 T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
315 T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
316 T.m3 = _mm_set_ps(zf, 1.0f, 0.0f, 0.0f);
317 T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
318 M_MatrixMult44v_SSE(M, &T);
319 }
320
321 void
M_MatrixScale44_SSE(M_Matrix44 * M,M_Real x,M_Real y,M_Real z,M_Real w)322 M_MatrixScale44_SSE(M_Matrix44 *M, M_Real x, M_Real y, M_Real z, M_Real w)
323 {
324 float xf = (float)x;
325 float yf = (float)y;
326 float zf = (float)z;
327 float wf = (float)w;
328 M_Matrix44 S;
329
330 S.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, xf);
331 S.m2 = _mm_set_ps(0.0f, 0.0f, yf, 0.0f);
332 S.m3 = _mm_set_ps(0.0f, zf, 0.0f, 0.0f);
333 S.m4 = _mm_set_ps(wf, 0.0f, 0.0f, 0.0f);
334 M_MatrixMult44v_SSE(M, &S);
335 }
336 void
M_MatrixUniScale44_SSE(M_Matrix44 * M,M_Real r)337 M_MatrixUniScale44_SSE(M_Matrix44 *M, M_Real r)
338 {
339 M_Matrix44 S;
340 float f = (float)r;
341
342 S.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, f);
343 S.m2 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,3,0,3));
344 S.m3 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,0,3,3));
345 S.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
346 M_MatrixMult44v_SSE(M, &S);
347 }
348
349 #endif /* HAVE_SSE */
350