1 /*
2  * Copyright (c) 2007-2012 Hypertriton, Inc. <http://hypertriton.com/>
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
17  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
22  * USE OF THIS SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  */
24 
25 /*
26  * Operations on 4x4 matrices using Streaming SIMD Extensions.
27  */
28 
29 #include <agar/config/have_sse.h>
30 #ifdef HAVE_SSE
31 
32 #include <agar/core/core.h>
33 #include <agar/math/m.h>
34 
35 const M_MatrixOps44 mMatOps44_SSE = {
36 	"sse",
37 	M_MatrixZero44_SSE,			/* -34 clks */
38 	M_MatrixZero44v_SSE,			/* +25 clks */
39 	M_MatrixIdentity44_SSE,			/* -13 clks */
40 	M_MatrixIdentity44v_SSE,		/* -8 clks */
41 	M_MatrixTranspose44_SSE,		/* +4 clks */
42 	M_MatrixTranspose44p_SSE,		/* -22 clks */
43 	M_MatrixTranspose44v_SSE,		/* -28 clks */
44 	M_MatrixInvert44_SSE,			/* -295 clks */
45 	M_MatrixInvertElim44_FPU,
46 	M_MatrixMult44_SSE,			/* -61 clks */
47 	M_MatrixMult44v_SSE,			/* -59 clks */
48 	M_MatrixMultVector44_SSE,		/* -5 clks */
49 	M_MatrixMultVector44p_SSE,		/* -10 clks */
50 	M_MatrixMultVector44v_SSE,		/* -40 clks */
51 	M_MatrixCopy44_SSE,			/* -3 clks */
52 	M_MatrixToFloats44_FPU,
53 	M_MatrixToDoubles44_FPU,
54 	M_MatrixFromFloats44_FPU,
55 	M_MatrixFromDoubles44_FPU,
56 	M_MatrixRotateAxis44_SSE,		/* -1446 clks */
57 	M_MatrixOrbitAxis44_FPU,
58 	M_MatrixRotateEul44_FPU,
59 	M_MatrixRotate44I_SSE,			/* -1204 clks */
60 	M_MatrixRotate44J_SSE,			/* -1204 clks */
61 	M_MatrixRotate44K_SSE,			/* -1204 clks */
62 	M_MatrixTranslatev44_SSE,		/* -549 clks */
63 	M_MatrixTranslate44_SSE,
64 	M_MatrixTranslateX44_SSE,		/* -608 clks */
65 	M_MatrixTranslateY44_SSE,		/* -608 clks */
66 	M_MatrixTranslateZ44_SSE,		/* -608 clks */
67 	M_MatrixScale44_SSE,			/* -522 clks */
68 	M_MatrixUniScale44_SSE,			/* -595 clks */
69 };
70 
71 M_Matrix44
M_MatrixInvert44_SSE(M_Matrix44 A)72 M_MatrixInvert44_SSE(M_Matrix44 A)
73 {
74 	M_Matrix44 Ainv;
75 	float *src = &A.m[0][0];
76 	float *dst = &Ainv.m[0][0];
77 	__m128 minor0, minor1, minor2, minor3;
78 	__m128 row0, row1, row2, row3;
79 	__m128 det, tmp1;
80 
81 	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)),
82 	                                          (__m64 *)(src+4));
83 	row1	= _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)),
84 	                                          (__m64 *)(src+12));
85 	row0	= _mm_shuffle_ps(tmp1, row1, 0x88);
86 	row1	= _mm_shuffle_ps(row1, tmp1, 0xDD);
87 	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)),
88 	                                          (__m64 *)(src+6));
89 	row3	= _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)),
90 	                                          (__m64 *)(src+14));
91 	row2	= _mm_shuffle_ps(tmp1, row3, 0x88);
92 	row3	= _mm_shuffle_ps(row3, tmp1, 0xDD);
93 
94 	tmp1	= _mm_mul_ps(row2, row3);
95 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
96 	minor0	= _mm_mul_ps(row1, tmp1);
97 	minor1	= _mm_mul_ps(row0, tmp1);
98 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
99 	minor0	= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
100 	minor1	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
101 	minor1	= _mm_shuffle_ps(minor1, minor1, 0x4E);
102 
103 	tmp1	= _mm_mul_ps(row1, row2);
104 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
105 	minor0	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
106 	minor3	= _mm_mul_ps(row0, tmp1);
107 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
108 	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
109 	minor3	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
110 	minor3	= _mm_shuffle_ps(minor3, minor3, 0x4E);
111 
112 	tmp1	= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
113 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
114 	row2	= _mm_shuffle_ps(row2, row2, 0x4E);
115 	minor0	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
116 	minor2	= _mm_mul_ps(row0, tmp1);
117 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
118 	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
119 	minor2	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
120 	minor2	= _mm_shuffle_ps(minor2, minor2, 0x4E);
121 
122 	tmp1	= _mm_mul_ps(row0, row1);
123 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
124 	minor2	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
125 	minor3	= _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
126 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
127 	minor2	= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
128 	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
129 
130 	tmp1	= _mm_mul_ps(row0, row3);
131 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
132 	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
133 	minor2	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
134 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
135 	minor1	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
136 	minor2	= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
137 
138 	tmp1	= _mm_mul_ps(row0, row2);
139 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
140 	minor1	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
141 	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
142 	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
143 	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
144 	minor3	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
145 
146 	det	= _mm_mul_ps(row0, minor0);
147 	det	= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
148 	det	= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
149 	tmp1	= _mm_rcp_ss(det);
150 	det	= _mm_sub_ss(_mm_add_ss(tmp1, tmp1),
151 	                     _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1)));
152 	det	= _mm_shuffle_ps(det, det, 0x00);
153 
154 	minor0	= _mm_mul_ps(det, minor0);
155 	_mm_storel_pi((__m64 *)(dst), minor0);
156 	_mm_storeh_pi((__m64 *)(dst+2), minor0);
157 
158 	minor1	= _mm_mul_ps(det, minor1);
159 	_mm_storel_pi((__m64 *)(dst+4), minor1);
160 	_mm_storeh_pi((__m64 *)(dst+6), minor1);
161 
162 	minor2	= _mm_mul_ps(det, minor2);
163 	_mm_storel_pi((__m64 *)(dst+8), minor2);
164 	_mm_storeh_pi((__m64 *)(dst+10), minor2);
165 
166 	minor3	= _mm_mul_ps(det, minor3);
167 	_mm_storel_pi((__m64 *)(dst+12), minor3);
168 	_mm_storeh_pi((__m64 *)(dst+14), minor3);
169 
170 	return (Ainv);
171 }
172 
173 void
M_MatrixRotateAxis44_SSE(M_Matrix44 * M,M_Real theta,M_Vector3 A)174 M_MatrixRotateAxis44_SSE(M_Matrix44 *M, M_Real theta, M_Vector3 A)
175 {
176 	float s = sinf((float)theta);
177 	float c = cosf((float)theta);
178 	float t = 1.0f - c;
179 	M_Matrix44 R;
180 	__m128 a = A.m128, r1;
181 #ifdef HAVE_SSE3
182 	__m128 rC1 = _mm_set_ps(-c,    s*A.z, 0.0f,  +c);	/* 1,3 1,2 3 2 */
183 	__m128 rC2 = _mm_set_ps(0.0f, -s*A.y, s*A.y, -s*A.x);	/* 1,2 3 1 2,3 */
184 #endif
185 
186 	/* m1: [t*AxAx + c,    t*AxAy + sAz,    t*AxAz - sAy,    0] */
187 	r1 = _mm_mul_ps(_mm_set1_ps(t), a);
188 	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,0,0,0)));
189 #ifdef HAVE_SSE3
190 	R.m1 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,1,2,3)));
191 #else
192 	R.m1 = _mm_add_ps(r1, _mm_set_ps(0.0f, -s*A.y, s*A.z, c));
193 #endif
194 
195 	/* m2: [t*AxAy - sAz,    t*AyAy + c,    t*AyAz + sAx,    0] */
196 	r1 = _mm_mul_ps(_mm_set1_ps(t), _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,1,1,0)));
197 	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,2,1,1)));
198 #ifdef HAVE_SSE3
199 	R.m2 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,0,0,2)));
200 #else
201 	R.m2 = _mm_add_ps(r1, _mm_set_ps(0.0f, +s*A.x, c, -s*A.z));
202 #endif
203 
204 	/* m3: [t*AxAz + sAy,    t*AyAz - sAx,    t*AzAz + c,    0] */
205 	r1 = _mm_mul_ps(_mm_set1_ps(t), a);
206 	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,2,2)));
207 #ifdef HAVE_SSE3
208 	R.m3 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC2,rC1,_MM_SHUFFLE(1,3,0,2)));
209 #else
210 	R.m3 = _mm_add_ps(r1, _mm_set_ps(0.0f, c, -s*A.x, +s*A.y));
211 #endif
212 	/* m4: [0,    0,    0,    1] */
213 	R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
214 
215 	M_MatrixMult44v_SSE(M, &R);
216 }
217 
218 void
M_MatrixRotate44I_SSE(M_Matrix44 * M,M_Real theta)219 M_MatrixRotate44I_SSE(M_Matrix44 *M, M_Real theta)
220 {
221 	float s = sinf((float)theta);
222 	float c = cosf((float)theta);
223 	M_Matrix44 R;
224 
225 	R.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
226 	R.m2 = _mm_set_ps(0.0f, -s,   c,    0.0f);
227 	R.m3 = _mm_set_ps(0.0f, c,    s,    0.0f);
228 	R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
229 	M_MatrixMult44v_SSE(M, &R);
230 }
231 void
M_MatrixRotate44J_SSE(M_Matrix44 * M,M_Real theta)232 M_MatrixRotate44J_SSE(M_Matrix44 *M, M_Real theta)
233 {
234 	float s = sinf((float)theta);
235 	float c = cosf((float)theta);
236 	M_Matrix44 R;
237 
238 	R.m1 = _mm_set_ps(0.0f, s,    0.0f, c);
239 	R.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
240 	R.m3 = _mm_set_ps(0.0f, c,    0.0f, -s);
241 	R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
242 	M_MatrixMult44v_SSE(M, &R);
243 }
244 void
M_MatrixRotate44K_SSE(M_Matrix44 * M,M_Real theta)245 M_MatrixRotate44K_SSE(M_Matrix44 *M, M_Real theta)
246 {
247 	float s = sinf((float)theta);
248 	float c = cosf((float)theta);
249 	M_Matrix44 R;
250 
251 	R.m1 = _mm_set_ps(0.0f, 0.0f, -s,   c);
252 	R.m2 = _mm_set_ps(0.0f, 0.0f, c,    s);
253 	R.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
254 	R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
255 	M_MatrixMult44v_SSE(M, &R);
256 }
257 
258 void
M_MatrixTranslate44_SSE(M_Matrix44 * M,M_Real x,M_Real y,M_Real z)259 M_MatrixTranslate44_SSE(M_Matrix44 *M, M_Real x, M_Real y, M_Real z)
260 {
261 	M_Matrix44 T;
262 	float xf = (float)x;
263 	float yf = (float)y;
264 	float zf = (float)z;
265 
266 	T.m1 = _mm_set_ps(xf,   0.0f, 0.0f, 1.0f);
267 	T.m2 = _mm_set_ps(yf,   0.0f, 1.0f, 0.0f);
268 	T.m3 = _mm_set_ps(zf,   1.0f, 0.0f, 0.0f);
269 	T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
270 	M_MatrixMult44v_SSE(M, &T);
271 }
272 
273 void
M_MatrixTranslatev44_SSE(M_Matrix44 * M,M_Vector3 v)274 M_MatrixTranslatev44_SSE(M_Matrix44 *M, M_Vector3 v)
275 {
276 	M_Matrix44 T;
277 
278 	T.m1 = _mm_set_ps(v.x,  0.0f, 0.0f, 1.0f);
279 	T.m2 = _mm_set_ps(v.y,  0.0f, 1.0f, 0.0f);
280 	T.m3 = _mm_set_ps(v.z,  1.0f, 0.0f, 0.0f);
281 	T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
282 	M_MatrixMult44v_SSE(M, &T);
283 }
284 void
M_MatrixTranslateX44_SSE(M_Matrix44 * M,M_Real x)285 M_MatrixTranslateX44_SSE(M_Matrix44 *M, M_Real x)
286 {
287 	float xf = (float)x;
288 	M_Matrix44 T;
289 
290 	T.m1 = _mm_set_ps(xf,   0.0f, 0.0f, 1.0f);
291 	T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
292 	T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
293 	T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
294 	M_MatrixMult44v_SSE(M, &T);
295 }
296 void
M_MatrixTranslateY44_SSE(M_Matrix44 * M,M_Real y)297 M_MatrixTranslateY44_SSE(M_Matrix44 *M, M_Real y)
298 {
299 	float yf = (float)y;
300 	M_Matrix44 T;
301 
302 	T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
303 	T.m2 = _mm_set_ps(yf,   0.0f, 1.0f, 0.0f);
304 	T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
305 	T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
306 	M_MatrixMult44v_SSE(M, &T);
307 }
308 void
M_MatrixTranslateZ44_SSE(M_Matrix44 * M,M_Real z)309 M_MatrixTranslateZ44_SSE(M_Matrix44 *M, M_Real z)
310 {
311 	float zf = (float)z;
312 	M_Matrix44 T;
313 
314 	T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
315 	T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
316 	T.m3 = _mm_set_ps(zf,   1.0f, 0.0f, 0.0f);
317 	T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
318 	M_MatrixMult44v_SSE(M, &T);
319 }
320 
321 void
M_MatrixScale44_SSE(M_Matrix44 * M,M_Real x,M_Real y,M_Real z,M_Real w)322 M_MatrixScale44_SSE(M_Matrix44 *M, M_Real x, M_Real y, M_Real z, M_Real w)
323 {
324 	float xf = (float)x;
325 	float yf = (float)y;
326 	float zf = (float)z;
327 	float wf = (float)w;
328 	M_Matrix44 S;
329 
330 	S.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, xf);
331 	S.m2 = _mm_set_ps(0.0f, 0.0f, yf,   0.0f);
332 	S.m3 = _mm_set_ps(0.0f, zf,   0.0f, 0.0f);
333 	S.m4 = _mm_set_ps(wf,   0.0f, 0.0f, 0.0f);
334 	M_MatrixMult44v_SSE(M, &S);
335 }
336 void
M_MatrixUniScale44_SSE(M_Matrix44 * M,M_Real r)337 M_MatrixUniScale44_SSE(M_Matrix44 *M, M_Real r)
338 {
339 	M_Matrix44 S;
340 	float f = (float)r;
341 
342 	S.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, f);
343 	S.m2 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,3,0,3));
344 	S.m3 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,0,3,3));
345 	S.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
346 	M_MatrixMult44v_SSE(M, &S);
347 }
348 
349 #endif /* HAVE_SSE */
350