1 /*
2  * Copyright (c) 2007-2012 Hypertriton, Inc. <http://hypertriton.com/>
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
17  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
22  * USE OF THIS SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  */
24 
25 /*
26  * Operations on 4x4 matrices using Streaming SIMD Extensions.
27  */
28 
29 #ifdef HAVE_SSE
30 
31 __BEGIN_DECLS
32 
33 static __inline__ M_Matrix44
M_MatrixZero44_SSE(void)34 M_MatrixZero44_SSE(void)
35 {
36 	M_Matrix44 out;
37 
38 	out.m1 = _mm_setzero_ps();
39 	out.m2 = _mm_setzero_ps();
40 	out.m3 = _mm_setzero_ps();
41 	out.m4 = _mm_setzero_ps();
42 	return (out);
43 }
44 static __inline__ void
M_MatrixZero44v_SSE(M_Matrix44 * M)45 M_MatrixZero44v_SSE(M_Matrix44 *M)
46 {
47 	M->m1 = _mm_setzero_ps();
48 	M->m2 = _mm_setzero_ps();
49 	M->m3 = _mm_setzero_ps();
50 	M->m4 = _mm_setzero_ps();
51 }
52 
53 static __inline__ M_Matrix44
M_MatrixIdentity44_SSE(void)54 M_MatrixIdentity44_SSE(void)
55 {
56 	M_Matrix44 I;
57 
58 	I.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
59 	I.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
60 	I.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
61 	I.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
62 	return (I);
63 }
64 static __inline__ void
M_MatrixIdentity44v_SSE(M_Matrix44 * M)65 M_MatrixIdentity44v_SSE(M_Matrix44 *M)
66 {
67 	M->m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
68 	M->m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
69 	M->m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
70 	M->m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
71 }
72 
73 static __inline__ M_Matrix44
M_MatrixTranspose44_SSE(M_Matrix44 M)74 M_MatrixTranspose44_SSE(M_Matrix44 M)
75 {
76 	M_Matrix44 out;
77 	out.m1 = M.m1;
78 	out.m2 = M.m2;
79 	out.m3 = M.m3;
80 	out.m4 = M.m4;
81 	_MM_TRANSPOSE4_PS(out.m1, out.m2, out.m3, out.m4);
82 	return (out);
83 }
84 static __inline__ M_Matrix44
M_MatrixTranspose44p_SSE(const M_Matrix44 * M)85 M_MatrixTranspose44p_SSE(const M_Matrix44 *M)
86 {
87 	M_Matrix44 out;
88 	out.m1 = M->m1;
89 	out.m2 = M->m2;
90 	out.m3 = M->m3;
91 	out.m4 = M->m4;
92 	_MM_TRANSPOSE4_PS(out.m1, out.m2, out.m3, out.m4);
93 	return (out);
94 }
95 static __inline__ void
M_MatrixTranspose44v_SSE(M_Matrix44 * M)96 M_MatrixTranspose44v_SSE(M_Matrix44 *M)
97 {
98 	_MM_TRANSPOSE4_PS(M->m1, M->m2, M->m3, M->m4);
99 }
100 
101 static __inline__ M_Matrix44
M_MatrixMult44_SSE(M_Matrix44 A,M_Matrix44 B)102 M_MatrixMult44_SSE(M_Matrix44 A, M_Matrix44 B)
103 {
104 	__m128 r1;
105 	M_Matrix44 out;
106 
107 	r1 = A.m1;
108 	out.m1 = _mm_add_ps(
109 	    _mm_add_ps(_mm_add_ps(
110 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
111 		_mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
112 	    ),
113 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
114 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
115 	);
116 	r1 = A.m2;
117 	out.m2 = _mm_add_ps(
118 	    _mm_add_ps(_mm_add_ps(
119 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
120 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
121 	    ),
122 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
123 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
124 	);
125 	r1 = A.m3;
126 	out.m3 = _mm_add_ps(
127 	    _mm_add_ps(_mm_add_ps(
128 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
129 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
130 	    ),
131 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
132 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
133 	);
134 	r1 = A.m4;
135 	out.m4 = _mm_add_ps(
136 	    _mm_add_ps(_mm_add_ps(
137 	        _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
138 		_mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
139 	    ),
140 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
141 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
142 	);
143 	return (out);
144 }
145 static __inline__ void
M_MatrixMult44v_SSE(M_Matrix44 * A,const M_Matrix44 * B)146 M_MatrixMult44v_SSE(M_Matrix44 *A, const M_Matrix44 *B)
147 {
148 	__m128 r1;
149 	M_Matrix44 out;
150 
151 	r1 = A->m1;
152 	out.m1 = _mm_add_ps(
153 	    _mm_add_ps(_mm_add_ps(
154 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
155 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
156 	    ),
157 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
158 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
159 	);
160 	r1 = A->m2;
161 	out.m2 = _mm_add_ps(
162 	    _mm_add_ps(_mm_add_ps(
163 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
164 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
165 	    ),
166 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
167 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
168 	);
169 	r1 = A->m3;
170 	out.m3 = _mm_add_ps(
171 	    _mm_add_ps(_mm_add_ps(
172 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
173 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
174 	    ),
175 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
176 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
177 	);
178 	r1 = A->m4;
179 	out.m4 = _mm_add_ps(
180 	    _mm_add_ps(_mm_add_ps(
181 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
182 	     _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
183 	    ),
184 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
185 	    _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
186 	);
187 	A->m1 = out.m1;
188 	A->m2 = out.m2;
189 	A->m3 = out.m3;
190 	A->m4 = out.m4;
191 }
192 
193 static __inline__ M_Vector4
M_MatrixMultVector44_SSE(M_Matrix44 A,M_Vector4 b)194 M_MatrixMultVector44_SSE(M_Matrix44 A, M_Vector4 b)
195 {
196 #ifdef HAVE_SSE3
197 	__m128 x, r1, r2;
198 	M_Vector4 out;
199 
200 	x = b.m128;
201 	r1 = _mm_hadd_ps(_mm_mul_ps(A.m1,x), _mm_mul_ps(A.m2,x));
202 	r2 = _mm_hadd_ps(_mm_mul_ps(A.m3,x), _mm_mul_ps(A.m4,x));
203 	out.m128 = _mm_hadd_ps(r1, r2);
204 	return (out);
205 #else
206 	return M_MatrixMultVector44_FPU(A, b);
207 #endif
208 }
209 static __inline__ M_Vector4
M_MatrixMultVector44p_SSE(const M_Matrix44 * A,const M_Vector4 * b)210 M_MatrixMultVector44p_SSE(const M_Matrix44 *A, const M_Vector4 *b)
211 {
212 #ifdef HAVE_SSE3
213 	__m128 x, r1, r2;
214 	M_Vector4 out;
215 
216 	x = b->m128;
217 	r1 = _mm_hadd_ps(_mm_mul_ps(A->m1,x), _mm_mul_ps(A->m2,x));
218 	r2 = _mm_hadd_ps(_mm_mul_ps(A->m3,x), _mm_mul_ps(A->m4,x));
219 	out.m128 = _mm_hadd_ps(r1, r2);
220 	return (out);
221 #else
222 	return M_MatrixMultVector44p_FPU(A, b);
223 #endif
224 }
225 static __inline__ void
M_MatrixMultVector44v_SSE(M_Vector4 * b,const M_Matrix44 * A)226 M_MatrixMultVector44v_SSE(M_Vector4 *b, const M_Matrix44 *A)
227 {
228 #ifdef HAVE_SSE3
229 	__m128 x, r1, r2;
230 
231 	x = b->m128;
232 	r1 = _mm_hadd_ps(_mm_mul_ps(A->m1,x), _mm_mul_ps(A->m2,x));
233 	r2 = _mm_hadd_ps(_mm_mul_ps(A->m3,x), _mm_mul_ps(A->m4,x));
234 	b->m128 = _mm_hadd_ps(r1, r2);
235 #else
236 	M_MatrixMultVector44v_FPU(b, A);
237 #endif
238 }
239 
240 static __inline__ void
M_MatrixCopy44_SSE(M_Matrix44 * mDst,const M_Matrix44 * mSrc)241 M_MatrixCopy44_SSE(M_Matrix44 *mDst, const M_Matrix44 *mSrc)
242 {
243 	mDst->m1 = mSrc->m1;
244 	mDst->m2 = mSrc->m2;
245 	mDst->m3 = mSrc->m3;
246 	mDst->m4 = mSrc->m4;
247 }
248 __END_DECLS
249 
250 __BEGIN_DECLS
251 extern const M_MatrixOps44 mMatOps44_SSE;
252 
253 M_Matrix44 M_MatrixInvert44_SSE(const M_Matrix44);
254 void       M_MatrixRotateAxis44_SSE(M_Matrix44 *, M_Real, M_Vector3);
255 void       M_MatrixRotate44I_SSE(M_Matrix44 *, M_Real);
256 void       M_MatrixRotate44J_SSE(M_Matrix44 *, M_Real);
257 void       M_MatrixRotate44K_SSE(M_Matrix44 *, M_Real);
258 void       M_MatrixTranslatev44_SSE(M_Matrix44 *, M_Vector3);
259 void       M_MatrixTranslate44_SSE(M_Matrix44 *, M_Real, M_Real, M_Real);
260 void       M_MatrixTranslateX44_SSE(M_Matrix44 *, M_Real);
261 void       M_MatrixTranslateY44_SSE(M_Matrix44 *, M_Real);
262 void       M_MatrixTranslateZ44_SSE(M_Matrix44 *, M_Real);
263 void       M_MatrixScale44_SSE(M_Matrix44 *, M_Real, M_Real, M_Real, M_Real);
264 void       M_MatrixUniScale44_SSE(M_Matrix44 *, M_Real);
265 __END_DECLS
266 
267 #endif /* HAVE_SSE */
268