1 /*
2 * Copyright (c) 2007-2012 Hypertriton, Inc. <http://hypertriton.com/>
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
17 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
20 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
21 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
22 * USE OF THIS SOFTWARE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 */
24
25 /*
26 * Operations on 4x4 matrices using Streaming SIMD Extensions.
27 */
28
29 #ifdef HAVE_SSE
30
31 __BEGIN_DECLS
32
33 static __inline__ M_Matrix44
M_MatrixZero44_SSE(void)34 M_MatrixZero44_SSE(void)
35 {
36 M_Matrix44 out;
37
38 out.m1 = _mm_setzero_ps();
39 out.m2 = _mm_setzero_ps();
40 out.m3 = _mm_setzero_ps();
41 out.m4 = _mm_setzero_ps();
42 return (out);
43 }
44 static __inline__ void
M_MatrixZero44v_SSE(M_Matrix44 * M)45 M_MatrixZero44v_SSE(M_Matrix44 *M)
46 {
47 M->m1 = _mm_setzero_ps();
48 M->m2 = _mm_setzero_ps();
49 M->m3 = _mm_setzero_ps();
50 M->m4 = _mm_setzero_ps();
51 }
52
53 static __inline__ M_Matrix44
M_MatrixIdentity44_SSE(void)54 M_MatrixIdentity44_SSE(void)
55 {
56 M_Matrix44 I;
57
58 I.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
59 I.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
60 I.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
61 I.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
62 return (I);
63 }
64 static __inline__ void
M_MatrixIdentity44v_SSE(M_Matrix44 * M)65 M_MatrixIdentity44v_SSE(M_Matrix44 *M)
66 {
67 M->m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
68 M->m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
69 M->m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
70 M->m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
71 }
72
73 static __inline__ M_Matrix44
M_MatrixTranspose44_SSE(M_Matrix44 M)74 M_MatrixTranspose44_SSE(M_Matrix44 M)
75 {
76 M_Matrix44 out;
77 out.m1 = M.m1;
78 out.m2 = M.m2;
79 out.m3 = M.m3;
80 out.m4 = M.m4;
81 _MM_TRANSPOSE4_PS(out.m1, out.m2, out.m3, out.m4);
82 return (out);
83 }
84 static __inline__ M_Matrix44
M_MatrixTranspose44p_SSE(const M_Matrix44 * M)85 M_MatrixTranspose44p_SSE(const M_Matrix44 *M)
86 {
87 M_Matrix44 out;
88 out.m1 = M->m1;
89 out.m2 = M->m2;
90 out.m3 = M->m3;
91 out.m4 = M->m4;
92 _MM_TRANSPOSE4_PS(out.m1, out.m2, out.m3, out.m4);
93 return (out);
94 }
95 static __inline__ void
M_MatrixTranspose44v_SSE(M_Matrix44 * M)96 M_MatrixTranspose44v_SSE(M_Matrix44 *M)
97 {
98 _MM_TRANSPOSE4_PS(M->m1, M->m2, M->m3, M->m4);
99 }
100
101 static __inline__ M_Matrix44
M_MatrixMult44_SSE(M_Matrix44 A,M_Matrix44 B)102 M_MatrixMult44_SSE(M_Matrix44 A, M_Matrix44 B)
103 {
104 __m128 r1;
105 M_Matrix44 out;
106
107 r1 = A.m1;
108 out.m1 = _mm_add_ps(
109 _mm_add_ps(_mm_add_ps(
110 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
111 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
112 ),
113 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
114 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
115 );
116 r1 = A.m2;
117 out.m2 = _mm_add_ps(
118 _mm_add_ps(_mm_add_ps(
119 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
120 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
121 ),
122 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
123 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
124 );
125 r1 = A.m3;
126 out.m3 = _mm_add_ps(
127 _mm_add_ps(_mm_add_ps(
128 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
129 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
130 ),
131 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
132 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
133 );
134 r1 = A.m4;
135 out.m4 = _mm_add_ps(
136 _mm_add_ps(_mm_add_ps(
137 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B.m1),
138 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B.m2)
139 ),
140 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B.m3)),
141 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B.m4)
142 );
143 return (out);
144 }
145 static __inline__ void
M_MatrixMult44v_SSE(M_Matrix44 * A,const M_Matrix44 * B)146 M_MatrixMult44v_SSE(M_Matrix44 *A, const M_Matrix44 *B)
147 {
148 __m128 r1;
149 M_Matrix44 out;
150
151 r1 = A->m1;
152 out.m1 = _mm_add_ps(
153 _mm_add_ps(_mm_add_ps(
154 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
155 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
156 ),
157 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
158 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
159 );
160 r1 = A->m2;
161 out.m2 = _mm_add_ps(
162 _mm_add_ps(_mm_add_ps(
163 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
164 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
165 ),
166 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
167 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
168 );
169 r1 = A->m3;
170 out.m3 = _mm_add_ps(
171 _mm_add_ps(_mm_add_ps(
172 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
173 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
174 ),
175 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
176 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
177 );
178 r1 = A->m4;
179 out.m4 = _mm_add_ps(
180 _mm_add_ps(_mm_add_ps(
181 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(0,0,0,0)),B->m1),
182 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(1,1,1,1)),B->m2)
183 ),
184 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,2,2,2)),B->m3)),
185 _mm_mul_ps(_mm_shuffle_ps(r1,r1,_MM_SHUFFLE(3,3,3,3)),B->m4)
186 );
187 A->m1 = out.m1;
188 A->m2 = out.m2;
189 A->m3 = out.m3;
190 A->m4 = out.m4;
191 }
192
193 static __inline__ M_Vector4
M_MatrixMultVector44_SSE(M_Matrix44 A,M_Vector4 b)194 M_MatrixMultVector44_SSE(M_Matrix44 A, M_Vector4 b)
195 {
196 #ifdef HAVE_SSE3
197 __m128 x, r1, r2;
198 M_Vector4 out;
199
200 x = b.m128;
201 r1 = _mm_hadd_ps(_mm_mul_ps(A.m1,x), _mm_mul_ps(A.m2,x));
202 r2 = _mm_hadd_ps(_mm_mul_ps(A.m3,x), _mm_mul_ps(A.m4,x));
203 out.m128 = _mm_hadd_ps(r1, r2);
204 return (out);
205 #else
206 return M_MatrixMultVector44_FPU(A, b);
207 #endif
208 }
209 static __inline__ M_Vector4
M_MatrixMultVector44p_SSE(const M_Matrix44 * A,const M_Vector4 * b)210 M_MatrixMultVector44p_SSE(const M_Matrix44 *A, const M_Vector4 *b)
211 {
212 #ifdef HAVE_SSE3
213 __m128 x, r1, r2;
214 M_Vector4 out;
215
216 x = b->m128;
217 r1 = _mm_hadd_ps(_mm_mul_ps(A->m1,x), _mm_mul_ps(A->m2,x));
218 r2 = _mm_hadd_ps(_mm_mul_ps(A->m3,x), _mm_mul_ps(A->m4,x));
219 out.m128 = _mm_hadd_ps(r1, r2);
220 return (out);
221 #else
222 return M_MatrixMultVector44p_FPU(A, b);
223 #endif
224 }
225 static __inline__ void
M_MatrixMultVector44v_SSE(M_Vector4 * b,const M_Matrix44 * A)226 M_MatrixMultVector44v_SSE(M_Vector4 *b, const M_Matrix44 *A)
227 {
228 #ifdef HAVE_SSE3
229 __m128 x, r1, r2;
230
231 x = b->m128;
232 r1 = _mm_hadd_ps(_mm_mul_ps(A->m1,x), _mm_mul_ps(A->m2,x));
233 r2 = _mm_hadd_ps(_mm_mul_ps(A->m3,x), _mm_mul_ps(A->m4,x));
234 b->m128 = _mm_hadd_ps(r1, r2);
235 #else
236 M_MatrixMultVector44v_FPU(b, A);
237 #endif
238 }
239
240 static __inline__ void
M_MatrixCopy44_SSE(M_Matrix44 * mDst,const M_Matrix44 * mSrc)241 M_MatrixCopy44_SSE(M_Matrix44 *mDst, const M_Matrix44 *mSrc)
242 {
243 mDst->m1 = mSrc->m1;
244 mDst->m2 = mSrc->m2;
245 mDst->m3 = mSrc->m3;
246 mDst->m4 = mSrc->m4;
247 }
248 __END_DECLS
249
250 __BEGIN_DECLS
251 extern const M_MatrixOps44 mMatOps44_SSE;
252
253 M_Matrix44 M_MatrixInvert44_SSE(const M_Matrix44);
254 void M_MatrixRotateAxis44_SSE(M_Matrix44 *, M_Real, M_Vector3);
255 void M_MatrixRotate44I_SSE(M_Matrix44 *, M_Real);
256 void M_MatrixRotate44J_SSE(M_Matrix44 *, M_Real);
257 void M_MatrixRotate44K_SSE(M_Matrix44 *, M_Real);
258 void M_MatrixTranslatev44_SSE(M_Matrix44 *, M_Vector3);
259 void M_MatrixTranslate44_SSE(M_Matrix44 *, M_Real, M_Real, M_Real);
260 void M_MatrixTranslateX44_SSE(M_Matrix44 *, M_Real);
261 void M_MatrixTranslateY44_SSE(M_Matrix44 *, M_Real);
262 void M_MatrixTranslateZ44_SSE(M_Matrix44 *, M_Real);
263 void M_MatrixScale44_SSE(M_Matrix44 *, M_Real, M_Real, M_Real, M_Real);
264 void M_MatrixUniScale44_SSE(M_Matrix44 *, M_Real);
265 __END_DECLS
266
267 #endif /* HAVE_SSE */
268