/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
28
29 #include "sys/platform.h"
30
31 #include "idlib/math/Simd_SSE3.h"
32
33 //===============================================================
34 //
35 // SSE3 implementation of idSIMDProcessor
36 //
37 //===============================================================
38
39 #if defined(__GNUC__) && defined(__SSE3__)
40
41 /*
42 ============
43 idSIMD_SSE3::GetName
44 ============
45 */
GetName(void) const46 const char * idSIMD_SSE3::GetName( void ) const {
47 return "MMX & SSE & SSE2 & SSE3";
48 }
49
50 #elif defined(_MSC_VER) && defined(_M_IX86)
51
52 #include <xmmintrin.h>
53
54 #include "idlib/geometry/JointTransform.h"
55 #include "idlib/geometry/DrawVert.h"
56 #include "idlib/math/Vector.h"
57
// Builders for 8-bit shuffle selectors. Every argument is masked to its
// field width ( 2 bits for the PS forms, 1 bit for the PD forms ).
//   SHUFFLEPS   : x -> bits 7:6, y -> bits 5:4, z -> bits 3:2, w -> bits 1:0
//   R_SHUFFLEPS : same fields with the argument order reversed ( x -> bits 1:0 )
//   SHUFFLEPD   : x -> bit 1, y -> bit 0
//   R_SHUFFLEPD : same fields with the argument order reversed ( x -> bit 0 )
// NOTE(review): presumably meant as the immediate of shufps / shufpd; they are
// not used in this part of the file -- confirm against callers.
#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLEPD( x, y )			(( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLEPD( x, y )			(( (y) & 1 ) << 1 | ( (x) & 1 ))
62
/*

The first argument of an instruction macro is the destination
and the second argument is the source operand. The destination
operand can be _xmm0 to _xmm7 only. The source operand can be
any one of the registers _xmm0 to _xmm7, or one of the general
purpose registers _eax, _ecx, _edx, _ebx, _esi, or _edi that
contains the effective address.

NOTE: a bare _esp or _ebp cannot be used as the source operand.
The macros emit a ModRM byte with mod = 00, where the register
code of esp selects a following SIB byte and the register code
of ebp selects a 32-bit displacement; neither is emitted. For
the same reason _esp cannot be the register of ADDRESS_ADDC.

For instance: haddps xmm0, xmm1
becomes: haddps( _xmm0, _xmm1 )
and: haddps xmm0, [esi]
becomes: haddps( _xmm0, _esi )

The ADDRESS_ADDC macro can be used when the effective source address
is formed by adding a constant to a general purpose register.
Only a single displacement byte is emitted, so the constant must
be in the range [-128, 127].
For instance: haddps xmm0, [esi+48]
becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )

The ADDRESS_ADDR macro can be used when the effective source address
is formed by adding two general purpose registers.
For instance: haddps xmm0, [esi+eax]
becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )

The ADDRESS_ADDRC macro can be used when the effective source address
is formed by adding two general purpose registers and a constant.
The constant must be in the range [-128, 127].
For instance: haddps xmm0, [esi+eax+48]
becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )

The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register.
The scale must be either 1, 2, 4 or 8.
For instance: haddps xmm0, [esi+eax*4]
becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )

The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register and
also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
be in the range [-128, 127].
For instance: haddps xmm0, [esi+eax*4+64]
becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )

*/
106
// 3-bit register codes of the general purpose registers. Passed as the source
// of an instruction macro they form a ModRM byte with mod = 00, i.e. the
// register holds the address of the memory operand.
#define _eax	0x00
#define _ecx	0x01
#define _edx	0x02
#define _ebx	0x03
#define _esp	0x04
#define _ebp	0x05
#define _esi	0x06
#define _edi	0x07

// XMM registers. The 0xC0 bits select mod = 11 ( register-direct ) when the
// value is used as the source; the instruction macros keep only the low
// 3 bits when the value is used as the destination.
#define _xmm0	0xC0
#define _xmm1	0xC1
#define _xmm2	0xC2
#define _xmm3	0xC3
#define _xmm4	0xC4
#define _xmm5	0xC5
#define _xmm6	0xC6
#define _xmm7	0xC7
124
// Maps an index scale of 1, 2, 4 or 8 to the scale field ( bits 7:6 ) of a
// SIB byte: 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The expansion is a bare chain of '|' terms without outer parentheses, so it
// must only be combined with other '|' terms.
#define RSCALE( s )		( (s&2)<<5 ) | ( (s&4)<<5 ) | ( (s&8)<<3 ) | ( (s&8)<<4 )

// The ADDRESS_* macros are only meaningful as the 'src' argument of an
// instruction macro: the leading value is OR-ed into the ModRM byte and every
// following "_asm _emit" appends one more byte to the instruction. They are
// deliberately not parenthesized because they expand to several statements.

// [reg0 + constant] : mod = 01, followed by one displacement byte
#define ADDRESS_ADDC( reg0, constant )		0x40 | ( reg0 & 7 )	\
	_asm _emit constant

// [reg0 + reg1] : mod = 00 with r/m = 100, followed by a SIB byte ( base = reg0, index = reg1 )
#define ADDRESS_ADDR( reg0, reg1 )			0x04	\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )

// [reg0 + reg1 + constant] : mod = 01 with r/m = 100, SIB byte, one displacement byte
#define ADDRESS_ADDRC( reg0, reg1, constant )	0x44	\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )	\
	_asm _emit constant

// [reg0 + reg1*scale] : as ADDRESS_ADDR with the scale bits added to the SIB byte
#define ADDRESS_SCALEADDR( reg0, reg1, scale )	0x04	\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )

// [reg0 + reg1*scale + constant] : as ADDRESS_ADDRC with the scale bits added to the SIB byte
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44	\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )	\
	_asm _emit constant
143
144
// SSE3 arithmetic instructions assembled by hand with _emit. Each macro emits
// a prefix byte, the 0x0F escape byte, the opcode byte and a ModRM byte built
// as ( dst & 7 ) << 3 | src. 'src' is left unparenthesized on purpose: an
// ADDRESS_* macro passed as 'src' appends further "_asm _emit" statements.

// Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
#define addsubps( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0xD0	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
#define addsubpd( dst, src )	\
	_asm _emit 0x66	\
	_asm _emit 0x0F	\
	_asm _emit 0xD0	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
#define haddps( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0x7C	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define haddpd( dst, src )	\
	_asm _emit 0x66	\
	_asm _emit 0x0F	\
	_asm _emit 0x7C	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define hsubps( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0x7D	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define hsubpd( dst, src )	\
	_asm _emit 0x66	\
	_asm _emit 0x0F	\
	_asm _emit 0x7D	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src
186
// SSE3 move / load instructions assembled by hand with _emit. Each macro emits
// a prefix byte, the 0x0F escape byte, the opcode byte and a ModRM byte built
// as ( dst & 7 ) << 3 | src. 'src' is left unparenthesized on purpose: an
// ADDRESS_* macro passed as 'src' appends further "_asm _emit" statements.

// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
#define movsldup( dst, src )	\
	_asm _emit 0xF3	\
	_asm _emit 0x0F	\
	_asm _emit 0x12	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// ( F2 0F 12 is the encoding Intel documents under the mnemonic MOVDDUP )
#define movdldup( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0x12	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define movshdup( dst, src )	\
	_asm _emit 0xF3	\
	_asm _emit 0x0F	\
	_asm _emit 0x16	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
// NOTE(review): the SSE3 instruction set appears to have no double-precision
// "high and duplicate" instruction ( MOVDDUP is the only double-precision
// duplicate ) -- verify the F2 0F 16 encoding before relying on this macro.
#define movdhdup( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0x16	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Load Unaligned Integer 128 bits
#define lddqu( dst, src )	\
	_asm _emit 0xF2	\
	_asm _emit 0x0F	\
	_asm _emit 0xF0	\
	_asm _emit ( ( dst & 7 ) << 3 ) | src
221
222
// Structure sizes and member offsets in bytes, spelled out as plain constants
// so they can be used inside __asm blocks. TransformVerts asserts
// DRAWVERT_SIZE, DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE and JOINTMAT_SIZE
// against the real types; the remaining values are not checked in this file.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)
234
235
236 /*
237 ============
238 SSE3_Dot
239 ============
240 */
SSE3_Dot(const idVec4 & v1,const idVec4 & v2)241 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
242 float d;
243 __asm {
244 mov esi, v1
245 mov edi, v2
246 movaps xmm0, [esi]
247 mulps xmm0, [edi]
248 haddps( _xmm0, _xmm0 )
249 haddps( _xmm0, _xmm0 )
250 movss d, xmm0
251 }
252 return d;
253 }
254
255 /*
256 ============
257 idSIMD_SSE3::GetName
258 ============
259 */
GetName(void) const260 const char * idSIMD_SSE3::GetName( void ) const {
261 return "MMX & SSE & SSE2 & SSE3";
262 }
263
264 /*
265 ============
266 idSIMD_SSE3::TransformVerts
267 ============
268 */
TransformVerts(idDrawVert * verts,const int numVerts,const idJointMat * joints,const idVec4 * weights,const int * index,const int numWeights)269 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
270 #if 1
271
272 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
273 assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
274 assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
275 assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
276
277 __asm
278 {
279 mov eax, numVerts
280 test eax, eax
281 jz done
282 imul eax, DRAWVERT_SIZE
283
284 mov ecx, verts
285 mov edx, index
286 mov esi, weights
287 mov edi, joints
288
289 add ecx, eax
290 neg eax
291
292 loopVert:
293 mov ebx, [edx]
294 movaps xmm2, [esi]
295 add edx, 8
296 movaps xmm0, xmm2
297 add esi, JOINTWEIGHT_SIZE
298 movaps xmm1, xmm2
299
300 mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
301 mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
302 mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
303
304 cmp dword ptr [edx-4], 0
305
306 jne doneWeight
307
308 loopWeight:
309 mov ebx, [edx]
310 movaps xmm5, [esi]
311 add edx, 8
312 movaps xmm3, xmm5
313 add esi, JOINTWEIGHT_SIZE
314 movaps xmm4, xmm5
315
316 mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
317 mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
318 mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
319
320 cmp dword ptr [edx-4], 0
321
322 addps xmm0, xmm3
323 addps xmm1, xmm4
324 addps xmm2, xmm5
325
326 je loopWeight
327
328 doneWeight:
329 add eax, DRAWVERT_SIZE
330
331 haddps( _xmm0, _xmm1 )
332 haddps( _xmm2, _xmm0 )
333
334 movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
335
336 haddps( _xmm2, _xmm2 )
337
338 movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
339
340 jl loopVert
341 done:
342 }
343
344 #else
345
346 int i, j;
347 const byte *jointsPtr = (byte *)joints;
348
349 for( j = i = 0; i < numVerts; i++ ) {
350 idVec3 v;
351
352 v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
353 while( index[j*2+1] == 0 ) {
354 j++;
355 v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
356 }
357 j++;
358
359 verts[i].xyz = v;
360 }
361
362 #endif
363 }
364
365 #endif /* _MSC_VER */
366