1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "sys/platform.h"
30 
31 #include "idlib/math/Simd_SSE3.h"
32 
33 //===============================================================
34 //
35 //	SSE3 implementation of idSIMDProcessor
36 //
37 //===============================================================
38 
39 #if defined(__GNUC__) && defined(__SSE3__)
40 
41 /*
42 ============
43 idSIMD_SSE3::GetName
44 ============
45 */
GetName(void) const46 const char * idSIMD_SSE3::GetName( void ) const {
47 	return "MMX & SSE & SSE2 & SSE3";
48 }
49 
50 #elif defined(_MSC_VER) && defined(_M_IX86)
51 
52 #include <xmmintrin.h>
53 
54 #include "idlib/geometry/JointTransform.h"
55 #include "idlib/geometry/DrawVert.h"
56 #include "idlib/math/Vector.h"
57 
// Immediate-byte builders for the shuffle instructions. Every selector is
// masked to its field width (2 bits for the PS forms, 1 bit for the PD forms)
// before being shifted into place. The R_ variants take the selectors in the
// reverse order: the first argument lands in the lowest field.
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	( ( ( (w) & 3 ) << 6 ) | ( ( (z) & 3 ) << 4 ) | ( ( (y) & 3 ) << 2 ) | ( (x) & 3 ) )
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			( ( ( (y) & 1 ) << 1 ) | ( (x) & 1 ) )
62 
63 /*
64 
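	The first argument of an instruction macro is the destination
	and the second argument is the source operand. The destination
	operand can be _xmm0 to _xmm7 only. The source operand can be
	any one of the registers _xmm0 to _xmm7, or one of _eax, _ecx, _edx,
	_ebx, _esi, or _edi holding the effective address. _esp and _ebp
	cannot be passed as a bare source operand: the macros emit a ModR/M
	byte with mod=00, where r/m=100 (_esp) announces a SIB byte and
	r/m=101 (_ebp) selects a 32-bit displacement with no base register.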
70 
71 	For instance:  haddps   xmm0, xmm1
72 	becomes:       haddps( _xmm0, _xmm1 )
73 	and:           haddps   xmm0, [esi]
74 	becomes:       haddps( _xmm0, _esi )
75 
76 	The ADDRESS_ADDC macro can be used when the effective source address
77 	is formed by adding a constant to a general purpose register.
78 	For instance:  haddps   xmm0, [esi+48]
79 	becomes:       haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
80 
81 	The ADDRESS_ADDR macro can be used when the effective source address
82 	is formed by adding two general purpose registers.
83 	For instance:  haddps   xmm0, [esi+eax]
84 	becomes:       haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
85 
86 	The ADDRESS_ADDRC macro can be used when the effective source address
87 	is formed by adding two general purpose registers and a constant.
88 	The constant must be in the range [-128, 127].
89 	For instance:  haddps   xmm0, [esi+eax+48]
90 	becomes:       haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
91 
92 	The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
93 	by adding a scaled general purpose register to another general purpose register.
94 	The scale must be either 1, 2, 4 or 8.
95 	For instance:  haddps   xmm0, [esi+eax*4]
96 	becomes:       haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
97 
98 	The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
99 	by adding a scaled general purpose register to another general purpose register and
100 	also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
101 	be in the range [-128, 127].
102 	For instance:  haddps   xmm0, [esi+eax*4+64]
103 	becomes:       haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
104 
105 */
106 
// General purpose register numbers as they appear in the r/m, base and
// index fields of the ModR/M and SIB bytes built by the macros that follow.
#define _eax	0
#define _ecx	1
#define _edx	2
#define _ebx	3
#define _esp	4
#define _ebp	5
#define _esi	6
#define _edi	7

// XMM register operands: the register number in the low three bits with both
// top bits (0xC0) set, so that using one as a source operand yields a
// register-direct ModR/M byte.
#define _xmm0	( 0xC0 | 0 )
#define _xmm1	( 0xC0 | 1 )
#define _xmm2	( 0xC0 | 2 )
#define _xmm3	( 0xC0 | 3 )
#define _xmm4	( 0xC0 | 4 )
#define _xmm5	( 0xC0 | 5 )
#define _xmm6	( 0xC0 | 6 )
#define _xmm7	( 0xC0 | 7 )
124 
// Maps an index scale of 1, 2, 4 or 8 to the two scale bits (bits 7..6) of a
// SIB byte: 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The parameter and the whole expansion are parenthesized so the macro can be
// combined with any operator and accepts any constant expression as argument.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )

// The ADDRESS_* macros are only meaningful as the source argument of one of
// the instruction macros: the first token sequence is OR-ed into the ModR/M
// byte the instruction macro emits, and the trailing _emit statements append
// the SIB and/or 8-bit displacement bytes. For that reason the expansions are
// deliberately NOT wrapped in parentheses.
//
// Encoding limits (x86 ModR/M / SIB rules):
//   - ADDRESS_ADDC emits no SIB byte, so reg0 cannot be _esp.
//   - In the SIB forms reg1 (the index) cannot be _esp.
//   - ADDRESS_ADDR / ADDRESS_SCALEADDR use mod=00, so reg0 (the base) cannot
//     be _ebp; use the constant-carrying variants with a constant of 0 instead.

// [reg0 + constant], constant is emitted as a single displacement byte
#define ADDRESS_ADDC( reg0, constant )						0x40 | ( (reg0) & 7 )	\
	_asm _emit constant

// [reg0 + reg1]
#define ADDRESS_ADDR( reg0, reg1 )							0x04				\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 )

// [reg0 + reg1 + constant], constant is emitted as a single displacement byte
#define ADDRESS_ADDRC( reg0, reg1, constant )				0x44				\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 )							\
	_asm _emit constant

// [reg0 + reg1*scale], scale is 1, 2, 4 or 8
#define ADDRESS_SCALEADDR( reg0, reg1, scale )				0x04				\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale )

// [reg0 + reg1*scale + constant], scale is 1, 2, 4 or 8 and constant is
// emitted as a single displacement byte
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44				\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale )		\
	_asm _emit constant
143 
144 
// SSE3 instruction macros.
//
// Each macro emits the mandatory prefix byte, the 0x0F escape byte, the opcode
// byte and finally the ModR/M byte. The ModR/M reg field is taken from the low
// three bits of dst; src supplies the mod and r/m fields.
//
// dst is parenthesized so any constant expression may be passed. src is
// intentionally left bare: the ADDRESS_* macros expand to a ModR/M fragment
// followed by additional _emit statements, which would no longer work if the
// argument were wrapped in parentheses.

// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
#define addsubps( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
#define addsubpd( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
#define haddps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define haddpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define hsubps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define hsubpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
#define movsldup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// This byte sequence (F2 0F 12) is the one Intel documents as MOVDDUP.
#define movdldup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define movshdup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
// NOTE(review): the SSE3 instruction set appears to define only MOVDDUP,
// MOVSLDUP and MOVSHDUP; F2 0F 16 does not seem to be a documented encoding.
// Verify against the Intel SDM before using this macro.
#define movdhdup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src

// Load Unaligned Integer 128 bits
#define lddqu( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xF0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
221 
222 
// Byte sizes and member offsets hard-coded for use inside the inline assembly
// below. TransformVerts asserts the ones it relies on against the real types.
#define DRAWVERT_SIZE				(15*4)
#define DRAWVERT_XYZ_OFFSET			0
#define DRAWVERT_ST_OFFSET			12
#define DRAWVERT_NORMAL_OFFSET		20
#define DRAWVERT_TANGENT0_OFFSET	32
#define DRAWVERT_TANGENT1_OFFSET	44
#define DRAWVERT_COLOR_OFFSET		56

#define JOINTQUAT_SIZE				28
#define JOINTMAT_SIZE				48
#define JOINTWEIGHT_SIZE			16
234 
235 
236 /*
237 ============
238 SSE3_Dot
239 ============
240 */
SSE3_Dot(const idVec4 & v1,const idVec4 & v2)241 float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
242 	float d;
243 	__asm {
244 		mov		esi, v1
245 		mov		edi, v2
246 		movaps	xmm0, [esi]
247 		mulps	xmm0, [edi]
248 		haddps(	_xmm0, _xmm0 )
249 		haddps(	_xmm0, _xmm0 )
250 		movss	d, xmm0
251 	}
252 	return d;
253 }
254 
255 /*
256 ============
257 idSIMD_SSE3::GetName
258 ============
259 */
GetName(void) const260 const char * idSIMD_SSE3::GetName( void ) const {
261 	return "MMX & SSE & SSE2 & SSE3";
262 }
263 
264 /*
265 ============
266 idSIMD_SSE3::TransformVerts
267 ============
268 */
TransformVerts(idDrawVert * verts,const int numVerts,const idJointMat * joints,const idVec4 * weights,const int * index,const int numWeights)269 void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
270 #if 1
271 
272 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
273 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
274 	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
275 	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
276 
277 	__asm
278 	{
279 		mov			eax, numVerts
280 		test		eax, eax
281 		jz			done
282 		imul		eax, DRAWVERT_SIZE
283 
284 		mov			ecx, verts
285 		mov			edx, index
286 		mov			esi, weights
287 		mov			edi, joints
288 
289 		add			ecx, eax
290 		neg			eax
291 
292 	loopVert:
293 		mov			ebx, [edx]
294 		movaps		xmm2, [esi]
295 		add			edx, 8
296 		movaps		xmm0, xmm2
297 		add			esi, JOINTWEIGHT_SIZE
298 		movaps		xmm1, xmm2
299 
300 		mulps		xmm0, [edi+ebx+ 0]						// xmm0 = m0, m1, m2, t0
301 		mulps		xmm1, [edi+ebx+16]						// xmm1 = m3, m4, m5, t1
302 		mulps		xmm2, [edi+ebx+32]						// xmm2 = m6, m7, m8, t2
303 
304 		cmp			dword ptr [edx-4], 0
305 
306 		jne			doneWeight
307 
308 	loopWeight:
309 		mov			ebx, [edx]
310 		movaps		xmm5, [esi]
311 		add			edx, 8
312 		movaps		xmm3, xmm5
313 		add			esi, JOINTWEIGHT_SIZE
314 		movaps		xmm4, xmm5
315 
316 		mulps		xmm3, [edi+ebx+ 0]						// xmm3 = m0, m1, m2, t0
317 		mulps		xmm4, [edi+ebx+16]						// xmm4 = m3, m4, m5, t1
318 		mulps		xmm5, [edi+ebx+32]						// xmm5 = m6, m7, m8, t2
319 
320 		cmp			dword ptr [edx-4], 0
321 
322 		addps		xmm0, xmm3
323 		addps		xmm1, xmm4
324 		addps		xmm2, xmm5
325 
326 		je			loopWeight
327 
328 	doneWeight:
329 		add			eax, DRAWVERT_SIZE
330 
331 		haddps(		_xmm0, _xmm1 )
332 		haddps(		_xmm2, _xmm0 )
333 
334 		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm2
335 
336 		haddps(		_xmm2, _xmm2 )
337 
338 		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm2
339 
340 		jl			loopVert
341 	done:
342 	}
343 
344 #else
345 
346 	int i, j;
347 	const byte *jointsPtr = (byte *)joints;
348 
349 	for( j = i = 0; i < numVerts; i++ ) {
350 		idVec3 v;
351 
352 		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
353 		while( index[j*2+1] == 0 ) {
354 			j++;
355 			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
356 		}
357 		j++;
358 
359 		verts[i].xyz = v;
360 	}
361 
362 #endif
363 }
364 
365 #endif /* _MSC_VER */
366