1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include "sys/platform.h"
30 #include "idlib/geometry/DrawVert.h"
31 
32 #include "idlib/math/Simd_SSE.h"
33 
34 //===============================================================
35 //                                                        M
36 //  SSE implementation of idSIMDProcessor                MrE
37 //                                                        E
38 //===============================================================
39 
// Byte layout of idDrawVert as assumed by the SIMD code below.
// DRAWVERT_SIZE must equal sizeof( idDrawVert ); the member offsets are given
// in bytes (written as float-index * 4).  The asserts inside the functions
// below verify this layout at runtime before the offsets are used.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)
47 
48 #if defined(__GNUC__) && defined(__SSE__)
49 
50 #include <xmmintrin.h>
51 
// Immediate builders for _mm_shuffle_ps.  SHUFFLEPS places x in the highest
// lane; R_SHUFFLEPS is the reversed form (x selects the lowest lane), which
// matches the lane order used in the original inline-assembly comments.
#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
54 
55 /*
56 ============
57 idSIMD_SSE::GetName
58 ============
59 */
GetName(void) const60 const char * idSIMD_SSE::GetName( void ) const {
61 	return "MMX & SSE";
62 }
63 
64 /*
65 ============
66 idSIMD_SSE::Dot
67 
68   dst[i] = constant.Normal() * src[i].xyz + constant[3];
69 ============
70 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].xyz + constant[3]
	//
	// C-intrinsics port of the original MSVC inline assembly; the asm is kept
	// in the /* ... */ blocks for reference.  The main loop gathers the xyz
	// members of four verts into structure-of-arrays registers (all x values
	// in one register, all y in another, all z in a third).
	// Float indices of four consecutive verts' xyz members:
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	/*
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
	*/
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;	// Declare 8 xmm registers.
	int count_l4 = count;                                   // count_l4 = eax
	int count_l1 = count;                                   // count_l1 = edx
	char *constant_p = (char *)&constant;                   // constant_p = edi
	char *src_p = (char *) src;                             // src_p = esi
	char *dst_p = (char *) dst;                             // dst_p = ecx

	// Verify the DRAWVERT_* byte layout this routine hard-codes.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );

	/*
		and			eax, ~3
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
	*/
	// Broadcast the plane across all lanes: xmm4/xmm5/xmm6 = the three
	// normal components, xmm7 = constant[3].
	count_l4 = count_l4 & ~3;
	xmm4 = _mm_load_ss((float *) (constant_p));
	xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm5 = _mm_load_ss((float *) (constant_p + 4));
	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm6 = _mm_load_ss((float *) (constant_p + 8));
	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm7 = _mm_load_ss((float *) (constant_p + 12));
	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

	/*
		jz			startVert1
	*/
	// Main loop: four verts per iteration.  count_l4 runs as a negative byte
	// offset up to zero, mirroring the original asm's negated index register.
	if(count_l4 != 0) {
	/*
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax
	*/
		count_l4 = count_l4 * DRAWVERT_SIZE;
		src_p = src_p + count_l4;
		count_l4 = -count_l4;
	/*
	loopVert4:
	*/
		do {
	/*
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1
	*/
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));        // 3,  X,  X,  X
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));        // 2,  X,  X,  X
			xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3,  X,  0,  1
			xmm1 = xmm0;							                                                    // 3,  X,  0,  1

	/*
		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5
	*/
			xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4,  5,  0,  1
			xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ));                               // 2,  X,  4,  5

	/*
		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9
	*/
			xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));        // 9,  X,  X,  X
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9,  X,  6,  7
			xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ));                               // 0,  3,  6,  9
	/*
		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10
	*/
			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));  // 10, 11, 6,  7
			xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ));                               // 1,  4,  7,  10
	/*
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11
	*/
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));  // 10, 11, 8,  X
			xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ));                               // 2,  5,  8,  11

	/*
		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE
	*/
			dst_p = dst_p + 16;
			count_l4 = count_l4 + 4*DRAWVERT_SIZE;

	/*
		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7
		addps		xmm0, xmm1
		addps		xmm0, xmm2
	*/
			// Four dot products at once: x*nx + y*ny + z*nz + d.
			xmm0 = _mm_mul_ps(xmm0, xmm4);
			xmm1 = _mm_mul_ps(xmm1, xmm5);
			xmm2 = _mm_mul_ps(xmm2, xmm6);
			xmm0 = _mm_add_ps(xmm0, xmm7);
			xmm0 = _mm_add_ps(xmm0, xmm1);
			xmm0 = _mm_add_ps(xmm0, xmm2);

	/*
		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4
	*/
			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
		} while(count_l4 < 0);
	}

	/*
	startVert1:
		and			edx, 3
		jz			done
	*/
	// Scalar tail: handle the 0-3 remaining verts.  count_l4 is now zero (or
	// the original count & ~3 was zero), so src_p+count_l4 addresses the
	// first unprocessed vert.
	count_l1 = count_l1 & 3;
	if(count_l1 != 0) {
	/*
		loopVert1:
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1
	*/
		do {
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
			xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
			xmm0 = _mm_mul_ss(xmm0, xmm4);
			xmm1 = _mm_mul_ss(xmm1, xmm5);
			xmm2 = _mm_mul_ss(xmm2, xmm6);
			xmm0 = _mm_add_ss(xmm0, xmm7);
			dst_p = dst_p + 4;
			xmm0 = _mm_add_ss(xmm0, xmm1);
			count_l4 = count_l4 + DRAWVERT_SIZE;
			xmm0 = _mm_add_ss(xmm0, xmm2);
			count_l1 = count_l1 - 1;
			_mm_store_ss((float *) (dst_p-4), xmm0);
		} while( count_l1 != 0);
	}
	/*
		done:
	*/
}
246 
247 /*
248 ============
249 idSIMD_SSE::MinMax
250 ============
251 */
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
	// Computes the axis-aligned bounds (min/max of xyz) over the vertices
	// selected by 'indexes'.  C-intrinsics port of the original MSVC inline
	// assembly (kept in the /* ... */ blocks).  Two running min/max register
	// pairs with different lane layouts are maintained — xmm0/xmm1 with lanes
	// (z, X, x, y) and xmm2/xmm3 with lanes (x, X, y, z) — and merged after
	// the loops by shuffling xmm2/xmm3 into the xmm0/xmm1 layout.

	// Verify the DRAWVERT_* byte layout this routine hard-codes.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );

	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
	char *indexes_p;
	char *src_p;
	int count_l;
	int edx;
	char *min_p;
	char *max_p;

	/*
		movss		xmm0, idMath::INFINITY
		xorps		xmm1, xmm1
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		subps		xmm1, xmm0
		movaps		xmm2, xmm0
		movaps		xmm3, xmm1
	*/
		// Seed the accumulators: xmm0/xmm2 = +INF (running mins),
		// xmm1/xmm3 = -INF (running maxs).
		xmm0 = _mm_load_ss(&idMath::INFINITY);
		// To satisfy the compiler use xmm0 instead.
		// (xor of a register with itself is zero either way; using xmm0
		// avoids reading xmm1 before it has been written.)
		xmm1 = _mm_xor_ps(xmm0, xmm0);
		xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
		xmm1 = _mm_sub_ps(xmm1, xmm0);
		xmm2 = xmm0;
		xmm3 = xmm1;

	/*
		mov			edi, indexes
		mov			esi, src
		mov			eax, count
		and			eax, ~3
		jz			done4
	*/
		indexes_p = (char *) indexes;
		src_p = (char *) src;
		count_l = count;
		count_l = count_l & ~3;
		if(count_l != 0) {
	/*
		shl			eax, 2
		add			edi, eax
		neg			eax
	*/
			// count_l becomes a negative byte offset into the index array,
			// running up to zero (mirrors the asm's negated index register).
			count_l = count_l << 2;
			indexes_p = indexes_p + count_l;
			count_l = -count_l;
	/*
	loop4:
//		prefetchnta	[edi+128]
//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
	*/
		do {
	/*
		mov			edx, [edi+eax+0]
		imul		edx, DRAWVERT_SIZE
		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm4
		maxps		xmm1, xmm4
	*/
			edx = *((int*)(indexes_p+count_l+0));
			edx = edx * DRAWVERT_SIZE;
			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm4);
			xmm1 = _mm_max_ps(xmm1, xmm4);

	/*
		mov			edx, [edi+eax+4]
		imul		edx, DRAWVERT_SIZE
		movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
		minps		xmm2, xmm5
		maxps		xmm3, xmm5
	*/
			edx = *((int*)(indexes_p+count_l+4));
			edx = edx * DRAWVERT_SIZE;
			xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
			xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
			xmm2 = _mm_min_ps(xmm2, xmm5);
			xmm3 = _mm_max_ps(xmm3, xmm5);

	/*
		mov			edx, [edi+eax+8]
		imul		edx, DRAWVERT_SIZE
		movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm6
		maxps		xmm1, xmm6
	*/
			edx = *((int*)(indexes_p+count_l+8));
			edx = edx * DRAWVERT_SIZE;
			xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm6);
			xmm1 = _mm_max_ps(xmm1, xmm6);

	/*
		mov			edx, [edi+eax+12]
		imul		edx, DRAWVERT_SIZE
		movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
		minps		xmm2, xmm7
		maxps		xmm3, xmm7
	*/
			edx = *((int*)(indexes_p+count_l+12));
			edx = edx * DRAWVERT_SIZE;
			xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
			xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
			xmm2 = _mm_min_ps(xmm2, xmm7);
			xmm3 = _mm_max_ps(xmm3, xmm7);

	/*
		add			eax, 4*4
		jl			loop4
	*/
			count_l = count_l + 4*4;
		} while (count_l < 0);
	}
	/*
	done4:
		mov			eax, count
		and			eax, 3
		jz			done1
	*/
	// Scalar tail: the 0-3 leftover indexes (indexes_p currently points just
	// past the 4-aligned portion, so the negated offset addresses them).
	count_l = count;
	count_l = count_l & 3;
	if(count_l != 0) {
	/*
		shl			eax, 2
		add			edi, eax
		neg			eax
	*/
		count_l = count_l << 2;
		indexes_p = indexes_p + count_l;
		count_l = -count_l;
	/*
	loop1:
	*/
		do{
	/*
		mov			edx, [edi+eax+0]
		imul		edx, DRAWVERT_SIZE;
		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm4
		maxps		xmm1, xmm4
	*/
			edx = *((int*)(indexes_p+count_l+0));
			edx = edx * DRAWVERT_SIZE;
			xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
			xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
			xmm0 = _mm_min_ps(xmm0, xmm4);
			xmm1 = _mm_max_ps(xmm1, xmm4);

	/*
		add			eax, 4
		jl			loop1
	*/
			count_l = count_l + 4;
		} while (count_l < 0);

	}

	/*
	done1:
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
		minps		xmm0, xmm2
		maxps		xmm1, xmm3
		mov			esi, min
		movhps		[esi], xmm0
		movss		[esi+8], xmm0
		mov			edi, max
		movhps		[edi], xmm1
		movss		[edi+8], xmm1
	*/
	// Reorder xmm2/xmm3 into the xmm0/xmm1 lane layout, fold them in, then
	// store: the high qword holds x,y and lane 0 holds z.
	xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
	xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
	xmm0 = _mm_min_ps(xmm0, xmm2);
	xmm1 = _mm_max_ps(xmm1, xmm3);
	min_p = (char *) &min;
	_mm_storeh_pi((__m64 *)(min_p), xmm0);
	_mm_store_ss((float *)(min_p+8), xmm0);
	max_p = (char *) &max;
	_mm_storeh_pi((__m64 *)(max_p), xmm1);
	_mm_store_ss((float *)(max_p+8), xmm1);
}
443 
444 /*
445 ============
446 idSIMD_SSE::Dot
447 
448   dst[i] = constant * src[i].Normal() + src[i][3];
449 ============
450 */
Dot(float * dst,const idVec3 & constant,const idPlane * src,const int count)451 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
452 	int count_l4;
453 	int count_l1;
454 	char *constant_p;
455 	char *src_p;
456 	char *dst_p;
457 	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
458 
459 	/*
460 		mov			eax, count
461 		mov			edi, constant
462 		mov			edx, eax
463 		mov			esi, src
464 		mov			ecx, dst
465 		and			eax, ~3
466 	*/
467 	count_l4 = count;
468 	constant_p = (char *) &constant;
469 	count_l1 = count_l4;
470 	src_p = (char *) src;
471 	dst_p = (char *) dst;
472 	count_l4 = count_l4 & ~3;
473 
474 	/*
475 		movss		xmm5, [edi+0]
476 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
477 		movss		xmm6, [edi+4]
478 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
479 		movss		xmm7, [edi+8]
480 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
481 	*/
482 	xmm5 = _mm_load_ss((float *) (constant_p+0));
483 	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
484 	xmm6 = _mm_load_ss((float *) (constant_p+4));
485 	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
486 	xmm7 = _mm_load_ss((float *) (constant_p+8));
487 	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
488 
489 	/*
490 		jz			startVert1
491 	*/
492 	if (count_l4 != 0) {
493 	/*
494 		imul		eax, 16
495 		add			esi, eax
496 		neg			eax
497 	*/
498 		count_l4 = count_l4 * 16;
499 		src_p = src_p + count_l4;
500 		count_l4 = -count_l4;
501 	/*
502 	loopVert4:
503 	*/
504 		do {
505 	/*
506 		movlps		xmm1, [esi+eax+ 0]
507 		movlps		xmm3, [esi+eax+ 8]
508 		movhps		xmm1, [esi+eax+16]
509 		movhps		xmm3, [esi+eax+24]
510 		movlps		xmm2, [esi+eax+32]
511 		movlps		xmm4, [esi+eax+40]
512 		movhps		xmm2, [esi+eax+48]
513 		movhps		xmm4, [esi+eax+56]
514 		movaps		xmm0, xmm1
515 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
516 		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
517 		movaps		xmm2, xmm3
518 		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
519 		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
520 	*/
521 			xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
522 			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
523 			xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
524 			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
525 			xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
526 			xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
527 			xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
528 			xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
529 
530 			xmm0 = xmm1;
531 			xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
532 			xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
533 			xmm2 = xmm3;
534 			xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
535 			xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
536 
537 	/*
538 		add			ecx, 16
539 		add			eax, 4*16
540 	*/
541 			dst_p = dst_p + 16;
542 			count_l4 = count_l4 + 4*16;
543 
544 	/*
545 		mulps		xmm0, xmm5
546 		mulps		xmm1, xmm6
547 		mulps		xmm2, xmm7
548 		addps		xmm0, xmm3
549 		addps		xmm0, xmm1
550 		addps		xmm0, xmm2
551 	*/
552 			xmm0 = _mm_mul_ps(xmm0, xmm5);
553 			xmm1 = _mm_mul_ps(xmm1, xmm6);
554 			xmm2 = _mm_mul_ps(xmm2, xmm7);
555 			xmm0 = _mm_add_ps(xmm0, xmm3);
556 			xmm0 = _mm_add_ps(xmm0, xmm1);
557 			xmm0 = _mm_add_ps(xmm0, xmm2);
558 
559 	/*
560 		movlps		[ecx-16+0], xmm0
561 		movhps		[ecx-16+8], xmm0
562 		jl			loopVert4
563 	*/
564 			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
565 			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
566 		} while (count_l4 < 0);
567 	}
568 
569 	/*
570 	startVert1:
571 		and			edx, 3
572 		jz			done
573 	*/
574 	count_l1 = count_l1 & 3;
575 
576 	if(count_l1 != 0) {
577 	/*
578 	loopVert1:
579 	*/
580 		do {
581 	/*
582 		movss		xmm0, [esi+eax+0]
583 		movss		xmm1, [esi+eax+4]
584 		movss		xmm2, [esi+eax+8]
585 		mulss		xmm0, xmm5
586 		mulss		xmm1, xmm6
587 		mulss		xmm2, xmm7
588 		addss		xmm0, [esi+eax+12]
589 		add			ecx, 4
590 		addss		xmm0, xmm1
591 		add			eax, 16
592 		addss		xmm0, xmm2
593 		dec			edx
594 		movss		[ecx-4], xmm0
595 		jnz			loopVert1
596 	*/
597 			xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
598 			xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
599 			xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
600 			xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
601 
602 			xmm0 = _mm_mul_ss(xmm0, xmm5);
603 			xmm1 = _mm_mul_ss(xmm1, xmm6);
604 			xmm2 = _mm_mul_ss(xmm2, xmm7);
605 
606 			xmm0 = _mm_add_ss(xmm0, xmm3);
607 			dst_p = dst_p + 4;
608 			xmm0 = _mm_add_ss(xmm0, xmm1);
609 			count_l4 = count_l4 + 16;
610 			xmm0 = _mm_add_ss(xmm0, xmm2);
611 			count_l1 = count_l1 - 1;
612 			_mm_store_ss((float *) (dst_p-4), xmm0);
613 		} while (count_l1 != 0);
614 	}
615 	/*
616 	done:
617 	*/
618 }
619 
620 #elif defined(_MSC_VER) && defined(_M_IX86)
621 
622 #include <xmmintrin.h>
623 
624 #include "idlib/geometry/JointTransform.h"
625 #include "idlib/math/Vector.h"
626 #include "idlib/math/Matrix.h"
627 #include "idlib/math/Quat.h"
628 #include "idlib/math/Plane.h"
629 #include "renderer/Model.h"
630 
631 #define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
632 #define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
633 
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
// The /* ... */ comments to the right track which float indices each
// register holds after the instruction.
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 )											\
	__asm	movaps		reg4, reg2								/* reg4 =  8,  9, 10, 11 */		\
	__asm	unpcklps	reg2, reg3								/* reg2 =  8, 12,  9, 13 */		\
	__asm	unpckhps	reg4, reg3								/* reg4 = 10, 14, 11, 15 */		\
	__asm	movaps		reg3, reg0								/* reg3 =  0,  1,  2,  3 */		\
	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  4,  1,  5 */		\
	__asm	unpckhps	reg3, reg1								/* reg3 =  2,  6,  3,  7 */		\
	__asm	movaps		reg1, reg0								/* reg1 =  0,  4,  1,  5 */		\
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg0 =  0,  4,  8, 12 */		\
	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg1 =  1,  5,  9, 13 */		\
	__asm	movaps		reg2, reg3								/* reg2 =  2,  6,  3,  7 */		\
	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg2 =  2,  6, 10, 14 */		\
	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg3 =  3,  7, 11, 15 */

// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
// NOTE: the name is missing an 'S' ("TRANPOSE") — a historical typo kept so
// existing call sites continue to compile.
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 )						\
	__asm	movlps		reg1, [address+ 0]						/* reg1 =  0,  1,  X,  X */		\
	__asm	movlps		reg3, [address+ 8]						/* reg3 =  2,  3,  X,  X */		\
	__asm	movhps		reg1, [address+16]						/* reg1 =  0,  1,  4,  5 */		\
	__asm	movhps		reg3, [address+24]						/* reg3 =  2,  3,  6,  7 */		\
	__asm	movlps		reg2, [address+32]						/* reg2 =  8,  9,  X,  X */		\
	__asm	movlps		reg4, [address+40]						/* reg4 = 10, 11,  X,  X */		\
	__asm	movhps		reg2, [address+48]						/* reg2 =  8,  9, 12, 13 */		\
	__asm	movhps		reg4, [address+56]						/* reg4 = 10, 11, 14, 15 */		\
	__asm	movaps		reg0, reg1								/* reg0 =  0,  1,  4,  5 */		\
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 )	/* reg0 =  0,  4,  8, 12 */		\
	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 )	/* reg1 =  1,  5,  9, 13 */		\
	__asm	movaps		reg2, reg3								/* reg2 =  2,  3,  6,  7 */		\
	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 )	/* reg2 =  2,  6, 10, 14 */		\
	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 )	/* reg3 =  3,  7, 11, 15 */

// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
// (same "TRANPOSE" typo as above, kept for existing callers)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 )							\
	__asm	movaps		reg4, reg0								/* reg4 =  0,  4,  8, 12 */		\
	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  1,  4,  5 */		\
	__asm	unpckhps	reg4, reg1								/* reg4 =  8,  9, 12, 13 */		\
	__asm	movaps		reg1, reg2								/* reg1 =  2,  6, 10, 14 */		\
	__asm	unpcklps	reg2, reg3								/* reg2 =  2,  3,  6,  7 */		\
	__asm	unpckhps	reg1, reg3								/* reg1 = 10, 11, 14, 15 */		\
	__asm	movlps		[address+ 0], reg0						/* mem0 =  0,  1,  X,  X */		\
	__asm	movlps		[address+ 8], reg2						/* mem0 =  0,  1,  2,  3 */		\
	__asm	movhps		[address+16], reg0						/* mem1 =  4,  5,  X,  X */		\
	__asm	movhps		[address+24], reg2						/* mem1 =  4,  5,  6,  7 */		\
	__asm	movlps		[address+32], reg4						/* mem2 =  8,  9,  X,  X */		\
	__asm	movlps		[address+40], reg1						/* mem2 =  8,  9, 10, 11 */		\
	__asm	movhps		[address+48], reg4						/* mem3 = 12, 13,  X,  X */		\
	__asm	movhps		[address+56], reg1						/* mem3 = 12, 13, 14, 15 */
682 
// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
// The /* ... */ comments to the right track which float indices each
// register holds after the instruction.
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 )													\
	__asm	movaps		reg3, reg2								/* reg3 =  8,  9, 10, 11 */		\
	__asm	shufps		reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 )	/* reg3 = 10, 11,  4,  5 */		\
	__asm	shufps		reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 )	/* reg2 =  8,  9,  2,  3 */		\
	__asm	shufps		reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 )	/* reg1 =  6,  7,  0,  1 */		\
	__asm	movaps		reg0, reg1								/* reg0 =  6,  7,  0,  1 */		\
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 )	/* reg0 =  0,  6,  3,  9 */		\
	__asm	shufps		reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 )	/* reg1 =  1,  7,  4, 10 */		\
	__asm	shufps		reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 )	/* reg2 =  2,  8,  5, 11 */

// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 )							\
	__asm	movlps		reg1, [address+ 0]						/* reg1 =  0,  1,  X,  X */		\
	__asm	movlps		reg2, [address+ 8]						/* reg2 =  2,  3,  X,  X */		\
	__asm	movlps		reg3, [address+16]						/* reg3 =  4,  5,  X,  X */		\
	__asm	movhps		reg1, [address+24]						/* reg1 =  0,  1,  6,  7 */		\
	__asm	movhps		reg2, [address+32]						/* reg2 =  2,  3,  8,  9 */		\
	__asm	movhps		reg3, [address+40]						/* reg3 =  4,  5, 10, 11 */		\
	__asm	movaps		reg0, reg1								/* reg0 =  0,  1,  6,  7 */		\
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 )	/* reg0 =  0,  6,  3,  9 */		\
	__asm	shufps		reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 )	/* reg1 =  1,  7,  4, 10 */		\
	__asm	shufps		reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 )	/* reg2 =  2,  8,  5, 11 */

// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 )								\
	__asm	movhlps		reg3, reg0								/* reg3 =  3,  9,  X,  X */		\
	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  1,  6,  7 */		\
	__asm	unpckhps	reg1, reg2								/* reg1 =  4,  5, 10, 11 */		\
	__asm	unpcklps	reg2, reg3								/* reg2 =  2,  3,  8,  9 */		\
	__asm	movlps		[address+ 0], reg0						/* mem0 =  0,  1,  X,  X */		\
	__asm	movlps		[address+ 8], reg2						/* mem0 =  0,  1,  2,  3 */		\
	__asm	movlps		[address+16], reg1						/* mem1 =  4,  5,  X,  X */		\
	__asm	movhps		[address+24], reg0						/* mem1 =  4,  5,  6,  7 */		\
	__asm	movhps		[address+32], reg2						/* mem2 =  8,  9,  X,  X */		\
	__asm	movhps		[address+40], reg1						/* mem2 =  8,  9, 10, 11 */
719 
720 
// with alignment
// Loop-prologue macros for the float kernels: they compute PRE (elements to
// process before the 8-wide unrolled body) and POST (trailing remainder) and
// load the working registers.  The S/D/DS forms are aliases of KFLOATINITDSS
// with the destination and/or sources reused.
#define KFLOATINITS(   SRC0, COUNT, PRE, POST )				KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD(   DST, COUNT, PRE, POST )				KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS(  DST, SRC0, COUNT, PRE, POST )		KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )

// Derives PRE from the low address bits of DST ((-(DST>>2)) & 3 floats to
// reach 16-byte alignment, clamped to COUNT), POST = remaining COUNT & 7,
// advances edx/esi/edi past the unrolled range and leaves the negated byte
// count in ebx.  Branches to a 'done' label that the using kernel must
// define; 'or eax,esi' leaves the combined source addresses in eax,
// presumably for an alignment test at the use site — confirm against callers.
#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
	__asm	mov		ecx,DST								\
	__asm	shr		ecx,2								\
	__asm	mov		ebx,COUNT							\
	__asm	neg		ecx									\
	__asm	mov		edx,SRC0							\
	__asm	and		ecx,3								\
	__asm	mov		esi,SRC1							\
	__asm	sub		ebx,ecx								\
	__asm	jge		noUnderFlow							\
	__asm	xor		ebx,ebx								\
	__asm	mov		ecx,COUNT							\
	__asm	noUnderFlow:								\
	__asm	mov		PRE,ecx								\
	__asm	mov		eax,ebx								\
	__asm	mov		edi,DST								\
	__asm	and		eax,8-1								\
	__asm	mov		POST,eax							\
	__asm	and		ebx,0xfffffff8						\
	__asm	jle		done								\
	__asm	shl		ebx,2								\
	__asm	lea		ecx,[ecx*4+ebx]						\
	__asm	neg		ebx									\
	__asm	add		edx,ecx								\
	__asm	add		esi,ecx								\
	__asm	add		edi,ecx								\
	__asm	mov		eax,edx								\
	__asm	or		eax,esi
754 
// without alignment (pre==0)
// Variants of the prologue above that skip the destination-alignment step:
// PRE is forced to 0 and only POST (COUNT & 7) plus the unrolled byte count
// in ebx are computed.  Like KFLOATINITDSS, they branch to a 'done' label
// defined by the using kernel.
#define KFLOATINITS_NA(   SRC0, COUNT, PRE, POST )				KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA(   DST, COUNT, PRE, POST )				KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA(  DST, SRC0, COUNT, PRE, POST )			KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
	__asm	mov		eax,COUNT							\
	__asm	mov		PRE,0								\
	__asm	and		eax,8-1								\
	__asm	mov		ebx,COUNT							\
	__asm	mov		POST,eax							\
	__asm	and		ebx,0xfffffff8						\
	__asm	je		done								\
	__asm	shl		ebx,2								\
	__asm	mov		edx,SRC0							\
	__asm	mov		esi,SRC1							\
	__asm	mov		edi,DST								\
	__asm	add		edx,ebx								\
	__asm	add		esi,ebx								\
	__asm	add		edi,ebx								\
	__asm	mov		eax,edx								\
	__asm	or		eax,esi								\
	__asm	or		eax,edi								\
	__asm	neg		ebx									\

778 
/*
	when OPER is called:
	edx = s0
	esi	= s1
	edi	= d
	ebx	= index*4

	xmm0 & xmm1	must not be trashed
*/
// Element kernels plugged into KFLOATOPER: the DS1 forms move one float
// through xmm2 (copy / minss / maxss into DST), the DS4 form copies four
// unaligned floats.  xmm2 is the scratch register so xmm0/xmm1 stay intact.
#define KMOVDS1( DST, SRC0 )							\
	__asm	movss	xmm2,SRC0							\
	__asm	movss	DST,xmm2
#define KMOVDS4( DST, SRC0 )							\
	__asm	movups	xmm2,SRC0							\
	__asm	movups	DST,xmm2
#define KMINDS1( DST, SRC0 )							\
	__asm	movss	xmm2,SRC0							\
	__asm	minss	DST,xmm2
#define KMAXDS1( DST, SRC0 )							\
	__asm	movss	xmm2,SRC0							\
	__asm	maxss	DST,xmm2
800 
// general ALU operation
// KALUDSS1/4 expand to DST = SRC0 OP SRC1 for one / four floats; OP is the
// instruction stem (add, sub, mul) token-pasted onto the ss/ps suffix.
#define KALUDSS1( OP, DST, SRC0, SRC1 )					\
	__asm	movss	xmm2,SRC0							\
	__asm	OP##ss	xmm2,SRC1							\
	__asm	movss	DST,xmm2
#define KALUDSS4( OP, DST, SRC0, SRC1 )					\
	__asm	movups	xmm2,SRC0							\
	__asm	movups	xmm3,SRC1							\
	__asm	OP##ps	xmm2,xmm3							\
	__asm	movups	DST,xmm2

// Concrete three-operand forms built from the general ALU macro.
#define KADDDSS1( DST, SRC0, SRC1 )		KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 )		KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 )		KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 )		KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 )		KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 )		KALUDSS4( mul, DST,SRC0,SRC1 )
818 
// KDIVDSS* compute DST = SRC0 / SRC1 using the fast approximate reciprocal
// (rcpss/rcpps) refined by one Newton-Raphson step (x1 = x0*(2 - d*x0)) for
// additional precision, then a multiply instead of a true divide.
#define KDIVDSS1( DST, SRC0, SRC1 )						\
	__asm	movss	xmm2,SRC1							\
	__asm	rcpss	xmm3,xmm2							\
	__asm	mulss	xmm2,xmm3							\
	__asm	mulss	xmm2,xmm3							\
	__asm	addss	xmm3,xmm3							\
	__asm	subss	xmm3,xmm2							\
	__asm	mulss	xmm3,SRC0							\
	__asm	movss	DST,xmm3
#define KDIVDSS4( DST, SRC0, SRC1 )						\
	__asm	movups	xmm2,SRC1							\
	__asm	rcpps	xmm3,xmm2							\
	__asm	mulps	xmm2,xmm3							\
	__asm	mulps	xmm2,xmm3							\
	__asm	addps	xmm3,xmm3							\
	__asm	subps	xmm3,xmm2							\
	__asm	movups	xmm2,SRC0							\
	__asm	mulps	xmm3,xmm2							\
	__asm	movups	DST,xmm3
// KF2IDS* truncate float(s) to int(s) with cvttps2pi, writing through the
// edi+ebx kernel addressing.  cvttps2pi uses MMX registers; no emms is
// visible in this chunk — NOTE(review): confirm it is issued at the use site.
#define	KF2IDS1( SRC0 )									\
	__asm	movss		xmm2,SRC0						\
	__asm	cvttps2pi	mm2,xmm2						\
	__asm	movd		[edi+ebx],mm2
#define	KF2IDS4( SRC0 )									\
	__asm	movups		xmm2,SRC0						\
	__asm	cvttps2pi	mm2,xmm2						\
	__asm	movq		[edi+ebx+0],mm2					\
	__asm	shufps		xmm2,xmm2,SHUFFLEPS(1,0,3,2)	\
	__asm	cvttps2pi	mm2,xmm2						\
	__asm	movq		[edi+ebx+8],mm2
// KISQRTDS* compute DST = 1/sqrt(SRC0) from rsqrtss/rsqrtps plus one
// refinement multiply-subtract; xmm0 and xmm1 are read as refinement
// constants, presumably preloaded by the calling kernel — confirm the
// expected values against the kernels that use these macros.
#define	KISQRTDS1( DST,SRC0 )							\
	__asm	movss	xmm2,SRC0							\
	__asm	rsqrtss	xmm3,xmm2							\
	__asm	mulss	xmm2,xmm3							\
	__asm	mulss	xmm2,xmm3							\
	__asm	subss	xmm2,xmm1							\
	__asm	mulss	xmm3,xmm0							\
	__asm	mulss	xmm3,xmm2							\
	__asm	movss	DST,xmm3
#define	KISQRTDS4( DST,SRC0 )							\
	__asm	movups	xmm2,SRC0							\
	__asm	rsqrtps	xmm3,xmm2							\
	__asm	mulps	xmm2,xmm3							\
	__asm	mulps	xmm2,xmm3							\
	__asm	subps	xmm2,xmm1							\
	__asm	mulps	xmm3,xmm0							\
	__asm	mulps	xmm3,xmm2							\
	__asm	movups	DST,xmm3
867 
// This is used in the vector4 implementation to shift constant V4:
// integer-register AND, DST = SRC0 & VALUE.
#define KANDREGDSV( DST, SRC0, VALUE )					\
	__asm	mov		DST,SRC0							\
	__asm	and		DST,VALUE

// This is used in vector4 code to operate with float arrays as sources:
// broadcast the scalar SRC into all four lanes of DST.
#define KEXPANDFLOAT( DST, SRC )						\
	__asm	movss	DST,SRC								\
	__asm	shufps  DST,DST,0

// In-place forms of the kernel ALU ops: DST = DST OP SRC.
#define	KADDDS1( DST,SRC )		KADDDSS1( DST,DST,SRC )
#define	KADDDS4( DST,SRC )		KADDDSS4( DST,DST,SRC )
#define	KSUBDS1( DST,SRC )		KSUBDSS1( DST,DST,SRC )
#define	KSUBDS4( DST,SRC )		KSUBDSS4( DST,DST,SRC )
#define	KMULDS1( DST,SRC )		KMULDSS1( DST,DST,SRC )
#define	KMULDS4( DST,SRC )		KMULDSS4( DST,DST,SRC )
#define	KDIVDS1( DST,SRC )		KDIVDSS1( DST,DST,SRC )
#define	KDIVDS4( DST,SRC )		KDIVDSS4( DST,DST,SRC )
886 
// Handles pre & post leftovers around the aligned main loop of the KFLOAT_*
// macros: OPER processes one element per iteration, OPER4 processes four.
// 'pre' and 'post' are the leftover element counts computed by the
// KFLOATINIT* macros (defined elsewhere); ebx is the running byte offset.
#define	KFLOATOPER( OPER, OPER4, COUNT )				\
	__asm		mov		ecx,pre							\
	__asm		mov		ebx,COUNT						\
	__asm		cmp		ebx,ecx							\
	__asm		cmovl	ecx,COUNT						\
	__asm		test	ecx,ecx							\
	__asm		je		preDone							\
	__asm		xor		ebx,ebx							\
	__asm	lpPre:										\
				OPER									\
	__asm		add		ebx,4							\
	__asm		dec		ecx								\
	__asm		jg		lpPre							\
	__asm	preDone:									\
	__asm		mov		ecx,post						\
	__asm		mov		ebx,COUNT						\
	__asm		sub		ebx,ecx							\
	__asm		shl		ebx,2							\
	__asm		cmp		ecx,4							\
	__asm		jl		post4Done						\
				OPER4									\
	__asm		sub		ecx,4							\
	__asm		add		ebx,4*4							\
	__asm	post4Done:									\
	__asm		test	ecx,ecx							\
	__asm		je		postDone						\
	__asm	lpPost:										\
				OPER									\
	__asm		add		ebx,4							\
	__asm		dec		ecx								\
	__asm		jg		lpPost							\
	__asm	postDone:
920 
// Operate on a constant and a float array: DST[i] = CONSTANT ALUOP SRC[i].
// The constant is broadcast into xmm0. KFLOATINITDS (defined elsewhere)
// sets up the aligned main-loop range and the 'pre'/'post' leftover counts;
// lpA is the 16-byte-aligned source loop, lpNA the unaligned one, and
// KFLOATOPER mops up the leftover elements at the end.
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT )	\
	int	pre,post;										\
	__asm		movss	xmm0,CONSTANT					\
	__asm		shufps	xmm0,xmm0,0						\
			KFLOATINITDS( DST, SRC, COUNT, pre, post )	\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		ALUOP##ps	xmm1,[edx+ebx]				\
	__asm		ALUOP##ps	xmm2,[edx+ebx+16]			\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		movups	xmm3,[edx+ebx]					\
	__asm		movups	xmm4,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC							\
	__asm		mov		edi,DST							\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ),	\
	__asm					KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
960 
// Operate on two float arrays: DST[i] = SRC0[i] ALUOP SRC1[i].
// KFLOATINITDSS (defined elsewhere) sets up the aligned main-loop range and
// the 'pre'/'post' leftover counts; lpA is taken when both sources are
// 16-byte aligned, lpNA otherwise, and KFLOATOPER handles the leftovers.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
	int	pre,post;										\
	KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post )	\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		movaps	xmm1,[edx+ebx]					\
	__asm		movaps	xmm2,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,[esi+ebx]				\
	__asm		ALUOP##ps	xmm2,[esi+ebx+16]			\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		movups	xmm1,[edx+ebx]					\
	__asm		movups	xmm2,[edx+ebx+16]				\
	__asm		movups	xmm3,[esi+ebx]					\
	__asm		movups	xmm4,[esi+ebx+16]				\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC0						\
	__asm		mov		esi,SRC1						\
	__asm		mov		edi,DST							\
	KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
				KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
1001 
1002 
// Struct sizes hard-coded for the assembly below; presumably these match
// sizeof() of the corresponding engine types — verify when those types change.
#define DRAWVERT_SIZE				60

#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)


// Helpers declaring 16-byte aligned static constants for SSE loads.
#define ALIGN4_INIT1( X, INIT )				ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 )	ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT )				ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }

// 16-bit word constants (8 lanes).
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

// Byte-shuffle patterns used by the matrix-to-quaternion code.
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );

// Bit masks for IEEE-754 single-precision sign/magnitude manipulation.
ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned int SIMD_SP_not, 0xFFFFFFFF );

// Common single-precision float constants, broadcast to all four lanes.
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );

// Constants for the Newton-Raphson refinement of rsqrtss/rsqrtps estimates:
// y' = y * -0.5 * ( x*y*y - 3.0 )
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0,  3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );

// Minimax polynomial coefficients for sine over [-HALF_PI, HALF_PI].
ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1,  2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3,  8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );

// Minimax polynomial coefficients for cosine over [-HALF_PI, HALF_PI].
ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1,  2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3,  4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );

// Minimax polynomial coefficients for arctangent over [-1, 1].
ALIGN4_INIT1( float SIMD_SP_atan_c0,  0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2,  0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4,  0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6,  0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
1067 
1068 /*
1069 ============
1070 SSE_InvSqrt
1071 ============
1072 */
SSE_InvSqrt(float x)1073 float SSE_InvSqrt( float x ) {
1074 	float y;
1075 
1076 	__asm {
1077 		movss		xmm0, x
1078 		rsqrtss		xmm1, xmm0
1079 		mulss		xmm0, xmm1
1080 		mulss		xmm0, xmm1
1081 		subss		xmm0, SIMD_SP_rsqrt_c0
1082 		mulss		xmm1, SIMD_SP_rsqrt_c1
1083 		mulss		xmm0, xmm1
1084 		movss		y, xmm0
1085 	}
1086 	return y;
1087 }
1088 
1089 /*
1090 ============
1091 SSE_InvSqrt4
1092 ============
1093 */
SSE_InvSqrt4(float x[4])1094 void SSE_InvSqrt4( float x[4] ) {
1095 	__asm {
1096 		mov			edi, x
1097 		movaps		xmm0, [edi]
1098 		rsqrtps		xmm1, xmm0
1099 		mulps		xmm0, xmm1
1100 		mulps		xmm0, xmm1
1101 		subps		xmm0, SIMD_SP_rsqrt_c0
1102 		mulps		xmm1, SIMD_SP_rsqrt_c1
1103 		mulps		xmm0, xmm1
1104 		movaps		[edi], xmm0
1105 	}
1106 }
1107 
1108 /*
1109 ============
1110 SSE_SinZeroHalfPI
1111 
1112   The angle must be between zero and half PI.
1113 ============
1114 */
SSE_SinZeroHalfPI(float a)1115 float SSE_SinZeroHalfPI( float a ) {
1116 #if 1
1117 
1118 	float t;
1119 
1120 	assert( a >= 0.0f && a <= idMath::HALF_PI );
1121 
1122 	__asm {
1123 		movss		xmm0, a
1124 		movss		xmm1, xmm0
1125 		mulss		xmm1, xmm1
1126 		movss		xmm2, SIMD_SP_sin_c0
1127 		mulss		xmm2, xmm1
1128 		addss		xmm2, SIMD_SP_sin_c1
1129 		mulss		xmm2, xmm1
1130 		addss		xmm2, SIMD_SP_sin_c2
1131 		mulss		xmm2, xmm1
1132 		addss		xmm2, SIMD_SP_sin_c3
1133 		mulss		xmm2, xmm1
1134 		addss		xmm2, SIMD_SP_sin_c4
1135 		mulss		xmm2, xmm1
1136 		addss		xmm2, SIMD_SP_one
1137 		mulss		xmm2, xmm0
1138 		movss		t, xmm2
1139 	}
1140 
1141 	return t;
1142 
1143 #else
1144 
1145 	float s, t;
1146 
1147 	assert( a >= 0.0f && a <= idMath::HALF_PI );
1148 
1149 	s = a * a;
1150 	t = -2.39e-08f;
1151 	t *= s;
1152 	t += 2.7526e-06f;
1153 	t *= s;
1154 	t += -1.98409e-04f;
1155 	t *= s;
1156 	t += 8.3333315e-03f;
1157 	t *= s;
1158 	t += -1.666666664e-01f;
1159 	t *= s;
1160 	t += 1.0f;
1161 	t *= a;
1162 
1163 	return t;
1164 
1165 #endif
1166 }
1167 
1168 /*
1169 ============
1170 SSE_Sin4ZeroHalfPI
1171 
1172   The angle must be between zero and half PI.
1173 ============
1174 */
SSE_Sin4ZeroHalfPI(float a[4],float s[4])1175 void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
1176 	__asm {
1177 		mov			edi, a
1178 		mov			esi, s
1179 		movaps		xmm0, [edi]
1180 		movaps		xmm1, xmm0
1181 		mulps		xmm1, xmm1
1182 		movaps		xmm2, SIMD_SP_sin_c0
1183 		mulps		xmm2, xmm1
1184 		addps		xmm2, SIMD_SP_sin_c1
1185 		mulps		xmm2, xmm1
1186 		addps		xmm2, SIMD_SP_sin_c2
1187 		mulps		xmm2, xmm1
1188 		addps		xmm2, SIMD_SP_sin_c3
1189 		mulps		xmm2, xmm1
1190 		addps		xmm2, SIMD_SP_sin_c4
1191 		mulps		xmm2, xmm1
1192 		addps		xmm2, SIMD_SP_one
1193 		mulps		xmm2, xmm0
1194 		movaps		[esi], xmm2
1195 	}
1196 }
1197 
1198 /*
1199 ============
1200 SSE_Sin
1201 ============
1202 */
SSE_Sin(float a)1203 float SSE_Sin( float a ) {
1204 #if 1
1205 
1206 	float t;
1207 
1208 	__asm {
1209 		movss		xmm1, a
1210 		movss		xmm2, xmm1
1211 		movss		xmm3, xmm1
1212 		mulss		xmm2, SIMD_SP_oneOverTwoPI
1213 		cvttss2si	ecx, xmm2
1214 		cmpltss		xmm3, SIMD_SP_zero
1215 		andps		xmm3, SIMD_SP_one
1216 		cvtsi2ss	xmm2, ecx
1217 		subss		xmm2, xmm3
1218 		mulss		xmm2, SIMD_SP_twoPI
1219 		subss		xmm1, xmm2
1220 
1221 		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
1222 		subss		xmm0, xmm1					// xmm0 = PI - a
1223 		movss		xmm1, xmm0					// xmm1 = PI - a
1224 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1225 		movss		xmm2, xmm0					// xmm2 = PI - a
1226 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1227 		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1228 		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
1229 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1230 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1231 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1232 		xorps		xmm0, xmm2
1233 		addps		xmm0, xmm3
1234 
1235 		movss		xmm1, xmm0
1236 		mulss		xmm1, xmm1
1237 		movss		xmm2, SIMD_SP_sin_c0
1238 		mulss		xmm2, xmm1
1239 		addss		xmm2, SIMD_SP_sin_c1
1240 		mulss		xmm2, xmm1
1241 		addss		xmm2, SIMD_SP_sin_c2
1242 		mulss		xmm2, xmm1
1243 		addss		xmm2, SIMD_SP_sin_c3
1244 		mulss		xmm2, xmm1
1245 		addss		xmm2, SIMD_SP_sin_c4
1246 		mulss		xmm2, xmm1
1247 		addss		xmm2, SIMD_SP_one
1248 		mulss		xmm2, xmm0
1249 		movss		t, xmm2
1250 	}
1251 
1252 	return t;
1253 
1254 #else
1255 
1256 	float s, t;
1257 
1258 	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
1259 		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
1260 	}
1261 
1262 	a = idMath::PI - a;
1263 	if ( fabs( a ) >= idMath::HALF_PI ) {
1264 		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
1265 	}
1266 
1267 	s = a * a;
1268 	t = -2.39e-08f;
1269 	t *= s;
1270 	t += 2.7526e-06f;
1271 	t *= s;
1272 	t += -1.98409e-04f;
1273 	t *= s;
1274 	t += 8.3333315e-03f;
1275 	t *= s;
1276 	t += -1.666666664e-01f;
1277 	t *= s;
1278 	t += 1.0f;
1279 	t *= a;
1280 
1281 	return t;
1282 
1283 #endif
1284 }
1285 
1286 /*
1287 ============
1288 SSE_Sin4
1289 ============
1290 */
SSE_Sin4(float a[4],float s[4])1291 void SSE_Sin4( float a[4], float s[4] ) {
1292 	__asm {
1293 		mov			edi, a
1294 		mov			esi, s
1295 		movaps		xmm1, [edi]
1296 		movaps		xmm2, xmm1
1297 		mulps		xmm2, SIMD_SP_oneOverTwoPI
1298 		movhlps		xmm3, xmm2
1299 		cvttss2si	ecx, xmm2
1300 		cvtsi2ss	xmm2, ecx
1301 		cvttss2si	edx, xmm3
1302 		cvtsi2ss	xmm3, edx
1303 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
1304 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
1305 		cvttss2si	ecx, xmm2
1306 		cvtsi2ss	xmm2, ecx
1307 		cvttss2si	edx, xmm3
1308 		cvtsi2ss	xmm3, edx
1309 		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
1310 		movaps		xmm3, xmm1
1311 		cmpltps		xmm3, SIMD_SP_zero
1312 		andps		xmm3, SIMD_SP_one
1313 		subps		xmm2, xmm3
1314 		mulps		xmm2, SIMD_SP_twoPI
1315 		subps		xmm1, xmm2
1316 
1317 		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
1318 		subps		xmm0, xmm1					// xmm0 = PI - a
1319 		movaps		xmm1, xmm0					// xmm1 = PI - a
1320 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1321 		movaps		xmm2, xmm0					// xmm2 = PI - a
1322 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1323 		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1324 		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
1325 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1326 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1327 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1328 		xorps		xmm0, xmm2
1329 		addps		xmm0, xmm3
1330 
1331 		movaps		xmm1, xmm0
1332 		mulps		xmm1, xmm1
1333 		movaps		xmm2, SIMD_SP_sin_c0
1334 		mulps		xmm2, xmm1
1335 		addps		xmm2, SIMD_SP_sin_c1
1336 		mulps		xmm2, xmm1
1337 		addps		xmm2, SIMD_SP_sin_c2
1338 		mulps		xmm2, xmm1
1339 		addps		xmm2, SIMD_SP_sin_c3
1340 		mulps		xmm2, xmm1
1341 		addps		xmm2, SIMD_SP_sin_c4
1342 		mulps		xmm2, xmm1
1343 		addps		xmm2, SIMD_SP_one
1344 		mulps		xmm2, xmm0
1345 		movaps		[esi], xmm2
1346 	}
1347 }
1348 
1349 /*
1350 ============
1351 SSE_CosZeroHalfPI
1352 
1353   The angle must be between zero and half PI.
1354 ============
1355 */
SSE_CosZeroHalfPI(float a)1356 float SSE_CosZeroHalfPI( float a ) {
1357 #if 1
1358 
1359 	float t;
1360 
1361 	assert( a >= 0.0f && a <= idMath::HALF_PI );
1362 
1363 	__asm {
1364 		movss		xmm0, a
1365 		mulss		xmm0, xmm0
1366 		movss		xmm1, SIMD_SP_cos_c0
1367 		mulss		xmm1, xmm0
1368 		addss		xmm1, SIMD_SP_cos_c1
1369 		mulss		xmm1, xmm0
1370 		addss		xmm1, SIMD_SP_cos_c2
1371 		mulss		xmm1, xmm0
1372 		addss		xmm1, SIMD_SP_cos_c3
1373 		mulss		xmm1, xmm0
1374 		addss		xmm1, SIMD_SP_cos_c4
1375 		mulss		xmm1, xmm0
1376 		addss		xmm1, SIMD_SP_one
1377 		movss		t, xmm1
1378 	}
1379 
1380 	return t;
1381 
1382 #else
1383 
1384 	float s, t;
1385 
1386 	assert( a >= 0.0f && a <= idMath::HALF_PI );
1387 
1388 	s = a * a;
1389 	t = -2.605e-07f;
1390 	t *= s;
1391 	t += 2.47609e-05f;
1392 	t *= s;
1393 	t += -1.3888397e-03f;
1394 	t *= s;
1395 	t += 4.16666418e-02f;
1396 	t *= s;
1397 	t += -4.999999963e-01f;
1398 	t *= s;
1399 	t += 1.0f;
1400 
1401 	return t;
1402 
1403 #endif
1404 }
1405 
1406 /*
1407 ============
1408 SSE_Cos4ZeroHalfPI
1409 
1410   The angle must be between zero and half PI.
1411 ============
1412 */
SSE_Cos4ZeroHalfPI(float a[4],float c[4])1413 void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
1414 	__asm {
1415 		mov			edi, a
1416 		mov			esi, c
1417 		movaps		xmm0, [edi]
1418 		mulps		xmm0, xmm0
1419 		movaps		xmm1, SIMD_SP_cos_c0
1420 		mulps		xmm1, xmm0
1421 		addps		xmm1, SIMD_SP_cos_c1
1422 		mulps		xmm1, xmm0
1423 		addps		xmm1, SIMD_SP_cos_c2
1424 		mulps		xmm1, xmm0
1425 		addps		xmm1, SIMD_SP_cos_c3
1426 		mulps		xmm1, xmm0
1427 		addps		xmm1, SIMD_SP_cos_c4
1428 		mulps		xmm1, xmm0
1429 		addps		xmm1, SIMD_SP_one
1430 		movaps		[esi], xmm2
1431 	}
1432 }
1433 
1434 /*
1435 ============
1436 SSE_Cos
1437 ============
1438 */
SSE_Cos(float a)1439 float SSE_Cos( float a ) {
1440 #if 1
1441 
1442 	float t;
1443 
1444 	__asm {
1445 		movss		xmm1, a
1446 		movss		xmm2, xmm1
1447 		movss		xmm3, xmm1
1448 		mulss		xmm2, SIMD_SP_oneOverTwoPI
1449 		cvttss2si	ecx, xmm2
1450 		cmpltss		xmm3, SIMD_SP_zero
1451 		andps		xmm3, SIMD_SP_one
1452 		cvtsi2ss	xmm2, ecx
1453 		subss		xmm2, xmm3
1454 		mulss		xmm2, SIMD_SP_twoPI
1455 		subss		xmm1, xmm2
1456 
1457 		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
1458 		subss		xmm0, xmm1					// xmm0 = PI - a
1459 		movss		xmm1, xmm0					// xmm1 = PI - a
1460 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1461 		movss		xmm2, xmm0					// xmm2 = PI - a
1462 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1463 		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1464 		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
1465 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1466 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1467 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1468 		xorps		xmm0, xmm2
1469 		addps		xmm0, xmm3
1470 
1471 		mulss		xmm0, xmm0
1472 		movss		xmm1, SIMD_SP_cos_c0
1473 		mulss		xmm1, xmm0
1474 		addss		xmm1, SIMD_SP_cos_c1
1475 		mulss		xmm1, xmm0
1476 		addss		xmm1, SIMD_SP_cos_c2
1477 		mulss		xmm1, xmm0
1478 		addss		xmm1, SIMD_SP_cos_c3
1479 		mulss		xmm1, xmm0
1480 		addss		xmm1, SIMD_SP_cos_c4
1481 		mulss		xmm1, xmm0
1482 		addss		xmm1, SIMD_SP_one
1483 		xorps		xmm2, SIMD_SP_signBitMask
1484 		xorps		xmm1, xmm2
1485 		movss		t, xmm1
1486 	}
1487 
1488 	return t;
1489 
1490 #else
1491 
1492 	float s, t;
1493 
1494 	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
1495 		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
1496 	}
1497 
1498 	a = idMath::PI - a;
1499 	if ( fabs( a ) >= idMath::HALF_PI ) {
1500 		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
1501 		d = 1.0f;
1502 	} else {
1503 		d = -1.0f;
1504 	}
1505 
1506 	s = a * a;
1507 	t = -2.605e-07f;
1508 	t *= s;
1509 	t += 2.47609e-05f;
1510 	t *= s;
1511 	t += -1.3888397e-03f;
1512 	t *= s;
1513 	t += 4.16666418e-02f;
1514 	t *= s;
1515 	t += -4.999999963e-01f;
1516 	t *= s;
1517 	t += 1.0f;
1518 	t *= d;
1519 
1520 	return t;
1521 
1522 #endif
1523 }
1524 
1525 /*
1526 ============
1527 SSE_Cos4
1528 ============
1529 */
SSE_Cos4(float a[4],float c[4])1530 void SSE_Cos4( float a[4], float c[4] ) {
1531 	__asm {
1532 		mov			edi, a
1533 		mov			esi, c
1534 		movaps		xmm1, [edi]
1535 		movaps		xmm2, xmm1
1536 		mulps		xmm2, SIMD_SP_oneOverTwoPI
1537 		movhlps		xmm3, xmm2
1538 		cvttss2si	ecx, xmm2
1539 		cvtsi2ss	xmm2, ecx
1540 		cvttss2si	edx, xmm3
1541 		cvtsi2ss	xmm3, edx
1542 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
1543 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
1544 		cvttss2si	ecx, xmm2
1545 		cvtsi2ss	xmm2, ecx
1546 		cvttss2si	edx, xmm3
1547 		cvtsi2ss	xmm3, edx
1548 		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
1549 		movaps		xmm3, xmm1
1550 		cmpltps		xmm3, SIMD_SP_zero
1551 		andps		xmm3, SIMD_SP_one
1552 		subps		xmm2, xmm3
1553 		mulps		xmm2, SIMD_SP_twoPI
1554 		subps		xmm1, xmm2
1555 
1556 		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
1557 		subps		xmm0, xmm1					// xmm0 = PI - a
1558 		movaps		xmm1, xmm0					// xmm1 = PI - a
1559 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1560 		movaps		xmm2, xmm0					// xmm2 = PI - a
1561 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1562 		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1563 		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
1564 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1565 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1566 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1567 		xorps		xmm0, xmm2
1568 		addps		xmm0, xmm3
1569 
1570 		mulps		xmm0, xmm0
1571 		movaps		xmm1, SIMD_SP_cos_c0
1572 		mulps		xmm1, xmm0
1573 		addps		xmm1, SIMD_SP_cos_c1
1574 		mulps		xmm1, xmm0
1575 		addps		xmm1, SIMD_SP_cos_c2
1576 		mulps		xmm1, xmm0
1577 		addps		xmm1, SIMD_SP_cos_c3
1578 		mulps		xmm1, xmm0
1579 		addps		xmm1, SIMD_SP_cos_c4
1580 		mulps		xmm1, xmm0
1581 		addps		xmm1, SIMD_SP_one
1582 		xorps		xmm2, SIMD_SP_signBitMask
1583 		xorps		xmm1, xmm2
1584 		movaps		[esi], xmm1
1585 	}
1586 }
1587 
1588 /*
1589 ============
1590 SSE_SinCos
1591 ============
1592 */
SSE_SinCos(float a,float & s,float & c)1593 void SSE_SinCos( float a, float &s, float &c ) {
1594 	__asm {
1595 		mov			edi, s
1596 		mov			esi, c
1597 		movss		xmm1, a
1598 		movss		xmm2, xmm1
1599 		movss		xmm3, xmm1
1600 		mulss		xmm2, SIMD_SP_oneOverTwoPI
1601 		cvttss2si	ecx, xmm2
1602 		cmpltss		xmm3, SIMD_SP_zero
1603 		andps		xmm3, SIMD_SP_one
1604 		cvtsi2ss	xmm2, ecx
1605 		subss		xmm2, xmm3
1606 		mulss		xmm2, SIMD_SP_twoPI
1607 		subss		xmm1, xmm2
1608 
1609 		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
1610 		subss		xmm0, xmm1					// xmm0 = PI - a
1611 		movss		xmm1, xmm0					// xmm1 = PI - a
1612 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1613 		movss		xmm2, xmm0					// xmm2 = PI - a
1614 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1615 		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1616 		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
1617 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1618 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1619 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1620 		xorps		xmm0, xmm2
1621 		addps		xmm0, xmm3
1622 
1623 		movss		xmm1, xmm0
1624 		mulss		xmm1, xmm1
1625 		movss		xmm3, SIMD_SP_sin_c0
1626 		movss		xmm4, SIMD_SP_cos_c0
1627 		mulss		xmm3, xmm1
1628 		mulss		xmm4, xmm1
1629 		addss		xmm3, SIMD_SP_sin_c1
1630 		addss		xmm4, SIMD_SP_cos_c1
1631 		mulss		xmm3, xmm1
1632 		mulss		xmm4, xmm1
1633 		addss		xmm3, SIMD_SP_sin_c2
1634 		addss		xmm4, SIMD_SP_cos_c2
1635 		mulss		xmm3, xmm1
1636 		mulss		xmm4, xmm1
1637 		addss		xmm3, SIMD_SP_sin_c3
1638 		addss		xmm4, SIMD_SP_cos_c3
1639 		mulss		xmm3, xmm1
1640 		mulss		xmm4, xmm1
1641 		addss		xmm3, SIMD_SP_sin_c4
1642 		addss		xmm4, SIMD_SP_cos_c4
1643 		mulss		xmm3, xmm1
1644 		mulss		xmm4, xmm1
1645 		addss		xmm3, SIMD_SP_one
1646 		addss		xmm4, SIMD_SP_one
1647 		mulss		xmm3, xmm0
1648 		xorps		xmm2, SIMD_SP_signBitMask
1649 		xorps		xmm4, xmm2
1650 		movss		[edi], xmm2
1651 		movss		[esi], xmm3
1652 	}
1653 }
1654 
1655 /*
1656 ============
1657 SSE_SinCos4
1658 ============
1659 */
SSE_SinCos4(float a[4],float s[4],float c[4])1660 void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
1661 	__asm {
1662 		mov			eax, a
1663 		mov			edi, s
1664 		mov			esi, c
1665 		movaps		xmm1, [eax]
1666 		movaps		xmm2, xmm1
1667 		mulps		xmm2, SIMD_SP_oneOverTwoPI
1668 		movhlps		xmm3, xmm2
1669 		cvttss2si	ecx, xmm2
1670 		cvtsi2ss	xmm2, ecx
1671 		cvttss2si	edx, xmm3
1672 		cvtsi2ss	xmm3, edx
1673 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
1674 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
1675 		cvttss2si	ecx, xmm2
1676 		cvtsi2ss	xmm2, ecx
1677 		cvttss2si	edx, xmm3
1678 		cvtsi2ss	xmm3, edx
1679 		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
1680 		movaps		xmm3, xmm1
1681 		cmpltps		xmm3, SIMD_SP_zero
1682 		andps		xmm3, SIMD_SP_one
1683 		subps		xmm2, xmm3
1684 		mulps		xmm2, SIMD_SP_twoPI
1685 		subps		xmm1, xmm2
1686 
1687 		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
1688 		subps		xmm0, xmm1					// xmm0 = PI - a
1689 		movaps		xmm1, xmm0					// xmm1 = PI - a
1690 		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
1691 		movaps		xmm2, xmm0					// xmm2 = PI - a
1692 		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
1693 		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
1694 		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
1695 		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
1696 		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
1697 		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
1698 		xorps		xmm0, xmm2
1699 		addps		xmm0, xmm3
1700 
1701 		movaps		xmm0, [eax]
1702 		movaps		xmm1, xmm0
1703 		mulps		xmm1, xmm1
1704 		movaps		xmm3, SIMD_SP_sin_c0
1705 		movaps		xmm4, SIMD_SP_cos_c0
1706 		mulps		xmm3, xmm1
1707 		mulps		xmm4, xmm1
1708 		addps		xmm3, SIMD_SP_sin_c1
1709 		addps		xmm4, SIMD_SP_cos_c1
1710 		mulps		xmm3, xmm1
1711 		mulps		xmm4, xmm1
1712 		addps		xmm3, SIMD_SP_sin_c2
1713 		addps		xmm4, SIMD_SP_cos_c2
1714 		mulps		xmm3, xmm1
1715 		mulps		xmm4, xmm1
1716 		addps		xmm3, SIMD_SP_sin_c3
1717 		addps		xmm4, SIMD_SP_cos_c3
1718 		mulps		xmm3, xmm1
1719 		mulps		xmm4, xmm1
1720 		addps		xmm3, SIMD_SP_sin_c4
1721 		addps		xmm4, SIMD_SP_cos_c4
1722 		mulps		xmm3, xmm1
1723 		mulps		xmm4, xmm1
1724 		addps		xmm3, SIMD_SP_one
1725 		addps		xmm4, SIMD_SP_one
1726 		mulps		xmm3, xmm0
1727 		xorps		xmm2, SIMD_SP_signBitMask
1728 		xorps		xmm4, xmm2
1729 		movaps		[edi], xmm3
1730 		movaps		[esi], xmm4
1731 	}
1732 }
1733 
1734 /*
1735 ============
1736 SSE_ATanPositive
1737 
1738   Both 'x' and 'y' must be positive.
1739 ============
1740 */
SSE_ATanPositive(float y,float x)1741 float SSE_ATanPositive( float y, float x ) {
1742 #if 1
1743 
1744 	float t;
1745 
1746 	assert( y >= 0.0f && x >= 0.0f );
1747 
1748 	__asm {
1749 		movss		xmm0, x
1750 		movss		xmm3, xmm0
1751 		movss		xmm1, y
1752 		minss		xmm0, xmm1
1753 		maxss		xmm1, xmm3
1754 		cmpeqss		xmm3, xmm0
1755 		rcpss		xmm2, xmm1
1756 		mulss		xmm1, xmm2
1757 		mulss		xmm1, xmm2
1758 		addss		xmm2, xmm2
1759 		subss		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
1760 		mulss		xmm0, xmm2				// xmm0 = x / y or y / x
1761 		movss		xmm1, xmm3
1762 		andps		xmm1, SIMD_SP_signBitMask
1763 		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
1764 		andps		xmm3, SIMD_SP_halfPI	// xmm3 = HALF_PI or 0.0f
1765 		movss		xmm1, xmm0
1766 		mulss		xmm1, xmm1				// xmm1 = s
1767 		movss		xmm2, SIMD_SP_atan_c0
1768 		mulss		xmm2, xmm1
1769 		addss		xmm2, SIMD_SP_atan_c1
1770 		mulss		xmm2, xmm1
1771 		addss		xmm2, SIMD_SP_atan_c2
1772 		mulss		xmm2, xmm1
1773 		addss		xmm2, SIMD_SP_atan_c3
1774 		mulss		xmm2, xmm1
1775 		addss		xmm2, SIMD_SP_atan_c4
1776 		mulss		xmm2, xmm1
1777 		addss		xmm2, SIMD_SP_atan_c5
1778 		mulss		xmm2, xmm1
1779 		addss		xmm2, SIMD_SP_atan_c6
1780 		mulss		xmm2, xmm1
1781 		addss		xmm2, SIMD_SP_atan_c7
1782 		mulss		xmm2, xmm1
1783 		addss		xmm2, SIMD_SP_one
1784 		mulss		xmm2, xmm0
1785 		addss		xmm2, xmm3
1786 		movss		t, xmm2
1787 	}
1788 
1789 	return t;
1790 
1791 #else
1792 
1793 	float a, d, s, t;
1794 
1795 	assert( y >= 0.0f && x >= 0.0f );
1796 
1797 	if ( y > x ) {
1798 		a = -x / y;
1799 		d = idMath::HALF_PI;
1800 	} else {
1801 		a = y / x;
1802 		d = 0.0f;
1803 	}
1804 	s = a * a;
1805 	t = 0.0028662257f;
1806 	t *= s;
1807 	t += -0.0161657367f;
1808 	t *= s;
1809 	t += 0.0429096138f;
1810 	t *= s;
1811 	t += -0.0752896400f;
1812 	t *= s;
1813 	t += 0.1065626393f;
1814 	t *= s;
1815 	t += -0.1420889944f;
1816 	t *= s;
1817 	t += 0.1999355085f;
1818 	t *= s;
1819 	t += -0.3333314528f;
1820 	t *= s;
1821 	t += 1.0f;
1822 	t *= a;
1823 	t += d;
1824 
1825 	return t;
1826 
1827 #endif
1828 }
1829 
1830 /*
1831 ============
1832 SSE_ATan4Positive
1833 
1834   Both 'x' and 'y' must be positive.
1835 ============
1836 */
SSE_ATan4Positive(float y[4],float x[4],float at[4])1837 void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
1838 	__asm {
1839 		mov			esi, x
1840 		mov			edi, y
1841 		mov			edx, at
1842 		movaps		xmm0, [esi]
1843 		movaps		xmm3, xmm0
1844 		movaps		xmm1, [edi]
1845 		minps		xmm0, xmm1
1846 		maxps		xmm1, xmm3
1847 		cmpeqps		xmm3, xmm0
1848 		rcpps		xmm2, xmm1
1849 		mulps		xmm1, xmm2
1850 		mulps		xmm1, xmm2
1851 		addps		xmm2, xmm2
1852 		subps		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
1853 		mulps		xmm0, xmm2				// xmm0 = x / y or y / x
1854 		movaps		xmm1, xmm3
1855 		andps		xmm1, SIMD_SP_signBitMask
1856 		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
1857 		andps		xmm3, SIMD_SP_halfPI	// xmm3 = HALF_PI or 0.0f
1858 		movaps		xmm1, xmm0
1859 		mulps		xmm1, xmm1				// xmm1 = s
1860 		movaps		xmm2, SIMD_SP_atan_c0
1861 		mulps		xmm2, xmm1
1862 		addps		xmm2, SIMD_SP_atan_c1
1863 		mulps		xmm2, xmm1
1864 		addps		xmm2, SIMD_SP_atan_c2
1865 		mulps		xmm2, xmm1
1866 		addps		xmm2, SIMD_SP_atan_c3
1867 		mulps		xmm2, xmm1
1868 		addps		xmm2, SIMD_SP_atan_c4
1869 		mulps		xmm2, xmm1
1870 		addps		xmm2, SIMD_SP_atan_c5
1871 		mulps		xmm2, xmm1
1872 		addps		xmm2, SIMD_SP_atan_c6
1873 		mulps		xmm2, xmm1
1874 		addps		xmm2, SIMD_SP_atan_c7
1875 		mulps		xmm2, xmm1
1876 		addps		xmm2, SIMD_SP_one
1877 		mulps		xmm2, xmm0
1878 		addps		xmm2, xmm3
1879 		movaps		[edx], xmm2
1880 	}
1881 }
1882 
1883 /*
1884 ============
1885 SSE_ATan
1886 ============
1887 */
SSE_ATan(float y,float x)1888 float SSE_ATan( float y, float x ) {
1889 #if 1
1890 
1891 	float t;
1892 
1893 	__asm {
1894 		movss		xmm0, x
1895 		movss		xmm3, xmm0
1896 		movss		xmm4, xmm0
1897 		andps		xmm0, SIMD_SP_absMask
1898 		movss		xmm1, y
1899 		xorps		xmm4, xmm1
1900 		andps		xmm1, SIMD_SP_absMask
1901 		andps		xmm4, SIMD_SP_signBitMask
1902 		minss		xmm0, xmm1
1903 		maxss		xmm1, xmm3
1904 		cmpeqss		xmm3, xmm0
1905 		rcpss		xmm2, xmm1
1906 		mulss		xmm1, xmm2
1907 		mulss		xmm1, xmm2
1908 		addss		xmm2, xmm2
1909 		subss		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
1910 		mulss		xmm0, xmm2				// xmm0 = x / y or y / x
1911 		xorps		xmm0, xmm4
1912 		movss		xmm1, xmm3
1913 		andps		xmm1, SIMD_SP_signBitMask
1914 		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
1915 		orps		xmm4, SIMD_SP_halfPI	// xmm4 = +/- HALF_PI
1916 		andps		xmm3, xmm4				// xmm3 = +/- HALF_PI or 0.0f
1917 		movss		xmm1, xmm0
1918 		mulss		xmm1, xmm1				// xmm1 = s
1919 		movss		xmm2, SIMD_SP_atan_c0
1920 		mulss		xmm2, xmm1
1921 		addss		xmm2, SIMD_SP_atan_c1
1922 		mulss		xmm2, xmm1
1923 		addss		xmm2, SIMD_SP_atan_c2
1924 		mulss		xmm2, xmm1
1925 		addss		xmm2, SIMD_SP_atan_c3
1926 		mulss		xmm2, xmm1
1927 		addss		xmm2, SIMD_SP_atan_c4
1928 		mulss		xmm2, xmm1
1929 		addss		xmm2, SIMD_SP_atan_c5
1930 		mulss		xmm2, xmm1
1931 		addss		xmm2, SIMD_SP_atan_c6
1932 		mulss		xmm2, xmm1
1933 		addss		xmm2, SIMD_SP_atan_c7
1934 		mulss		xmm2, xmm1
1935 		addss		xmm2, SIMD_SP_one
1936 		mulss		xmm2, xmm0
1937 		addss		xmm2, xmm3
1938 		movss		t, xmm2
1939 	}
1940 
1941 	return t;
1942 
1943 #else
1944 
1945 	float a, d, s, t;
1946 
1947 	if ( fabs( y ) > fabs( x ) ) {
1948 		a = -x / y;
1949 		d = idMath::HALF_PI;
1950 		*((unsigned int *)&d) ^= ( *((unsigned int *)&x) ^ *((unsigned int *)&y) ) & (1<<31);
1951 	} else {
1952 		a = y / x;
1953 		d = 0.0f;
1954 	}
1955 
1956 	s = a * a;
1957 	t = 0.0028662257f;
1958 	t *= s;
1959 	t += -0.0161657367f;
1960 	t *= s;
1961 	t += 0.0429096138f;
1962 	t *= s;
1963 	t += -0.0752896400f;
1964 	t *= s;
1965 	t += 0.1065626393f;
1966 	t *= s;
1967 	t += -0.1420889944f;
1968 	t *= s;
1969 	t += 0.1999355085f;
1970 	t *= s;
1971 	t += -0.3333314528f;
1972 	t *= s;
1973 	t += 1.0f;
1974 	t *= a;
1975 	t += d;
1976 
1977 	return t;
1978 
1979 #endif
1980 }
1981 
1982 /*
1983 ============
1984 SSE_ATan4
1985 ============
1986 */
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
	// Four-wide version of SSE_ATan: at[i] = atan2( y[i], x[i] ) for i = 0..3.
	// All loads/stores use movaps, so x, y and at must be 16-byte aligned.
	__asm {
		mov			esi, x
		mov			edi, y
		mov			edx, at
		movaps		xmm0, [esi]
		movaps		xmm3, xmm0
		movaps		xmm4, xmm0
		andps		xmm0, SIMD_SP_absMask			// xmm0 = fabs( x )
		movaps		xmm1, [edi]
		xorps		xmm4, xmm1
		andps		xmm1, SIMD_SP_absMask			// xmm1 = fabs( y )
		andps		xmm4, SIMD_SP_signBitMask		// xmm4 = sign bit of x ^ y
		minps		xmm0, xmm1
		maxps		xmm1, xmm3
		cmpeqps		xmm3, xmm0						// mask = all ones where the operands were swapped
		rcpps		xmm2, xmm1						// low-precision reciprocal estimate
		mulps		xmm1, xmm2
		mulps		xmm1, xmm2
		addps		xmm2, xmm2
		subps		xmm2, xmm1				// xmm2 = 1 / y or 1 / x (Newton-Raphson refined)
		mulps		xmm0, xmm2				// xmm0 = x / y or y / x
		xorps		xmm0, xmm4
		movaps		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		orps		xmm4, SIMD_SP_halfPI	// xmm4 = +/- HALF_PI
		andps		xmm3, xmm4				// xmm3 = +/- HALF_PI or 0.0f
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1				// xmm1 = s = a * a
		// Horner evaluation of the atan polynomial in s (see SSE_ATan's C path)
		movaps		xmm2, SIMD_SP_atan_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c5
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c6
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_atan_c7
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		addps		xmm2, xmm3
		movaps		[edx], xmm2
	}
}
2039 
2040 /*
2041 ============
2042 SSE_TestTrigonometry
2043 ============
2044 */
SSE_TestTrigonometry(void)2045 void SSE_TestTrigonometry( void ) {
2046 	int i;
2047 	float a, s1, s2, c1, c2;
2048 
2049 	for ( i = 0; i < 100; i++ ) {
2050 		a = i * idMath::HALF_PI / 100.0f;
2051 
2052 		s1 = sin( a );
2053 		s2 = SSE_SinZeroHalfPI( a );
2054 
2055 		if ( fabs( s1 - s2 ) > 1e-7f ) {
2056 			assert( 0 );
2057 		}
2058 
2059 		c1 = cos( a );
2060 		c2 = SSE_CosZeroHalfPI( a );
2061 
2062 		if ( fabs( c1 - c2 ) > 1e-7f ) {
2063 			assert( 0 );
2064 		}
2065 	}
2066 
2067 	for ( i = -200; i < 200; i++ ) {
2068 		a = i * idMath::TWO_PI / 100.0f;
2069 
2070 		s1 = sin( a );
2071 		s2 = SSE_Sin( a );
2072 
2073 		if ( fabs( s1 - s2 ) > 1e-6f ) {
2074 			assert( 0 );
2075 		}
2076 
2077 		c1 = cos( a );
2078 		c2 = SSE_Cos( a );
2079 
2080 		if ( fabs( c1 - c2 ) > 1e-6f ) {
2081 			assert( 0 );
2082 		}
2083 
2084 		SSE_SinCos( a, s2, c2 );
2085 		if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
2086 			assert( 0 );
2087 		}
2088 	}
2089 }
2090 
2091 /*
2092 ============
2093 idSIMD_SSE::GetName
2094 ============
2095 */
GetName(void) const2096 const char * idSIMD_SSE::GetName( void ) const {
2097 	return "MMX & SSE";
2098 }
2099 
2100 /*
2101 ============
2102 idSIMD_SSE::Add
2103 
2104   dst[i] = constant + src[i];
2105 ============
2106 */
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant + src[i]
	// KFLOAT_CA (helper macro defined elsewhere in this file) expands to the
	// SSE constant-vs-array kernel for the named operation.
	KFLOAT_CA( add, dst, src, constant, count )
}
2110 
2111 /*
2112 ============
2113 idSIMD_SSE::Add
2114 
2115   dst[i] = src0[i] + src1[i];
2116 ============
2117 */
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] + src1[i]
	// KFLOAT_AA (helper macro defined elsewhere in this file) expands to the
	// SSE array-vs-array kernel for the named operation.
	KFLOAT_AA( add, dst, src0, src1, count )
}
2121 
2122 /*
2123 ============
2124 idSIMD_SSE::Sub
2125 
2126   dst[i] = constant - src[i];
2127 ============
2128 */
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant - src[i]
	// KFLOAT_CA (helper macro defined elsewhere in this file) expands to the
	// SSE constant-vs-array kernel for the named operation.
	KFLOAT_CA( sub, dst, src, constant, count )
}
2132 
2133 /*
2134 ============
2135 idSIMD_SSE::Sub
2136 
2137   dst[i] = src0[i] - src1[i];
2138 ============
2139 */
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] - src1[i]
	// KFLOAT_AA (helper macro defined elsewhere in this file) expands to the
	// SSE array-vs-array kernel for the named operation.
	KFLOAT_AA( sub, dst, src0, src1, count )
}
2143 
2144 /*
2145 ============
2146 idSIMD_SSE::Mul
2147 
2148   dst[i] = constant * src[i];
2149 ============
2150 */
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] = constant * src[i]
	// KFLOAT_CA (helper macro defined elsewhere in this file) expands to the
	// SSE constant-vs-array kernel for the named operation.
	KFLOAT_CA( mul, dst, src, constant, count )
}
2154 
2155 /*
2156 ============
2157 idSIMD_SSE::Mul
2158 
2159   dst[i] = src0[i] * src1[i];
2160 ============
2161 */
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] * src1[i]
	// KFLOAT_AA (helper macro defined elsewhere in this file) expands to the
	// SSE array-vs-array kernel for the named operation.
	KFLOAT_AA( mul, dst, src0, src1, count )
}
2165 
2166 /*
2167 ============
2168 idSIMD_SSE::Div
2169 
2170   dst[i] = constant / src[i];
2171 ============
2172 */
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
	int pre, post;		// head/tail element counts, filled in by KFLOATINITDS

	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	// The rcpps reciprocal estimate is refined with one Newton-Raphson step
	// (the identity above), then scaled by the broadcast constant.
	// lpA is the 16-byte aligned source path, lpNA the unaligned-source path;
	// note both paths store with movaps, so dst is assumed aligned here --
	// presumably guaranteed by KFLOATINITDS (defined elsewhere); verify there.
	__asm
	{
		movss	xmm1,constant
		shufps	xmm1,xmm1,0				// broadcast constant into all four lanes

		KFLOATINITDS( dst, src, count, pre, post )
		and		eax,15
		jne		lpNA
		jmp		lpA
		align	16
lpA:
		movaps	xmm2,[edx+ebx]
		movaps	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2				// xmm4 = refined 1 / src
		subps	xmm5,xmm3				// xmm5 = refined 1 / src
		mulps	xmm4,xmm1
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
lpNA:
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[edx+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2				// xmm4 = refined 1 / src
		subps	xmm5,xmm3				// xmm5 = refined 1 / src
		mulps	xmm4,xmm1
		mulps	xmm5,xmm1
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
done:
		// scalar handling of the pre/post leftover elements
		mov		edx,src
		mov		edi,dst
		KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
					KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
	}
}
2236 
2237 /*
2238 ============
2239 idSIMD_SSE::Div
2240 
2241   dst[i] = src0[i] / src1[i];
2242 ============
2243 */
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
	int		pre,post;	// head/tail element counts, filled in by KFLOATINITDSS

	//	1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	// Element-wise src0 / src1: the rcpps estimate of 1/src1 is refined with
	// one Newton-Raphson step (the identity above), then multiplied by src0.
	// lpA is the aligned-src1 path, lpNA the unaligned-src1 path; both store
	// with movaps, so dst is assumed aligned -- presumably guaranteed by
	// KFLOATINITDSS (defined elsewhere); verify there.
	__asm
	{
		KFLOATINITDSS( dst, src0, src1, count, pre, post )
		and		eax,15
		jne		lpNA
		jmp		lpA
		align	16
lpA:
		movaps	xmm2,[esi+ebx]
		movaps	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2				// xmm4 = refined 1 / src1
		subps	xmm5,xmm3				// xmm5 = refined 1 / src1
		mulps	xmm4,[edx+ebx]
		mulps	xmm5,[edx+ebx+16]
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpA
		jmp		done
		align	16
lpNA:
		movups	xmm2,[esi+ebx]
		movups	xmm3,[esi+ebx+16]
		rcpps	xmm4,xmm2
		rcpps	xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps	xmm2,xmm4
		mulps	xmm2,xmm4
		mulps	xmm3,xmm5
		mulps	xmm3,xmm5
		addps	xmm4,xmm4
		addps	xmm5,xmm5
		subps	xmm4,xmm2				// xmm4 = refined 1 / src1
		subps	xmm5,xmm3				// xmm5 = refined 1 / src1
		movups	xmm2,[edx+ebx]
		movups	xmm3,[edx+ebx+16]
		mulps	xmm4,xmm2
		mulps	xmm5,xmm3
		movaps	[edi+ebx],xmm4
		movaps	[edi+ebx+16],xmm5
		add		ebx,16*2
		jl		lpNA
done:
		// scalar handling of the pre/post leftover elements
		mov		edx,src0
		mov		esi,src1
		mov		edi,dst
		KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
					KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
	}
}
2307 /*
2308 ============
2309 Simd_MulAdd
2310 
2311  assumes count >= 7
2312 ============
2313 */
// Computes dst[i] += constant * src[i] (count >= 7, see header comment).
// Mixed x87/SSE implementation: the x87 loops (fld/fmul/fadd/fstp) process
// the unaligned head and the leftover tail while 'constant' sits on the x87
// stack; the aligned middle section is processed 16 bytes at a time with SSE
// after broadcasting 'constant' into xmm1. The 8-byte-aligned case uses
// movlps/movhps pairs instead of movaps.
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
	__asm	mov			esi, dst
	__asm	mov			edi, src
	__asm	mov			eax, count
	__asm	shl			eax, 2
	__asm	mov			ecx, esi
	__asm	mov			edx, eax
	__asm	or			ecx, edi
	__asm	fld			constant
	__asm	and			ecx, 15
	__asm	jz			SimdMulAdd16
	__asm	and			ecx, 3
	__asm	jnz			SimdMulAdd8
	__asm	mov			ecx, esi
	__asm	xor			ecx, edi
	__asm	and			ecx, 15
	__asm	jnz			MulAdd8
	// dst and src share alignment: x87-process up to the next 16-byte boundary
	__asm	mov			ecx, esi
	__asm	and			ecx, 15
	__asm	neg			ecx
	__asm	add			ecx, 16
	__asm	sub			eax, ecx
	__asm	add			edi, ecx
	__asm	add			esi, ecx
	__asm	neg			ecx
	__asm	mov			edx, eax
	__asm loopPreMulAdd16:
	__asm	fld			st
	__asm	fmul		dword ptr [edi+ecx]
	__asm	fadd		dword ptr [esi+ecx]
	__asm	fstp		dword ptr [esi+ecx]
	__asm	add			ecx, 4
	__asm	jl			loopPreMulAdd16
	// 16-byte aligned SSE loop
	__asm SimdMulAdd16:
	__asm	and			eax, ~15
	__asm	movss		xmm1, constant
	__asm	shufps		xmm1, xmm1, 0x00
	__asm	add			esi, eax
	__asm	add			edi, eax
	__asm	neg			eax
	__asm	align		16
	__asm loopMulAdd16:
	__asm	movaps		xmm0, [edi+eax]
	__asm	mulps		xmm0, xmm1
	__asm	addps		xmm0, [esi+eax]
	__asm	movaps		[esi+eax], xmm0
	__asm	add			eax, 16
	__asm	jl			loopMulAdd16
	__asm	jmp			postMulAdd
	// dst and src only share 8-byte alignment: x87-process to the 8-byte boundary
	__asm MulAdd8:
	__asm	mov			ecx, esi
	__asm	and			ecx, 7
	__asm	jz			SimdMulAdd8
	__asm	sub			eax, ecx
	__asm	add			esi, ecx
	__asm	add			edi, ecx
	__asm	neg			ecx
	__asm	mov			edx, eax
	__asm loopPreMulAdd8:
	__asm	fld			st
	__asm	fmul		dword ptr [edi+ecx]
	__asm	fadd		dword ptr [esi+ecx]
	__asm	fstp		dword ptr [esi+ecx]
	__asm	add			ecx, 4
	__asm	jl			loopPreMulAdd8
	// 8-byte aligned SSE loop (movlps/movhps instead of movaps)
	__asm SimdMulAdd8:
	__asm	and			eax, ~15
	__asm	movss		xmm1, constant
	__asm	shufps		xmm1, xmm1, 0x00
	__asm	add			esi, eax
	__asm	add			edi, eax
	__asm	neg			eax
	__asm	align		16
	__asm loopMulAdd8:
	__asm	movlps		xmm0, [edi+eax]
	__asm	movhps		xmm0, [edi+eax+8]
	__asm	mulps		xmm0, xmm1
	__asm	movlps		xmm2, [esi+eax]
	__asm	movhps		xmm2, [esi+eax+8]
	__asm	addps		xmm0, xmm2
	__asm	movlps		[esi+eax], xmm0
	__asm	movhps		[esi+eax+8], xmm0
	__asm	add			eax, 16
	__asm	jl			loopMulAdd8
	__asm	jmp			postMulAdd
	// x87-process the remaining 0-3 elements
	__asm postMulAdd:
	__asm	and			edx, 15
	__asm	jz			MulAddDone
	__asm	add			esi, edx
	__asm	add			edi, edx
	__asm	neg			edx
	__asm loopPostMulAdd:
	__asm	fld			st
	__asm	fmul		dword ptr [edi+edx]
	__asm	fadd		dword ptr [esi+edx]
	__asm	fstp		dword ptr [esi+edx]
	__asm	add			edx, 4
	__asm	jl			loopPostMulAdd
	// pop 'constant' off the x87 stack
	__asm MulAddDone:
	__asm	fstp		st
}
2415 
// MULADD_FEW( OPER ) -- fully unrolled handling for counts 0..11.
// Expands to a switch that applies `dst[i] OPER c * src[i]` element by element
// and *returns from the enclosing function* for any count <= 11; larger counts
// fall through to the SSE path. The expansion site must have `dst`, `src`,
// `count` and a local float `c` in scope. OPER is a compound assignment
// operator (+= or -=).
#define MULADD_FEW( OPER )																				\
switch( count ) {																						\
	case 0:																								\
		return;																							\
	case 1:																								\
		dst[0] OPER c * src[0];																			\
		return;																							\
	case 2:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1];													\
		return;																							\
	case 3:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2];							\
		return;																							\
	case 4:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		return;																							\
	case 5:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4];																			\
		return;																							\
	case 6:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5];													\
		return;																							\
	case 7:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6];							\
		return;																							\
	case 8:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7];	\
		return;																							\
	case 9:																								\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7];	\
		dst[8] OPER c * src[8];																			\
		return;																							\
	case 10:																							\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7];	\
		dst[8] OPER c * src[8]; dst[9] OPER c * src[9];													\
		return;																							\
	case 11:																							\
		dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3];	\
		dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7];	\
		dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10];						\
		return;																							\
}
2464 
2465 /*
2466 ============
2467 idSIMD_SSE::MulAdd
2468 
2469   dst[i] += constant * src[i];
2470 ============
2471 */
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] += constant * src[i]
	float c = constant;		// 'c' is referenced by the MULADD_FEW expansion
	MULADD_FEW( += )		// handles (and returns for) counts 0..11 inline
	Simd_MulAdd( dst, constant, src, count );	// SSE path; assumes count >= 7
}
2477 
2478 /*
2479 ============
2480 idSIMD_SSE::MulAdd
2481 
2482   dst[i] += src0[i] * src1[i];
2483 ============
2484 */
MulAdd(float * dst,const float * src0,const float * src1,const int count)2485 void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
2486 	for ( int i = 0; i < count; i++ ) {
2487 		dst[i] += src0[i] + src1[i];
2488 	}
2489 }
2490 
2491 /*
2492 ============
2493 idSIMD_SSE::MulSub
2494 
2495   dst[i] -= constant * src[i];
2496 ============
2497 */
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] -= constant * src[i]
	float c = constant;		// 'c' is referenced by the MULADD_FEW expansion
	MULADD_FEW( -= )		// handles (and returns for) counts 0..11 inline
	Simd_MulAdd( dst, -constant, src, count );	// reuse the mul-add kernel with the constant negated; assumes count >= 7
}
2503 
2504 /*
2505 ============
2506 idSIMD_SSE::MulSub
2507 
2508   dst[i] -= src0[i] * src1[i];
2509 ============
2510 */
MulSub(float * dst,const float * src0,const float * src1,const int count)2511 void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
2512 	for ( int i = 0; i < count; i++ ) {
2513 		dst[i] -= src0[i] + src1[i];
2514 	}
2515 }
2516 
2517 /*
2518 ============
2519 idSIMD_SSE::Dot
2520 
2521   dst[i] = constant * src[i];
2522 ============
2523 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant * src[i]  (3D dot product per element)
	// xmm4/xmm5/xmm6 hold constant.x/y/z broadcast to all four lanes.
	// loop4 handles four idVec3 (48 bytes) per iteration: 12 floats are loaded
	// with movlps/movhps and shuffled into separate x, y and z lane registers.
	// loop1 handles the remaining 0-3 vectors one at a time.
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3

		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4
		imul		eax, 12
		add			esi, eax
		neg			eax

	loop4:
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
		add			ecx, 16
		add			eax, 4*12
		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3
		jz			done1

	loop1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
2591 
2592 /*
2593 ============
2594 idSIMD_SSE::Dot
2595 
2596   dst[i] = constant * src[i].Normal() + src[i][3];
2597 ============
2598 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	// dst[i] = constant * src[i].Normal() + src[i][3]
	// xmm5/xmm6/xmm7 hold constant.x/y/z broadcast to all four lanes.
	// loopVert4 handles four idPlane (64 bytes) per iteration, shuffling the
	// 16 loaded floats into x, y, z and d lane registers; the d lane (xmm3)
	// is added to the dot product. loopVert1 handles the remaining 0-3 planes.
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3

		movss		xmm5, [edi+0]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+8]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, 16
		add			esi, eax
		neg			eax

	loopVert4:

		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm3, [esi+eax+ 8]
		movhps		xmm1, [esi+eax+16]
		movhps		xmm3, [esi+eax+24]
		movlps		xmm2, [esi+eax+32]
		movlps		xmm4, [esi+eax+40]
		movhps		xmm2, [esi+eax+48]
		movhps		xmm4, [esi+eax+56]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
		movaps		xmm2, xmm3
		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )

		add			ecx, 16
		add			eax, 4*16

		mulps		xmm0, xmm5
		mulps		xmm1, xmm6
		mulps		xmm2, xmm7
		addps		xmm0, xmm3				// add the plane distances (d components)
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm5
		mulss		xmm1, xmm6
		mulss		xmm2, xmm7
		addss		xmm0, [esi+eax+12]		// add the plane distance
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 16
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
2674 
2675 /*
2676 ============
2677 idSIMD_SSE::Dot
2678 
2679   dst[i] = constant * src[i].xyz;
2680 ============
2681 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant * src[i].xyz  (3D dot product per vertex)
	// The xyz triplets are gathered from the strided idDrawVert array at
	// DRAWVERT_XYZ_OFFSET. xmm4/xmm5/xmm6 hold constant.x/y/z broadcast to
	// all four lanes; loopVert4 transposes four vertices' positions into
	// x/y/z lane registers, loopVert1 handles the remaining 0-3 vertices.

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// float indices of the four gathered positions:
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3

		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
2766 
2767 /*
2768 ============
2769 idSIMD_SSE::Dot
2770 
2771   dst[i] = constant.Normal() * src[i] + constant[3];
2772 ============
2773 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant.Normal() * src[i] + constant[3]
	// xmm4/xmm5/xmm6 hold constant.x/y/z and xmm7 the plane distance
	// constant[3], each broadcast to all four lanes. loop4 processes four
	// idVec3 (48 bytes) per iteration; loop1 the remaining 0-3 vectors.
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3

		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4
		imul		eax, 12
		add			esi, eax
		neg			eax

	loop4:
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )

		add			ecx, 16
		add			eax, 4*12

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7				// add the broadcast plane distance
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3
		jz			done1

	loop1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7				// add the plane distance
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
2848 
2849 /*
2850 ============
2851 idSIMD_SSE::Dot
2852 
2853   dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
2854 ============
2855 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {

// SINGLE_OP: compute one 4D dot product constant . src and advance both
// pointers. Expects xmm4 = (x, y, x, y) and xmm5 = (z, w, z, w) of 'constant'.
#define SINGLE_OP(SRC, DEST)							\
	__asm	movlps		xmm0,[SRC]						\
	__asm	movlps		xmm1,[SRC+8]					\
	__asm	mulps		xmm0,xmm4						\
	__asm	mulps		xmm1,xmm5						\
	__asm	addps		xmm0,xmm1						\
	__asm	movaps		xmm1,xmm0						\
	__asm	shufps		xmm1,xmm1,SHUFFLEPS(1,1,1,1)	\
	__asm	addss		xmm0,xmm1						\
	__asm	movss		[DEST],xmm0						\
	__asm	add			SRC,16							\
	__asm	add			DEST,4

// DUAL_OP: compute two 4D dot products at once (same register setup as
// SINGLE_OP) and advance both pointers accordingly.
#define DUAL_OP(SRC, DEST)								\
	__asm	movlps		xmm0,[SRC]						\
	__asm	movlps		xmm1,[SRC+8]					\
	__asm	movhps		xmm0,[SRC+16]					\
	__asm	movhps		xmm1,[SRC+24]					\
	__asm	mulps		xmm0,xmm4						\
	__asm	mulps		xmm1,xmm5						\
	__asm	addps		xmm0,xmm1						\
	__asm	shufps		xmm1,xmm0,SHUFFLEPS(2,0,1,0)	\
	__asm	shufps		xmm0,xmm0,SHUFFLEPS(3,1,2,0)	\
	__asm	addps		xmm0,xmm1						\
	__asm	movhps		[DEST],xmm0						\
	__asm	add			SRC,32							\
	__asm	add			DEST,8

	// dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3]
	// First align dst to 16 bytes with SINGLE_OPs, then run a software-
	// pipelined 4-wide loop (_lp/_lpStart), then finish the 0-3 leftover
	// elements with DUAL_OP/SINGLE_OP.
	__asm {
		mov			edx, dst
		mov			eax, src
		mov			ebx, constant
		mov			ecx, count

		movlps		xmm4, [ebx]
		shufps		xmm4, xmm4, SHUFFLEPS(1,0,1,0)		// xmm4 = x, y, x, y of 'constant'
		movlps		xmm5, [ebx+8]
		shufps		xmm5, xmm5, SHUFFLEPS(1,0,1,0)		// xmm5 = z, w, z, w of 'constant'

		xorps		xmm0, xmm0
		xorps		xmm1, xmm1

	_lpAlignDest:
		test		edx, 0x0f
		jz			_destAligned
		SINGLE_OP(eax,edx)
		dec			ecx
		jnz			_lpAlignDest
		jmp			_vpExit

	_destAligned:
		push		ecx

		cmp			ecx, 4
		jl			_post

		and			ecx, ~3
		shl			ecx, 2
		lea			eax, [eax+ecx*4]
		add			edx, ecx
		neg			ecx

		// prime the pipeline with the first four planes
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		jmp			_lpStart

		align	16
	_lp:
		prefetchnta	[eax+ecx*4+128]
		addps		xmm1, xmm0
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		movaps		[edx+ecx-16],xmm1
	_lpStart:
		movlps		xmm1, [eax+ecx*4+8]
		movhps		xmm1, [eax+ecx*4+24]
		movlps		xmm3, [eax+ecx*4+40]
		movhps		xmm3, [eax+ecx*4+56]
		add			ecx, 16
		mulps		xmm1, xmm5
		mulps		xmm2, xmm4
		mulps		xmm3, xmm5
		addps		xmm2, xmm3						// y3+w3 x3+z3 y2+w2 x2+z2
		mulps		xmm0, xmm4
		addps		xmm0, xmm1						// y1+w1 x1+z1 y0+w0 x0+z0
		movaps		xmm1, xmm0
		shufps		xmm0, xmm2, SHUFFLEPS(2,0,2,0)	// x3+z3 x2+z2 x1+z1 x0+z0
		shufps		xmm1, xmm2, SHUFFLEPS(3,1,3,1)	// y3+w3 y2+w2 y1+w1 y0+w0
		js			_lp
		// drain the pipeline: combine and store the final group of four
		addps		xmm1, xmm0
		movaps		[edx+ecx-16], xmm1
	_post:
		pop			ecx
		and			ecx, 0x3
		cmp			ecx, 2
		jl			_post1
		DUAL_OP(eax,edx)
		sub			ecx, 2
	_post1:
		cmp			ecx, 1
		jne			_vpExit
		SINGLE_OP(eax,edx)
	_vpExit:
	}

#undef DUAL_OP
#undef SINGLE_OP

}
2971 
2972 /*
2973 ============
2974 idSIMD_SSE::Dot
2975 
2976   dst[i] = constant.Normal() * src[i].xyz + constant[3];
2977 ============
2978 */
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].xyz + constant[3]
	// Same xyz gather/transpose as Dot( dst, idVec3, idDrawVert ) above, with
	// the plane distance constant[3] broadcast in xmm7 and added to each
	// result. loopVert4 handles four vertices per iteration, loopVert1 the
	// remaining 0-3.

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// float indices of the four gathered positions:
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3

		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7				// add the broadcast plane distance
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7				// add the plane distance
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
3067 
3068 /*
3069 ============
3070 idSIMD_SSE::Dot
3071 
3072   dst[i] = src0[i] * src1[i];
3073 ============
3074 */
Dot(float * dst,const idVec3 * src0,const idVec3 * src1,const int count)3075 void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
	// Computes dst[i] = src0[i] * src1[i] (vec3 dot product) for 'count' pairs.
	// Main loop handles 4 vec3 pairs (48 bytes per stream) per iteration; eax is a
	// negative byte counter walking up to 0, edx keeps 'count' for the tail.
3076 	__asm
3077 	{
3078 		mov			eax, count
3079 		mov			edi, src0
3080 		mov			edx, eax
3081 		mov			esi, src1
3082 		mov			ecx, dst
3083 		and			eax, ~3
3084 
3085 		jz			done4
3086 		imul		eax, 12
3087 		add			edi, eax
3088 		add			esi, eax
3089 		neg			eax
3090 
	// Load 12 floats (4 vec3) from each stream as three xmm registers holding
	// interleaved pairs; lane numbers in the comments index the 12 packed floats.
3091 	loop4:
3092 		movlps		xmm0, [esi+eax]						// 0, 1, X, X
3093 		movlps		xmm3, [edi+eax]						// 0, 1, X, X
3094 		movlps		xmm1, [esi+eax+8]					// 2, 3, X, X
3095 		movlps		xmm4, [edi+eax+8]					// 2, 3, X, X
3096 		movhps		xmm0, [esi+eax+24]					// 0, 1, 6, 7
3097 		movhps		xmm3, [edi+eax+24]					// 0, 1, 6, 7
3098 		movhps		xmm1, [esi+eax+32]					// 2, 3, 8, 9
3099 		movhps		xmm4, [edi+eax+32]					// 2, 3, 8, 9
3100 		movlps		xmm2, [esi+eax+16]					// 4, 5, X, X
3101 		movlps		xmm5, [edi+eax+16]					// 4, 5, X, X
3102 		movhps		xmm2, [esi+eax+40]					// 4, 5, 10, 11
3103 		movhps		xmm5, [edi+eax+40]					// 4, 5, 10, 11
3104 
3105 		add			ecx, 16
3106 		add			eax, 48
3107 
		// multiply component-wise, then shuffle the products so each register
		// holds one component (x, y, z) of all four dot products, and sum
3108 		mulps		xmm0, xmm3
3109 		mulps		xmm1, xmm4
3110 		mulps		xmm2, xmm5
3111 		movaps		xmm7, xmm0
3112 		shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )	// 0, 6, 3, 9
3113 		shufps		xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 )	// 1, 7, 4, 10
3114 		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )	// 2, 8, 5, 11
3115 		addps		xmm7, xmm0
3116 		addps		xmm7, xmm1
		// results are in order (0, 2, 1, 3) — swap the middle lanes back
3117 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
3118 
3119 		movlps		[ecx-16+0], xmm7
3120 		movhps		[ecx-16+8], xmm7
3121 		jl			loop4
3122 
	// Scalar tail: remaining (count & 3) vec3 dot products.
3123 	done4:
3124 		and			edx, 3
3125 		jz			done1
3126 
3127 	loop1:
3128 		movss		xmm0, [esi+eax+0]
3129 		movss		xmm3, [edi+eax+0]
3130 		movss		xmm1, [esi+eax+4]
3131 		movss		xmm4, [edi+eax+4]
3132 		movss		xmm2, [esi+eax+8]
3133 		movss		xmm5, [edi+eax+8]
3134 		mulss		xmm0, xmm3
3135 		mulss		xmm1, xmm4
3136 		mulss		xmm2, xmm5
3137 		add			ecx, 4
3138 		addss		xmm0, xmm1
3139 		add			eax, 12
3140 		addss		xmm0, xmm2
3141 		dec			edx
3142 		movss		[ecx-4], xmm0
3143 		jnz			loop1
3144 
3145 	done1:
3146 	}
3147 }
3148 
3149 /*
3150 ============
3151 idSIMD_SSE::Dot
3152 
3153   dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
3154 ============
3155 */
Dot(float & dot,const float * src1,const float * src2,const int count)3156 void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
	// Computes the scalar dot product of two float arrays.
	// Counts 0..3 are handled with straight C++; larger counts run a 4-wide SIMD
	// loop (aligned or unaligned variant, chosen by OR-ing both pointers and
	// testing the low 4 bits), then a tail for (count & 3), then a horizontal add.
	//
	// NOTE(review): this relies on ecx/edx and xmm0 staying live across the
	// separate __asm blocks below (MSVC keeps inline-asm register state within a
	// function, but this is compiler-specific) — confirm when porting/upgrading.
3157 	switch( count ) {
3158 		case 0:
3159 			dot = 0.0f;
3160 			return;
3161 		case 1:
3162 			dot = src1[0] * src2[0];
3163 			return;
3164 		case 2:
3165 			dot = src1[0] * src2[0] + src1[1] * src2[1];
3166 			return;
3167 		case 3:
3168 			dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
3169 			return;
3170 		default:
3171 			__asm {
3172 				mov			ecx, src1
3173 				mov			edx, src2
3174 				mov			eax, ecx
3175 				or			eax, edx
3176 				and			eax, 15
3177 				jz			alignedDot
3178 				// unaligned
3179 				mov			eax, count
3180 				shr			eax, 2
3181 				shl			eax, 4
3182 				add			ecx, eax
3183 				add			edx, eax
3184 				neg			eax
				// first iteration initializes the accumulator (xmm0) directly
3185 				movups		xmm0, [ecx+eax]
3186 				movups		xmm1, [edx+eax]
3187 				mulps		xmm0, xmm1
3188 				add			eax, 16
3189 				jz			doneDot
3190 			loopUnalignedDot:
3191 				movups		xmm1, [ecx+eax]
3192 				movups		xmm2, [edx+eax]
3193 				mulps		xmm1, xmm2
3194 				addps		xmm0, xmm1
3195 				add			eax, 16
3196 				jl			loopUnalignedDot
3197 				jmp			doneDot
3198 				// aligned
3199 			alignedDot:
3200 				mov			eax, count
3201 				shr			eax, 2
3202 				shl			eax, 4
3203 				add			ecx, eax
3204 				add			edx, eax
3205 				neg			eax
3206 				movaps		xmm0, [ecx+eax]
3207 				movaps		xmm1, [edx+eax]
3208 				mulps		xmm0, xmm1
3209 				add			eax, 16
3210 				jz			doneDot
3211 			loopAlignedDot:
3212 				movaps		xmm1, [ecx+eax]
3213 				movaps		xmm2, [edx+eax]
3214 				mulps		xmm1, xmm2
3215 				addps		xmm0, xmm1
3216 				add			eax, 16
3217 				jl			loopAlignedDot
3218 			doneDot:
3219 			}
			// tail: ecx/edx already point at the first leftover element
3220 			switch( count & 3 ) {
3221 				case 1:
3222 					__asm {
3223 						movss	xmm1, [ecx]
3224 						movss	xmm2, [edx]
3225 						mulss	xmm1, xmm2
3226 						addss	xmm0, xmm1
3227 					}
3228 					break;
3229 				case 2:
3230 					__asm {
						// zero the upper lanes so they add nothing to the sum
3231 						xorps	xmm2, xmm2
3232 						movlps	xmm1, [ecx]
3233 						movlps	xmm2, [edx]
3234 						mulps	xmm1, xmm2
3235 						addps	xmm0, xmm1
3236 					}
3237 					break;
3238 				case 3:
3239 					__asm {
						// element 0 in the low lane, elements 1..2 in the high lanes;
						// lane 1 multiplies xmm1's stale lane by xmm2's (zeroed-by-movss)
						// lane, contributing 0
3240 						movss	xmm1, [ecx]
3241 						movhps	xmm1, [ecx+4]
3242 						movss	xmm2, [edx]
3243 						movhps	xmm2, [edx+4]
3244 						mulps	xmm1, xmm2
3245 						addps	xmm0, xmm1
3246 					}
3247 					break;
3248 			}
			// horizontal add of the four partial sums in xmm0, store to 'dot'
3249 			__asm {
3250 				movhlps		xmm1, xmm0
3251 				addps		xmm0, xmm1
3252 				movaps		xmm1, xmm0
3253 				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
3254 				addss		xmm0, xmm1
3255 				mov			eax, dot
3256 				movss		[eax], xmm0
3257 			}
3258 			return;
3259 	}
3260 }
3261 
3262 //
3263 //	cmpeqps		==		Equal
3264 //	cmpneqps	!=		Not Equal
3265 //	cmpltps		<		Less Than
3266 //  cmpnltps	>=		Not Less Than
3267 //	cmpnleps	>		Not Less Or Equal
3268 //
// DOFLIP hooks for the COMPARE*CONSTANT macros below.  SSE only provides the
// "not-less(-or-equal)" predicates, so <= is implemented as NOT(>): FLIP inverts
// the movmskps result bits sitting in al; NOFLIP leaves the mask untouched.
#define FLIP	not al
#define NOFLIP
3271 
/*
============
COMPARECONSTANT

  Emits the body of dst[i] = (byte)( src0[i] CMP constant ) for 'count' floats.

  Four floats per iteration: CMPSIMD compares against the splatted constant,
  movmskps packs the 4 sign bits into the low nibble of eax, DOFLIP optionally
  inverts them (see FLIP above).  The mov/shr/shl/and sequence then spreads the
  4 mask bits into one 0/1 byte each, written as a single dword store to dst.
  Floats that fall before the first 16-byte-aligned address (pre) or after the
  last full group of 4 (post) are handled by the scalar C++ loops at the end.

  No comments can appear inside the macro itself: a // would swallow the line
  continuation backslash.  Because it expands asm labels (loopNA/loopA) and the
  C label doneCmp, the macro may be used at most once per function.
============
*/
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )				\
	int i, cnt, pre, post;																\
	float *aligned;																		\
																						\
	/* if the float array is not aligned on a 4 byte boundary */						\
	if ( ((int) SRC0) & 3 ) {															\
		/* unaligned memory access */													\
		pre = 0;																		\
		cnt = COUNT >> 2;																\
		post = COUNT - (cnt<<2);														\
		__asm	mov			edx, cnt													\
		__asm	test		edx, edx													\
		__asm	je			doneCmp														\
		__asm	push		ebx															\
		__asm	neg			edx															\
		__asm	mov			esi, SRC0													\
		__asm	prefetchnta	[esi+64]													\
		__asm	movss		xmm1, CONSTANT												\
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )						\
		__asm	mov			edi, DST													\
		__asm	mov			ecx, 0x01010101												\
		__asm loopNA:																	\
		__asm	movups		xmm0, [esi]													\
		__asm	prefetchnta	[esi+128]													\
		__asm	CMPSIMD		xmm0, xmm1													\
		__asm	movmskps	eax, xmm0													\
		__asm	DOFLIP																	\
		__asm	mov			ah, al														\
		__asm	shr			ah, 1														\
		__asm	mov			bx, ax														\
		__asm	shl			ebx, 14														\
		__asm	mov			bx, ax														\
		__asm	and			ebx, ecx													\
		__asm	mov			dword ptr [edi], ebx										\
		__asm	add			esi, 16														\
		__asm	add			edi, 4														\
		__asm	inc			edx															\
		__asm	jl			loopNA														\
		__asm	pop			ebx															\
	}																					\
	else {																				\
		/* aligned memory access */														\
		aligned = (float *) ((((int) SRC0) + 15) & ~15);								\
		if ( (int)aligned > ((int)src0) + COUNT ) {										\
			pre = COUNT;																\
			post = 0;																	\
		}																				\
		else {																			\
			pre = aligned - SRC0;														\
			cnt = (COUNT - pre) >> 2;													\
			post = COUNT - pre - (cnt<<2);												\
			__asm	mov			edx, cnt												\
			__asm	test		edx, edx												\
			__asm	je			doneCmp													\
			__asm	push		ebx														\
			__asm	neg			edx														\
			__asm	mov			esi, aligned											\
			__asm	prefetchnta	[esi+64]												\
			__asm	movss		xmm1, CONSTANT											\
			__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )					\
			__asm	mov			edi, DST												\
			__asm	add			edi, pre												\
			__asm	mov			ecx, 0x01010101											\
			__asm loopA:																\
			__asm	movaps		xmm0, [esi]												\
			__asm	prefetchnta	[esi+128]												\
			__asm	CMPSIMD		xmm0, xmm1												\
			__asm	movmskps	eax, xmm0												\
			__asm	DOFLIP																\
			__asm	mov			ah, al													\
			__asm	shr			ah, 1													\
			__asm	mov			bx, ax													\
			__asm	shl			ebx, 14													\
			__asm	mov			bx, ax													\
			__asm	and			ebx, ecx												\
			__asm	mov			dword ptr [edi], ebx									\
			__asm	add			esi, 16													\
			__asm	add			edi, 4													\
			__asm	inc			edx														\
			__asm	jl			loopA													\
			__asm	pop			ebx														\
		}																				\
	}																					\
	doneCmp:																			\
	double c = constant;																\
	for ( i = 0; i < pre; i++ ) {														\
		dst[i] = src0[i] CMP c;															\
	}																					\
	for ( i = count - post; i < count; i++ ) {											\
		dst[i] = src0[i] CMP c;															\
	}
3363 
/*
============
COMPAREBITCONSTANT

  Same structure as COMPARECONSTANT above, but instead of writing 0/1 bytes it
  ORs the compare result into bit BITNUM of each dst byte: after spreading the
  movmskps nibble into one bit per byte, the whole dword is shifted left by
  bitNum (kept in cl) and OR-ed into the existing dst contents.

  Like COMPARECONSTANT, no comments may appear inside the macro (the // would
  swallow the continuation backslash), and the expanded labels mean it can be
  used at most once per function.
============
*/
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )	\
	int i, cnt, pre, post;																\
	float *aligned;																		\
																						\
	/* if the float array is not aligned on a 4 byte boundary */						\
	if ( ((int) SRC0) & 3 ) {															\
		/* unaligned memory access */													\
		pre = 0;																		\
		cnt = COUNT >> 2;																\
		post = COUNT - (cnt<<2);														\
		__asm	mov			edx, cnt													\
		__asm	test		edx, edx													\
		__asm	je			doneCmp														\
		__asm	push		ebx															\
		__asm	neg			edx															\
		__asm	mov			esi, SRC0													\
		__asm	prefetchnta	[esi+64]													\
		__asm	movss		xmm1, CONSTANT												\
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )						\
		__asm	mov			edi, DST													\
		__asm	mov			cl, bitNum													\
		__asm loopNA:																	\
		__asm	movups		xmm0, [esi]													\
		__asm	prefetchnta	[esi+128]													\
		__asm	CMPSIMD		xmm0, xmm1													\
		__asm	movmskps	eax, xmm0													\
		__asm	DOFLIP																	\
		__asm	mov			ah, al														\
		__asm	shr			ah, 1														\
		__asm	mov			bx, ax														\
		__asm	shl			ebx, 14														\
		__asm	mov			bx, ax														\
		__asm	and			ebx, 0x01010101												\
		__asm	shl			ebx, cl														\
		__asm	or			ebx, dword ptr [edi]										\
		__asm	mov			dword ptr [edi], ebx										\
		__asm	add			esi, 16														\
		__asm	add			edi, 4														\
		__asm	inc			edx															\
		__asm	jl			loopNA														\
		__asm	pop			ebx															\
	}																					\
	else {																				\
		/* aligned memory access */														\
		aligned = (float *) ((((int) SRC0) + 15) & ~15);								\
		if ( (int)aligned > ((int)src0) + COUNT ) {										\
			pre = COUNT;																\
			post = 0;																	\
		}																				\
		else {																			\
			pre = aligned - SRC0;														\
			cnt = (COUNT - pre) >> 2;													\
			post = COUNT - pre - (cnt<<2);												\
			__asm	mov			edx, cnt												\
			__asm	test		edx, edx												\
			__asm	je			doneCmp													\
			__asm	push		ebx														\
			__asm	neg			edx														\
			__asm	mov			esi, aligned											\
			__asm	prefetchnta	[esi+64]												\
			__asm	movss		xmm1, CONSTANT											\
			__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )					\
			__asm	mov			edi, DST												\
			__asm	add			edi, pre												\
			__asm	mov			cl, bitNum												\
			__asm loopA:																\
			__asm	movaps		xmm0, [esi]												\
			__asm	prefetchnta	[esi+128]												\
			__asm	CMPSIMD		xmm0, xmm1												\
			__asm	movmskps	eax, xmm0												\
			__asm	DOFLIP																\
			__asm	mov			ah, al													\
			__asm	shr			ah, 1													\
			__asm	mov			bx, ax													\
			__asm	shl			ebx, 14													\
			__asm	mov			bx, ax													\
			__asm	and			ebx, 0x01010101											\
			__asm	shl			ebx, cl													\
			__asm	or			ebx, dword ptr [edi]									\
			__asm	mov			dword ptr [edi], ebx									\
			__asm	add			esi, 16													\
			__asm	add			edi, 4													\
			__asm	inc			edx														\
			__asm	jl			loopA													\
			__asm	pop			ebx														\
		}																				\
	}																					\
	doneCmp:																			\
	float c = constant;																	\
	for ( i = 0; i < pre; i++ ) {														\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;											\
	}																					\
	for ( i = count - post; i < count; i++ ) {											\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;											\
	}
3459 
3460 /*
3461 ============
3462 idSIMD_SSE::CmpGT
3463 
3464   dst[i] = src0[i] > constant;
3465 ============
3466 */
CmpGT(byte * dst,const float * src0,const float constant,const int count)3467 void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
	// '>' maps directly onto cmpnleps (not-less-or-equal), no mask flip needed
3468 	COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
3469 }
3470 
3471 /*
3472 ============
3473 idSIMD_SSE::CmpGT
3474 
3475   dst[i] |= ( src0[i] > constant ) << bitNum;
3476 ============
3477 */
CmpGT(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)3478 void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// OR the '>' result into bit 'bitNum' of each dst byte ('>' == cmpnleps)
3479 	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
3480 }
3481 
3482 /*
3483 ============
3484 idSIMD_SSE::CmpGE
3485 
3486   dst[i] = src0[i] >= constant;
3487 ============
3488 */
CmpGE(byte * dst,const float * src0,const float constant,const int count)3489 void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
	// '>=' maps directly onto cmpnltps (not-less-than), no mask flip needed
3490 	COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
3491 }
3492 
3493 /*
3494 ============
3495 idSIMD_SSE::CmpGE
3496 
3497   dst[i] |= ( src0[i] >= constant ) << bitNum;
3498 ============
3499 */
CmpGE(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)3500 void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// OR the '>=' result into bit 'bitNum' of each dst byte ('>=' == cmpnltps)
3501 	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
3502 }
3503 
3504 /*
3505 ============
3506 idSIMD_SSE::CmpLT
3507 
3508   dst[i] = src0[i] < constant;
3509 ============
3510 */
CmpLT(byte * dst,const float * src0,const float constant,const int count)3511 void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
	// '<' maps directly onto cmpltps, no mask flip needed
3512 	COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
3513 }
3514 
3515 /*
3516 ============
3517 idSIMD_SSE::CmpLT
3518 
3519   dst[i] |= ( src0[i] < constant ) << bitNum;
3520 ============
3521 */
CmpLT(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)3522 void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// OR the '<' result into bit 'bitNum' of each dst byte ('<' == cmpltps)
3523 	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
3524 }
3525 
3526 /*
3527 ============
3528 idSIMD_SSE::CmpLE
3529 
3530   dst[i] = src0[i] <= constant;
3531 ============
3532 */
CmpLE(byte * dst,const float * src0,const float constant,const int count)3533 void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
	// no cmpleps variant is used here: '<=' is computed as NOT('>'), i.e.
	// cmpnleps with the result mask inverted by FLIP
3534 	COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
3535 }
3536 
3537 /*
3538 ============
3539 idSIMD_SSE::CmpLE
3540 
3541   dst[i] |= ( src0[i] <= constant ) << bitNum;
3542 ============
3543 */
CmpLE(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)3544 void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// '<=' computed as NOT('>'): cmpnleps with the mask inverted by FLIP,
	// OR-ed into bit 'bitNum' of each dst byte
3545 	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
3546 }
3547 
3548 /*
3549 ============
3550 idSIMD_SSE::MinMax
3551 ============
3552 */
MinMax(float & min,float & max,const float * src,const int count)3553 void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
3554 	int i, pre, post;
3555 
	// Seed with +/-INFINITY so any element replaces the initial values.
3556 	min = idMath::INFINITY; max = -idMath::INFINITY;
3557 
3558 	__asm
3559 	{
3560 		push		ebx
		// xmm0 = running min (all lanes), xmm1 = running max (all lanes)
3561 		mov			eax, min
3562 		mov			ebx, max
3563 		movss		xmm0, [eax]
3564 		movss		xmm1, [ebx]
3565 		shufps		xmm0, xmm0, 0
3566 		shufps		xmm1, xmm1, 0
3567 
		// assumes KFLOATINITS sets edx = aligned src base, ebx = negative byte
		// counter, eax = alignment info, and pre/post to the scalar leftovers;
		// it presumably jumps to 'done' for tiny counts — macro not visible
		// here, confirm against Simd_SSE.h
3568 		KFLOATINITS( src, count, pre, post )
3569 		and			eax, 15
3570 		jz			lpA
3571 		jmp			lpNA
3572 		align		16
	// unaligned loop: 8 floats (two 16-byte groups) per iteration
3573 lpNA:
3574 		movups		xmm2, [edx+ebx]
3575 		movups		xmm3, [edx+ebx+16]
3576 		minps		xmm0, xmm2
3577 		maxps		xmm1, xmm2
3578 		prefetchnta	[edx+ebx+64]
3579 		minps		xmm0, xmm3
3580 		maxps		xmm1, xmm3
3581 		add			ebx, 16*2
3582 		jl			lpNA
3583 		jmp			done2
	// aligned loop: identical except movaps loads
3584 lpA:
3585 		movaps		xmm2, [edx+ebx]
3586 		movaps		xmm3, [edx+ebx+16]
3587 		minps		xmm0, xmm2
3588 		maxps		xmm1, xmm2
3589 		prefetchnta	[edx+ebx+64]
3590 		minps		xmm0, xmm3
3591 		maxps		xmm1, xmm3
3592 		add			ebx, 16*2
3593 		jl			lpA
3594 		jmp			done2
3595 		align		16
	// horizontal reduce: rotate each accumulator three times, folding each
	// lane into lane 0 with minss/maxss
done2:
3597 		movaps		xmm2, xmm0
3598 		movaps		xmm3, xmm1
3599 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3600 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3601 		minss		xmm0, xmm2
3602 		maxss		xmm1, xmm3
3603 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3604 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3605 		minss		xmm0, xmm2
3606 		maxss		xmm1, xmm3
3607 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
3608 		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
3609 		minss		xmm0, xmm2
3610 		maxss		xmm1, xmm3
3611 		mov			eax, min
3612 		mov			ebx, max
3613 		movss		[eax], xmm0
3614 		movss		[ebx], xmm1
done:
3616 		pop			ebx
3617 	}
3618 
	// scalar cleanup for elements before the aligned region and after the
	// last full SIMD group
3619 	for ( i = 0; i < pre; i++ ) {
3620 		float tmp = src[i];
3621 		if ( tmp > max ) {
3622 			max = tmp;
3623 		}
3624 		if ( tmp < min ) {
3625 			min = tmp;
3626 		}
3627 	}
3628 	for ( i = count - post; i < count; i++ ) {
3629 		float tmp = src[i];
3630 		if ( tmp > max ) {
3631 			max = tmp;
3632 		}
3633 		if ( tmp < min ) {
3634 			min = tmp;
3635 		}
3636 	}
3637 }
3638 
3639 /*
3640 ============
3641 idSIMD_SSE::MinMax
3642 ============
3643 */
MinMax(idVec2 & min,idVec2 & max,const idVec2 * src,const int count)3644 void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
	// Computes the 2D bounds of 'count' idVec2s.
	// xmm0 accumulates the min (seeded +INF in all lanes), xmm1 the max (-INF).
	// An odd first element is folded in on its own so the main loop can always
	// consume two idVec2 (16 bytes) per iteration; the final reduce folds the
	// high vec2 lanes onto the low ones before storing.
3645 	__asm {
3646 		mov			eax, count
3647 		test		eax, eax
3648 		movss		xmm0, idMath::INFINITY
3649 		xorps		xmm1, xmm1
3650 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3651 		subps		xmm1, xmm0
3652 		jz			done
3653 		mov			ecx, eax
3654 		and			ecx, 1
3655 		mov			esi, src
3656 		jz			startLoop
		// odd count: handle the first element alone (duplicated into both halves)
3657 		movlps		xmm2, [esi]
3658 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
3659 		dec			eax
3660 		add			esi, 2*4
3661 		minps		xmm0, xmm2
3662 		maxps		xmm1, xmm2
3663 	startLoop:
3664 		imul		eax, 2*4
3665 		add			esi, eax
3666 		neg			eax
		// bugfix: for count == 1 the remaining pair count is zero, but the
		// bottom-tested loop below would still run once and read 16 bytes past
		// the end of src; neg sets ZF when eax is 0, so bail out to the reduce
		jz			done
3667 	loopVert:
3668 		movlps		xmm2, [esi+eax]
3669 		movhps		xmm2, [esi+eax+8]
3670 		add			eax, 4*4
3671 		minps		xmm0, xmm2
3672 		maxps		xmm1, xmm2
3673 		jl			loopVert
	// fold the upper (x,y) pair onto the lower one and store both results
3674 	done:
3675 		movaps		xmm2, xmm0
3676 		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
3677 		minps		xmm0, xmm2
3678 		mov			esi, min
3679 		movlps		[esi], xmm0
3680 		movaps		xmm3, xmm1
3681 		shufps		xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
3682 		maxps		xmm1, xmm3
3683 		mov			edi, max
3684 		movlps		[edi], xmm1
3685 	}
3686 }
3687 
3688 /*
3689 ============
3690 idSIMD_SSE::MinMax
3691 ============
3692 */
MinMax(idVec3 & min,idVec3 & max,const idVec3 * src,const int count)3693 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
	// Computes the 3D bounds of 'count' idVec3s (12 bytes each).
	// Two accumulator pairs with different lane layouts are used so alternating
	// loads need no shuffles: xmm0/xmm1 accumulate as (z, ?, x, y), xmm2/xmm3
	// as (x, ?, y, z); the shuffle at done1 brings xmm2/xmm3 into the first
	// layout before the final merge.  Unused lanes hold +/-INF and are harmless.
3694 	__asm {
3695 
3696 		movss		xmm0, idMath::INFINITY
3697 		xorps		xmm1, xmm1
3698 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3699 		subps		xmm1, xmm0
3700 		movaps		xmm2, xmm0
3701 		movaps		xmm3, xmm1
3702 
3703 		mov			esi, src
3704 		mov			eax, count
3705 		and			eax, ~3
3706 		jz			done4
3707 		imul		eax, 12
3708 		add			esi, eax
3709 		neg			eax
3710 
	// main loop: 4 vectors per iteration, alternating between the two layouts
3711 	loop4:
3712 //		prefetchnta	[esi+4*12]
3713 
3714 		movss		xmm4, [esi+eax+0*12+8]
3715 		movhps		xmm4, [esi+eax+0*12+0]
3716 		minps		xmm0, xmm4
3717 		maxps		xmm1, xmm4
3718 
3719 		movss		xmm5, [esi+eax+1*12+0]
3720 		movhps		xmm5, [esi+eax+1*12+4]
3721 		minps		xmm2, xmm5
3722 		maxps		xmm3, xmm5
3723 
3724 		movss		xmm6, [esi+eax+2*12+8]
3725 		movhps		xmm6, [esi+eax+2*12+0]
3726 		minps		xmm0, xmm6
3727 		maxps		xmm1, xmm6
3728 
3729 		movss		xmm7, [esi+eax+3*12+0]
3730 		movhps		xmm7, [esi+eax+3*12+4]
3731 		minps		xmm2, xmm7
3732 		maxps		xmm3, xmm7
3733 
3734 		add			eax, 4*12
3735 		jl			loop4
3736 
	// leftover vectors, one per iteration (first layout only)
3737 	done4:
3738 		mov			eax, count
3739 		and			eax, 3
3740 		jz			done1
3741 		imul		eax, 12
3742 		add			esi, eax
3743 		neg			eax
3744 
3745 	loop1:
3746 		movss		xmm4, [esi+eax+0*12+8]
3747 		movhps		xmm4, [esi+eax+0*12+0]
3748 		minps		xmm0, xmm4
3749 		maxps		xmm1, xmm4
3750 
3751 		add			eax, 12
3752 		jl			loop1
3753 
	// convert the second accumulator pair to (z, ?, x, y), merge, and store:
	// movhps writes x,y; movss writes z
3754 	done1:
3755 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3756 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3757 		minps		xmm0, xmm2
3758 		maxps		xmm1, xmm3
3759 		mov			esi, min
3760 		movhps		[esi], xmm0
3761 		movss		[esi+8], xmm0
3762 		mov			edi, max
3763 		movhps		[edi], xmm1
3764 		movss		[edi+8], xmm1
3765 	}
3766 }
3767 
3768 /*
3769 ============
3770 idSIMD_SSE::MinMax
3771 ============
3772 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int count)3773 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3774 
3775 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
3776 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
3777 
	// Bounds of the xyz members of 'count' idDrawVerts; same dual-layout
	// accumulator scheme as MinMax(idVec3 ...) above, with the vec3 stride
	// replaced by DRAWVERT_SIZE and the xyz offset applied to each load.
	// xmm0/xmm1 accumulate (z, ?, x, y), xmm2/xmm3 accumulate (x, ?, y, z).
3778 	__asm {
3779 
3780 		movss		xmm0, idMath::INFINITY
3781 		xorps		xmm1, xmm1
3782 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3783 		subps		xmm1, xmm0
3784 		movaps		xmm2, xmm0
3785 		movaps		xmm3, xmm1
3786 
3787 		mov			esi, src
3788 		mov			eax, count
3789 		and			eax, ~3
3790 		jz			done4
3791 		imul		eax, DRAWVERT_SIZE
3792 		add			esi, eax
3793 		neg			eax
3794 
	// main loop: 4 verts per iteration, alternating accumulator layouts
3795 	loop4:
3796 //		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
3797 
3798 		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3799 		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3800 		minps		xmm0, xmm4
3801 		maxps		xmm1, xmm4
3802 
3803 		movss		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3804 		movhps		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
3805 		minps		xmm2, xmm5
3806 		maxps		xmm3, xmm5
3807 
3808 		movss		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3809 		movhps		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3810 		minps		xmm0, xmm6
3811 		maxps		xmm1, xmm6
3812 
3813 		movss		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3814 		movhps		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
3815 		minps		xmm2, xmm7
3816 		maxps		xmm3, xmm7
3817 
3818 		add			eax, 4*DRAWVERT_SIZE
3819 		jl			loop4
3820 
	// leftover verts, one per iteration (first layout only)
3821 	done4:
3822 		mov			eax, count
3823 		and			eax, 3
3824 		jz			done1
3825 		imul		eax, DRAWVERT_SIZE
3826 		add			esi, eax
3827 		neg			eax
3828 
3829 	loop1:
3830 		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
3831 		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
3832 		minps		xmm0, xmm4
3833 		maxps		xmm1, xmm4
3834 
3835 		add			eax, DRAWVERT_SIZE
3836 		jl			loop1
3837 
	// align layouts, merge, store: movhps writes x,y; movss writes z
3838 	done1:
3839 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3840 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3841 		minps		xmm0, xmm2
3842 		maxps		xmm1, xmm3
3843 		mov			esi, min
3844 		movhps		[esi], xmm0
3845 		movss		[esi+8], xmm0
3846 		mov			edi, max
3847 		movhps		[edi], xmm1
3848 		movss		[edi+8], xmm1
3849 	}
3850 }
3851 
3852 /*
3853 ============
3854 idSIMD_SSE::MinMax
3855 ============
3856 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int * indexes,const int count)3857 void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3858 
3859 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
3860 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
3861 
	// Indexed variant of the idDrawVert bounds routine above: eax walks the
	// index array (4 bytes per entry, negative counter), each index is scaled
	// by DRAWVERT_SIZE into edx to address the vert.  Same dual-layout
	// accumulators: xmm0/xmm1 hold (z, ?, x, y), xmm2/xmm3 hold (x, ?, y, z).
3862 	__asm {
3863 
3864 		movss		xmm0, idMath::INFINITY
3865 		xorps		xmm1, xmm1
3866 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
3867 		subps		xmm1, xmm0
3868 		movaps		xmm2, xmm0
3869 		movaps		xmm3, xmm1
3870 
3871 		mov			edi, indexes
3872 		mov			esi, src
3873 		mov			eax, count
3874 		and			eax, ~3
3875 		jz			done4
3876 		shl			eax, 2
3877 		add			edi, eax
3878 		neg			eax
3879 
	// main loop: 4 indexed verts per iteration
3880 	loop4:
3881 //		prefetchnta	[edi+128]
3882 //		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
3883 
3884 		mov			edx, [edi+eax+0]
3885 		imul		edx, DRAWVERT_SIZE
3886 		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3887 		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3888 		minps		xmm0, xmm4
3889 		maxps		xmm1, xmm4
3890 
3891 		mov			edx, [edi+eax+4]
3892 		imul		edx, DRAWVERT_SIZE
3893 		movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3894 		movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
3895 		minps		xmm2, xmm5
3896 		maxps		xmm3, xmm5
3897 
3898 		mov			edx, [edi+eax+8]
3899 		imul		edx, DRAWVERT_SIZE
3900 		movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3901 		movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3902 		minps		xmm0, xmm6
3903 		maxps		xmm1, xmm6
3904 
3905 		mov			edx, [edi+eax+12]
3906 		imul		edx, DRAWVERT_SIZE
3907 		movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3908 		movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
3909 		minps		xmm2, xmm7
3910 		maxps		xmm3, xmm7
3911 
3912 		add			eax, 4*4
3913 		jl			loop4
3914 
	// leftover indexed verts, one per iteration (first layout only)
3915 	done4:
3916 		mov			eax, count
3917 		and			eax, 3
3918 		jz			done1
3919 		shl			eax, 2
3920 		add			edi, eax
3921 		neg			eax
3922 
3923 	loop1:
3924 		mov			edx, [edi+eax+0]
3925 		imul		edx, DRAWVERT_SIZE;
3926 		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
3927 		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
3928 		minps		xmm0, xmm4
3929 		maxps		xmm1, xmm4
3930 
3931 		add			eax, 4
3932 		jl			loop1
3933 
	// align layouts, merge, store: movhps writes x,y; movss writes z
3934 	done1:
3935 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
3936 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
3937 		minps		xmm0, xmm2
3938 		maxps		xmm1, xmm3
3939 		mov			esi, min
3940 		movhps		[esi], xmm0
3941 		movss		[esi+8], xmm0
3942 		mov			edi, max
3943 		movhps		[edi], xmm1
3944 		movss		[edi+8], xmm1
3945 	}
3946 }
3947 
3948 /*
3949 ============
3950 idSIMD_SSE::Clamp
3951 ============
3952 */
Clamp(float * dst,const float * src,const float min,const float max,const int count)3953 void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
3954 	int	i, pre, post;
3955 
	// Clamps src[i] into [min, max] via maxps-then-minps, 8 floats per pass.
	// assumes KFLOATINITDS sets edi = dst base, edx = src base, ebx = negative
	// byte counter, eax = src alignment, and pre/post; both loops store with
	// movaps, so dst is presumably 16-byte aligned by the macro — confirm
	// against Simd_SSE.h.
3956 	__asm
3957 	{
		// xmm0 = splatted min, xmm1 = splatted max
3958 		movss	xmm0,min
3959 		movss	xmm1,max
3960 		shufps	xmm0,xmm0,0
3961 		shufps	xmm1,xmm1,0
3962 
3963 		KFLOATINITDS( dst, src, count, pre, post )
3964 		and		eax,15
3965 		jne		lpNA
3966 		jmp		lpA
3967 		align	16
	// aligned source loop
lpA:
3969 		movaps	xmm2,[edx+ebx]
3970 		movaps	xmm3,[edx+ebx+16]
3971 		maxps	xmm2,xmm0
3972 		maxps	xmm3,xmm0
3973 		prefetchnta	[edx+ebx+64]
3974 		minps	xmm2,xmm1
3975 		minps	xmm3,xmm1
3976 		movaps	[edi+ebx],xmm2
3977 		movaps	[edi+ebx+16],xmm3
3978 		add		ebx,16*2
3979 		jl		lpA
3980 		jmp		done
3981 
3982 		align	16
	// unaligned source loop (stores still aligned)
lpNA:
3984 		movups	xmm2,[edx+ebx]
3985 		movups	xmm3,[edx+ebx+16]
3986 		maxps	xmm2,xmm0
3987 		maxps	xmm3,xmm0
3988 		prefetchnta	[edx+ebx+64]
3989 		minps	xmm2,xmm1
3990 		minps	xmm3,xmm1
3991 		movaps	[edi+ebx],xmm2
3992 		movaps	[edi+ebx+16],xmm3
3993 		add		ebx,16*2
3994 		jl		lpNA
done:
3996 	}
3997 
	// scalar cleanup for the pre/post leftovers
3998 	for ( i = 0; i < pre; i++ ) {
3999 		if ( src[i] < min )
4000 			dst[i] = min;
4001 		else if ( src[i] > max )
4002 			dst[i] = max;
4003 		else
4004 			dst[i] = src[i];
4005 	}
4006 
4007 	for( i = count - post; i < count; i++ ) {
4008 		if ( src[i] < min )
4009 			dst[i] = min;
4010 		else if ( src[i] > max )
4011 			dst[i] = max;
4012 		else
4013 			dst[i] = src[i];
4014 	}
4015 }
4016 
4017 /*
4018 ============
4019 idSIMD_SSE::ClampMin
4020 ============
4021 */
ClampMin(float * dst,const float * src,const float min,const int count)4022 void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
4023 	int	i, pre, post;
4024 
	// Lower clamp only: dst[i] = max( src[i], min ), 8 floats per pass.
	// Same KFLOATINITDS register/alignment assumptions as Clamp above.
4025 	__asm
4026 	{
4027 		movss	xmm0,min
4028 		shufps	xmm0,xmm0,0
4029 
4030 		KFLOATINITDS( dst, src, count, pre, post )
4031 		and		eax,15
4032 		jne		lpNA
4033 		jmp		lpA
4034 		align	16
	// aligned source loop
lpA:
4036 		movaps	xmm2,[edx+ebx]
4037 		movaps	xmm3,[edx+ebx+16]
4038 		maxps	xmm2,xmm0
4039 		prefetchnta	[edx+ebx+64]
4040 		maxps	xmm3,xmm0
4041 		movaps	[edi+ebx],xmm2
4042 		movaps	[edi+ebx+16],xmm3
4043 		add		ebx,16*2
4044 		jl		lpA
4045 		jmp		done
4046 
4047 		align	16
	// unaligned source loop
lpNA:
4049 		movups	xmm2,[edx+ebx]
4050 		movups	xmm3,[edx+ebx+16]
4051 		maxps	xmm2,xmm0
4052 		prefetchnta	[edx+ebx+64]
4053 		maxps	xmm3,xmm0
4054 		movaps	[edi+ebx],xmm2
4055 		movaps	[edi+ebx+16],xmm3
4056 		add		ebx,16*2
4057 		jl		lpNA
done:
4059 	}
4060 
	// scalar cleanup for the pre/post leftovers
4061 	for( i = 0; i < pre; i++ ) {
4062 		if ( src[i] < min )
4063 			dst[i] = min;
4064 		else
4065 			dst[i] = src[i];
4066 	}
4067 	for( i = count - post; i < count; i++ ) {
4068 		if ( src[i] < min )
4069 			dst[i] = min;
4070 		else
4071 			dst[i] = src[i];
4072 	}
4073 }
4074 
4075 /*
4076 ============
4077 idSIMD_SSE::ClampMax
4078 ============
4079 */
ClampMax(float * dst,const float * src,const float max,const int count)4080 void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
4081 	int	i, pre, post;
4082 
	// Upper clamp only: dst[i] = min( src[i], max ), 8 floats per pass.
	// Same KFLOATINITDS register/alignment assumptions as Clamp above.
4083 	__asm
4084 	{
4085 		movss	xmm1,max
4086 		shufps	xmm1,xmm1,0
4087 
4088 		KFLOATINITDS( dst, src, count, pre, post )
4089 		and		eax,15
4090 		jne		lpNA
4091 		jmp		lpA
4092 		align	16
	// aligned source loop
lpA:
4094 		movaps	xmm2,[edx+ebx]
4095 		movaps	xmm3,[edx+ebx+16]
4096 		minps	xmm2,xmm1
4097 		prefetchnta	[edx+ebx+64]
4098 		minps	xmm3,xmm1
4099 		movaps	[edi+ebx],xmm2
4100 		movaps	[edi+ebx+16],xmm3
4101 		add		ebx,16*2
4102 		jl		lpA
4103 		jmp		done
4104 
4105 		align	16
	// unaligned source loop
lpNA:
4107 		movups	xmm2,[edx+ebx]
4108 		movups	xmm3,[edx+ebx+16]
4109 		minps	xmm2,xmm1
4110 		prefetchnta	[edx+ebx+64]
4111 		minps	xmm3,xmm1
4112 		movaps	[edi+ebx],xmm2
4113 		movaps	[edi+ebx+16],xmm3
4114 		add		ebx,16*2
4115 		jl		lpNA
done:
4117 	}
4118 
	// scalar cleanup for the pre/post leftovers
4119 	for( i = 0; i < pre; i++ ) {
4120 		if ( src[i] > max )
4121 			dst[i] = max;
4122 		else
4123 			dst[i] = src[i];
4124 	}
4125 
4126 	for( i = count - post; i < count; i++ ) {
4127 		if ( src[i] > max )
4128 			dst[i] = max;
4129 		else
4130 			dst[i] = src[i];
4131 	}
4132 }
4133 
4134 /*
4135 ============
4136 idSIMD_SSE::Zero16
4137 ============
4138 */
void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
	// Zeroes 'count' floats, rounded up to a multiple of 4; the movaps store
	// requires dst to be 16-byte aligned (the "16" naming convention).
	__asm {
		mov		edx, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneZero16		// nothing to do for count == 0
		shl		eax, 4			// eax = total bytes to clear
		add		edx, eax		// edx = one past the end of the buffer
		neg		eax				// eax = negative byte index counting up to zero
		xorps	xmm0, xmm0
	loopZero16:
		movaps	[edx+eax], xmm0
		add		eax, 16
		jl		loopZero16
	doneZero16:
	}
}
4157 
4158 /*
4159 ============
4160 idSIMD_SSE::Negate16
4161 ============
4162 */
void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
	// Negates 'count' floats in place, rounded up to a multiple of 4;
	// dst must be 16-byte aligned (movaps loads/stores).
	__asm {
		mov		edx, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneNegate16
		shl		eax, 4			// eax = total bytes to process
		add		edx, eax		// edx = one past the end of the buffer
		neg		eax				// eax = negative byte index counting up to zero
		// SIMD_SP_signBitMask is a constant defined elsewhere; presumably four
		// 0x80000000 lanes so xorps flips each float's sign bit -- TODO confirm
		movss	xmm0, SIMD_SP_signBitMask
		shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
	loopNegate16:
		movaps	xmm1, [edx+eax]
		xorps	xmm1, xmm0
		movaps	[edx+eax], xmm1
		add		eax, 16
		jl		loopNegate16
	doneNegate16:
	}
}
4184 
4185 /*
4186 ============
4187 idSIMD_SSE::Copy16
4188 ============
4189 */
void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
	// Copies 'count' floats, rounded up to a multiple of 4; both src and dst
	// must be 16-byte aligned (movaps loads/stores).
	__asm {
		mov		ecx, src
		mov		edx, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneCopy16
		shl		eax, 4			// eax = total bytes to copy
		add		ecx, eax		// advance both pointers one past the end and
		add		edx, eax		// walk them with a negative byte index
		neg		eax
	loopCopy16:
		movaps	xmm0, [ecx+eax]
		movaps	[edx+eax], xmm0
		add		eax, 16
		jl		loopCopy16
	doneCopy16:
	}
}
4210 
4211 /*
4212 ============
4213 idSIMD_SSE::Add16
4214 ============
4215 */
void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
	// dst[i] = src1[i] + src2[i] for 'count' floats rounded up to a multiple
	// of 4; all three buffers must be 16-byte aligned (movaps/addps on memory).
	__asm {
		mov		ecx, src1
		mov		edx, src2
		mov		esi, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneAdd16
		shl		eax, 4			// eax = total bytes to process
		add		esi, eax		// advance all pointers one past the end and
		add		ecx, eax		// walk them with a negative byte index
		add		edx, eax
		neg		eax
	loopAdd16:
		movaps	xmm0, [ecx+eax]
		addps	xmm0, [edx+eax]
		movaps	[esi+eax], xmm0
		add		eax, 16
		jl		loopAdd16
	doneAdd16:
	}
}
4239 
4240 /*
4241 ============
4242 idSIMD_SSE::Sub16
4243 ============
4244 */
void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
	// dst[i] = src1[i] - src2[i] for 'count' floats rounded up to a multiple
	// of 4; all three buffers must be 16-byte aligned (movaps/subps on memory).
	__asm {
		mov		ecx, src1
		mov		edx, src2
		mov		esi, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneSub16
		shl		eax, 4			// eax = total bytes to process
		add		esi, eax		// advance all pointers one past the end and
		add		ecx, eax		// walk them with a negative byte index
		add		edx, eax
		neg		eax
	loopSub16:
		movaps	xmm0, [ecx+eax]
		subps	xmm0, [edx+eax]
		movaps	[esi+eax], xmm0
		add		eax, 16
		jl		loopSub16
	doneSub16:
	}
}
4268 
4269 /*
4270 ============
4271 idSIMD_SSE::Mul16
4272 ============
4273 */
void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
	// dst[i] = src1[i] * constant for 'count' floats rounded up to a multiple
	// of 4; src1 and dst must be 16-byte aligned (movaps loads/stores).
	__asm {
		mov		ecx, dst
		mov		edx, src1
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneMulScalar16
		movss	xmm1, constant
		shl		eax, 4			// eax = total bytes to process
		add		ecx, eax		// advance both pointers one past the end and
		add		edx, eax		// walk them with a negative byte index
		neg		eax
		shufps	xmm1, xmm1, 0x00	// broadcast 'constant' to all four lanes
	loopMulScalar16:
		movaps	xmm0, [edx+eax]
		mulps	xmm0, xmm1
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopMulScalar16
	doneMulScalar16:
	}
}
4297 
4298 /*
4299 ============
4300 idSIMD_SSE::AddAssign16
4301 ============
4302 */
void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
	// dst[i] += src[i] for 'count' floats rounded up to a multiple of 4;
	// both buffers must be 16-byte aligned (movaps/addps on memory).
	__asm {
		mov		ecx, dst
		mov		edx, src
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneAddAssign16
		shl		eax, 4			// eax = total bytes to process
		add		ecx, eax		// advance both pointers one past the end and
		add		edx, eax		// walk them with a negative byte index
		neg		eax
	loopAddAssign16:
		movaps	xmm0, [ecx+eax]
		addps	xmm0, [edx+eax]
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopAddAssign16
	doneAddAssign16:
	}
}
4324 
4325 /*
4326 ============
4327 idSIMD_SSE::SubAssign16
4328 ============
4329 */
void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
	// dst[i] -= src[i] for 'count' floats rounded up to a multiple of 4;
	// both buffers must be 16-byte aligned (movaps/subps on memory).
	__asm {
		mov		ecx, dst
		mov		edx, src
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneSubAssign16
		shl		eax, 4			// eax = total bytes to process
		add		ecx, eax		// advance both pointers one past the end and
		add		edx, eax		// walk them with a negative byte index
		neg		eax
	loopSubAssign16:
		movaps	xmm0, [ecx+eax]
		subps	xmm0, [edx+eax]
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopSubAssign16
	doneSubAssign16:
	}
}
4351 
4352 /*
4353 ============
4354 idSIMD_SSE::MulAssign16
4355 ============
4356 */
void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
	// dst[i] *= constant for 'count' floats rounded up to a multiple of 4;
	// dst must be 16-byte aligned (movaps loads/stores).
	__asm {
		mov		ecx, dst
		mov		eax, count
		add		eax, 3			// round count up to a whole number of 4-float groups
		shr		eax, 2
		jz		doneMulAssign16
		movss	xmm1, constant
		shl		eax, 4			// eax = total bytes to process
		add		ecx, eax		// advance the pointer one past the end and
		neg		eax				// walk it with a negative byte index
		shufps	xmm1, xmm1, 0x00	// broadcast 'constant' to all four lanes
	loopMulAssign16:
		movaps	xmm0, [ecx+eax]
		mulps	xmm0, xmm1
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopMulAssign16
	doneMulAssign16:
	}
}
4378 
4379 /*
4380 ============
4381 idSIMD_SSE::MatX_MultiplyVecX
4382 
4383 	optimizes the following matrix multiplications:
4384 
4385 	NxN * Nx1
4386 	Nx6 * 6x1
4387 	6xN * Nx1
4388 
4389 	with N in the range [1-6]
4390 ============
4391 */
void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
// The STORE* macros abstract how a 1/2/4-float result is written to dst
// (eax) so the asm bodies can be shared with the MultiplyAdd/MultiplySub
// variants of this routine.  Here they are plain stores; 'reg2' is an
// unused scratch register in this variant.
#define STORE1( offset, reg1, reg2 )		\
	__asm movss		[eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 )		\
	__asm movlps	[eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 )		\
	__asm movhps	[eax+offset], reg1
#define STORE4( offset, reg1, reg2 )		\
	__asm movlps	[eax+offset], reg1		\
	__asm movhps	[eax+offset+8], reg1
// STOREC is the store operator used by the scalar fallback paths
// (plain assignment here, '+='/'-=' in the Add/Sub variants).
#define STOREC		=

	int numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	// Each asm block below uses the same register convention:
	// esi = vector, edi = matrix, eax = destination.
	switch( mat.GetNumColumns() ) {
		case 1: {
			switch( numRows ) {
				case 1: {		// 1x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						STORE1( 0, xmm0, xmm1 )
					}
					return;
				}
				case 6: {		// 6x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm1, xmm0
						mulps		xmm0, [edi]
						mulps		xmm1, [edi+16]
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm2 )
					}
					return;
				}
				default: {
					// generic Nx1 * 1x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		}
		case 2: {
			switch( numRows ) {
				case 2: {		// 2x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						movss		xmm1, [esi+4]
						movss		xmm2, [edi]
						mulss		xmm2, xmm0
						movss		xmm3, [edi+4]
						mulss		xmm3, xmm1
						addss		xmm2, xmm3
						STORE1( 0, xmm2, xmm4 )
						mulss		xmm0, [edi+8]
						mulss		xmm1, [edi+8+4]
						addss		xmm0, xmm1
						STORE1( 4, xmm0, xmm4 )
					}
					return;
				}
				case 6: {		// 6x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm7, [esi]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movaps		xmm0, [edi]
						mulps		xmm0, xmm7
						movaps		xmm1, [edi+16]
						mulps		xmm1, xmm7
						movaps		xmm2, xmm0
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						movaps		xmm3, [edi+32]
						addps		xmm0, xmm2
						mulps		xmm3, xmm7
						STORE4( 0, xmm0, xmm4 )
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm1, xmm3
						addps		xmm3, xmm1
						STORE2LO( 16, xmm3, xmm4 )
					}
					return;
				}
				default: {
					// generic Nx2 * 2x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
						mPtr += 2;
					}
					return;
				}
			}
			break;
		}
		case 3: {
			switch( numRows ) {
				case 3: {		// 3x3 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						movss		xmm4, [edi]
						mulss		xmm4, xmm0
						movss		xmm1, [esi+4]
						movss		xmm5, [edi+4]
						mulss		xmm5, xmm1
						addss		xmm4, xmm5
						movss		xmm2, [esi+8]
						movss		xmm6, [edi+8]
						mulss		xmm6, xmm2
						addss		xmm4, xmm6
						movss		xmm3, [edi+12]
						mulss		xmm3, xmm0
						STORE1( 0, xmm4, xmm7 );
						movss		xmm5, [edi+12+4]
						mulss		xmm5, xmm1
						addss		xmm3, xmm5
						movss		xmm6, [edi+12+8]
						mulss		xmm6, xmm2
						addss		xmm3, xmm6
						mulss		xmm0, [edi+24]
						mulss		xmm1, [edi+24+4]
						STORE1( 4, xmm3, xmm7 );
						addss		xmm0, xmm1
						mulss		xmm2, [edi+24+8]
						addss		xmm0, xmm2
						STORE1( 8, xmm0, xmm7 );
					}
					return;
				}
				case 6: {		// 6x3 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm5, [esi]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm6, [esi+4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
						movlps		xmm1, [edi+4*4]
						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
						movlps		xmm2, [edi+6*4]
						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
						mulps		xmm0, xmm5
						movlps		xmm3, [edi+10*4]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
						movaps		xmm3, xmm1
						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
						mulps		xmm1, xmm6
						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
						mulps		xmm3, xmm7
						addps		xmm0, xmm1
						addps		xmm0, xmm3
						STORE4( 0, xmm0, xmm4 )
						movss		xmm1, [edi+12*4]
						mulss		xmm1, xmm5
						movss		xmm2, [edi+13*4]
						mulss		xmm2, xmm6
						movss		xmm3, [edi+14*4]
						mulss		xmm3, xmm7
						addss		xmm1, xmm2
						addss		xmm1, xmm3
						STORE1( 16, xmm1, xmm4 )
						mulss		xmm5, [edi+15*4]
						mulss		xmm6, [edi+16*4]
						mulss		xmm7, [edi+17*4]
						addss		xmm5, xmm6
						addss		xmm5, xmm7
						STORE1( 20, xmm5, xmm4 )
					}
					return;
				}
				default: {
					// generic Nx3 * 3x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
						mPtr += 3;
					}
					return;
				}
			}
			break;
		}
		case 4: {
			switch( numRows ) {
				case 4: {		// 4x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, qword ptr [esi ]
						movlps		xmm0, qword ptr [edi ]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						STORE4( 0, xmm0, xmm2 )
					}
					return;
				}
				case 6: {		// 6x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, qword ptr [esi+ 0]
						movlps		xmm0, qword ptr [edi+ 0]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						movlps		xmm1, qword ptr [edi+64]
						movhps		xmm1, qword ptr [edi+80]
						STORE4( 0, xmm0, xmm4 )
						mulps		xmm1, xmm6
						movlps		xmm2, qword ptr [edi+72]
						movhps		xmm2, qword ptr [edi+88]
						mulps		xmm2, xmm7
						addps		xmm1, xmm2
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm3, xmm1
						addps		xmm1, xmm3
						STORE2LO( 16, xmm1, xmm4 )
					}
					return;
				}
				default: {
					// generic Nx4 * 4x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
						mPtr += 4;
					}
					return;
				}
			}
			break;
		}
		case 5: {
			switch( numRows ) {
				case 5: {		// 5x5 * 5x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
						movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
						movhps		xmm5, [edi+14*4]						// xmm5 = 18, 19, 14, 15
						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
						movss		xmm7, [esi+0*4]
						shufps		xmm7, xmm7, 0
						mulps		xmm0, xmm7
						movss		xmm5, [esi+1*4]
						shufps		xmm5, xmm5, 0
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movss		xmm6, [esi+2*4]
						shufps		xmm6, xmm6, 0
						mulps		xmm2, xmm6
						addps		xmm0, xmm2
						movss		xmm1, [esi+3*4]
						shufps		xmm1, xmm1, 0
						mulps		xmm3, xmm1
						addps		xmm0, xmm3
						movss		xmm2, [esi+4*4]
						shufps		xmm2, xmm2, 0
						mulps		xmm4, xmm2
						addps		xmm0, xmm4
						mulss		xmm7, [edi+20*4]
						mulss		xmm5, [edi+21*4]
						addps		xmm7, xmm5
						mulss		xmm6, [edi+22*4]
						addps		xmm7, xmm6
						mulss		xmm1, [edi+23*4]
						addps		xmm7, xmm1
						mulss		xmm2, [edi+24*4]
						addps		xmm7, xmm2
						STORE4( 0, xmm0, xmm3 )
						STORE1( 16, xmm7, xmm4 )
					}
					return;
				}
				case 6: {		// 6x5 * 5x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, [esi]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movlps		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movlps		xmm0, [edi]
						movhps		xmm3, [edi+8]
						movaps		xmm1, [edi+16]
						movlps		xmm2, [edi+32]
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
						mulps		xmm0, xmm6
						mulps		xmm3, xmm7
						movlps		xmm2, [edi+40]
						addps		xmm0, xmm3								// xmm0 + xmm1
						movhps		xmm5, [edi+40+8]
						movlps		xmm3, [edi+40+16]
						movhps		xmm3, [edi+40+24]
						movlps		xmm4, [edi+40+32]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
						mulps		xmm2, xmm6
						mulps		xmm5, xmm7
						addps		xmm2, xmm5								// xmm2 + xmm3
						movss		xmm5, [esi+16]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm4, xmm0
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
						addps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						STORE4( 0, xmm0, xmm2 )
						movlps		xmm4, [edi+80]
						movhps		xmm3, [edi+80+8]
						movaps		xmm1, [edi+80+16]
						movlps		xmm2, [edi+80+32]
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
						mulps		xmm4, xmm6
						mulps		xmm3, xmm7
						mulps		xmm1, xmm5
						addps		xmm4, xmm3								// xmm4 + xmm1
						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
						addps		xmm4, xmm1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
						addps		xmm4, xmm1
						STORE2LO( 16, xmm4, xmm2 )
					}
					return;
				}
				default: {
					// generic Nx5 * 5x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
						mPtr += 5;
					}
					return;
				}
			}
			break;
		}
		case 6: {
			switch( numRows ) {
				case 1: {		// 1x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						movss		xmm1, [esi+4]
						mulss		xmm1, [edi+4]
						movss		xmm2, [esi+8]
						addss		xmm0, xmm1
						mulss		xmm2, [edi+8]
						movss		xmm3, [esi+12]
						addss		xmm0, xmm2
						mulss		xmm3, [edi+12]
						movss		xmm4, [esi+16]
						addss		xmm0, xmm3
						mulss		xmm4, [edi+16]
						movss		xmm5, [esi+20]
						addss		xmm0, xmm4
						mulss		xmm5, [edi+20]
						movss		xmm6, [esi+24]
						addss		xmm0, xmm5
						mulss		xmm6, [edi+24]
						addss		xmm0, xmm6
						STORE1( 0, xmm0, xmm7 )
					}
					return;
				}
				case 2: {		// 2x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
					}
					return;
				}
				case 3: {		// 3x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
						// row 2
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
						addss		xmm0, xmm1
						STORE1( 8, xmm0, xmm3 )
					}
					return;
				}
				case 4: {		// 4x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm4 )
					}
					return;
				}
				case 5: {		// 5x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm3 )
						// row 4 (the fifth and last row)
						movaps		xmm0, [edi+96]
						movaps		xmm1, [edi+96+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, 0x01
						addss		xmm0, xmm1
						STORE1( 16, xmm0, xmm3 )
					}
					return;
				}
				case 6: {		// 6x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm7, qword ptr [esi]
						movlps		xmm6, qword ptr [esi+8]
						shufps		xmm7, xmm7, 0x44
						shufps		xmm6, xmm6, 0x44
						movlps		xmm0, qword ptr [edi    ]
						movhps		xmm0, qword ptr [edi+ 24]
						mulps		xmm0, xmm7
						movlps		xmm3, qword ptr [edi+  8]
						movhps		xmm3, qword ptr [edi+ 32]
						mulps		xmm3, xmm6
						movlps		xmm1, qword ptr [edi+ 48]
						movhps		xmm1, qword ptr [edi+ 72]
						mulps		xmm1, xmm7
						movlps		xmm2, qword ptr [edi+ 96]
						movhps		xmm2, qword ptr [edi+120]
						mulps		xmm2, xmm7
						movlps		xmm4, qword ptr [edi+ 56]
						movhps		xmm4, qword ptr [edi+ 80]
						movlps		xmm5, qword ptr [edi+104]
						movhps		xmm5, qword ptr [edi+128]
						mulps		xmm4, xmm6
						movlps		xmm7, qword ptr [esi+16]
						addps		xmm0, xmm3
						shufps		xmm7, xmm7, 0x44
						mulps		xmm5, xmm6
						addps		xmm1, xmm4
						movlps		xmm3, qword ptr [edi+ 16]
						movhps		xmm3, qword ptr [edi+ 40]
						addps		xmm2, xmm5
						movlps		xmm4, qword ptr [edi+ 64]
						movhps		xmm4, qword ptr [edi+ 88]
						mulps		xmm3, xmm7
						movlps		xmm5, qword ptr [edi+112]
						movhps		xmm5, qword ptr [edi+136]
						addps		xmm0, xmm3
						mulps		xmm4, xmm7
						mulps		xmm5, xmm7
						addps		xmm1, xmm4
						addps		xmm2, xmm5
						movaps		xmm6, xmm0
						shufps		xmm0, xmm1, 0x88
						shufps		xmm6, xmm1, 0xDD
						movaps		xmm7, xmm2
						shufps		xmm7, xmm2, 0x88
						shufps		xmm2, xmm2, 0xDD
						addps		xmm0, xmm6
						addps		xmm2, xmm7
						STORE4( 0, xmm0, xmm3 )
						STORE2LO( 16, xmm2, xmm4 )
					}
					return;
				}
				default: {
					// generic Nx6 * 6x1 scalar path
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
						mPtr += 6;
					}
					return;
				}
			}
			break;
		}
		default: {
			// fully generic scalar fallback for any other matrix size
			int numColumns = mat.GetNumColumns();
			for ( int i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( int j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] STOREC sum;
				mPtr += numColumns;
			}
			break;
		}
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
5115 
5116 /*
5117 ============
5118 idSIMD_SSE::MatX_MultiplyAddVecX
5119 
5120 	optimizes the following matrix multiplications:
5121 
5122 	NxN * Nx1
5123 	Nx6 * 6x1
5124 	6xN * Nx1
5125 
5126 	with N in the range [1-6]
5127 ============
5128 */
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &dst_unused_doc, const idVecX &vec );
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
// Computes dst += mat * vec.
// The STORE* macros read the current destination value(s) at [eax+offset],
// add the freshly computed products in reg1 and write the sum back, using
// reg2 as scratch. STOREC is the matching C operator for the fallback paths,
// so the scalar code accumulates exactly like the assembly code does.
#define STORE1( offset, reg1, reg2 )		\
	__asm movss		reg2, [eax+offset]		\
	__asm addss		reg2, reg1				\
	__asm movss		[eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 )		\
	__asm movlps	reg2, [eax+offset]		\
	__asm addps		reg2, reg1				\
	__asm movlps	[eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 )		\
	__asm movhps	reg2, [eax+offset]		\
	__asm addps		reg2, reg1				\
	__asm movhps	[eax+offset], reg2
#define STORE4( offset, reg1, reg2 )		\
	__asm movlps	reg2, [eax+offset]		\
	__asm movhps	reg2, [eax+offset+8]	\
	__asm addps		reg2, reg1				\
	__asm movlps	[eax+offset], reg2		\
	__asm movhps	[eax+offset+8], reg2
#define STOREC		+=

	int numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	// Dispatch on the column count; each case below is a hand-scheduled SSE
	// kernel for the common row counts, with a scalar fallback in default.
	// Throughout the assembly blocks: esi = vPtr, edi = mPtr, eax = dstPtr.
	switch( mat.GetNumColumns() ) {
		case 1: {
			switch( numRows ) {
				case 1: {		// 1x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						STORE1( 0, xmm0, xmm1 )
					}
					return;
				}
				case 6: {		// 6x1 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm1, xmm0
						mulps		xmm0, [edi]
						mulps		xmm1, [edi+16]
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		}
		case 2: {
			switch( numRows ) {
				case 2: {		// 2x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						movss		xmm1, [esi+4]
						movss		xmm2, [edi]
						mulss		xmm2, xmm0
						movss		xmm3, [edi+4]
						mulss		xmm3, xmm1
						addss		xmm2, xmm3
						STORE1( 0, xmm2, xmm4 )
						mulss		xmm0, [edi+8]
						mulss		xmm1, [edi+8+4]
						addss		xmm0, xmm1
						STORE1( 4, xmm0, xmm4 )
					}
					return;
				}
				case 6: {		// 6x2 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm7, [esi]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movaps		xmm0, [edi]
						mulps		xmm0, xmm7
						movaps		xmm1, [edi+16]
						mulps		xmm1, xmm7
						movaps		xmm2, xmm0
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						movaps		xmm3, [edi+32]
						addps		xmm0, xmm2
						mulps		xmm3, xmm7
						STORE4( 0, xmm0, xmm4 )
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm1, xmm3
						addps		xmm3, xmm1
						STORE2LO( 16, xmm3, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
						mPtr += 2;
					}
					return;
				}
			}
			break;
		}
		case 3: {
			switch( numRows ) {
				case 3: {		// 3x3 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						movss		xmm4, [edi]
						mulss		xmm4, xmm0
						movss		xmm1, [esi+4]
						movss		xmm5, [edi+4]
						mulss		xmm5, xmm1
						addss		xmm4, xmm5
						movss		xmm2, [esi+8]
						movss		xmm6, [edi+8]
						mulss		xmm6, xmm2
						addss		xmm4, xmm6
						movss		xmm3, [edi+12]
						mulss		xmm3, xmm0
						STORE1( 0, xmm4, xmm7 );
						movss		xmm5, [edi+12+4]
						mulss		xmm5, xmm1
						addss		xmm3, xmm5
						movss		xmm6, [edi+12+8]
						mulss		xmm6, xmm2
						addss		xmm3, xmm6
						mulss		xmm0, [edi+24]
						mulss		xmm1, [edi+24+4]
						STORE1( 4, xmm3, xmm7 );
						addss		xmm0, xmm1
						mulss		xmm2, [edi+24+8]
						addss		xmm0, xmm2
						STORE1( 8, xmm0, xmm7 );
					}
					return;
				}
				case 6: {		// 6x3 * 3x1
					// Transposes the 6x3 matrix into column vectors on the fly
					// (comments give the matrix element index held in each lane).
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm5, [esi]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm6, [esi+4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						movss		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
						movlps		xmm1, [edi+4*4]
						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
						movlps		xmm2, [edi+6*4]
						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
						mulps		xmm0, xmm5
						movlps		xmm3, [edi+10*4]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
						movaps		xmm3, xmm1
						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
						mulps		xmm1, xmm6
						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
						mulps		xmm3, xmm7
						addps		xmm0, xmm1
						addps		xmm0, xmm3
						STORE4( 0, xmm0, xmm4 )
						movss		xmm1, [edi+12*4]
						mulss		xmm1, xmm5
						movss		xmm2, [edi+13*4]
						mulss		xmm2, xmm6
						movss		xmm3, [edi+14*4]
						mulss		xmm3, xmm7
						addss		xmm1, xmm2
						addss		xmm1, xmm3
						STORE1( 16, xmm1, xmm4 )
						mulss		xmm5, [edi+15*4]
						mulss		xmm6, [edi+16*4]
						mulss		xmm7, [edi+17*4]
						addss		xmm5, xmm6
						addss		xmm5, xmm7
						STORE1( 20, xmm5, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
						mPtr += 3;
					}
					return;
				}
			}
			break;
		}
		case 4: {
			switch( numRows ) {
				case 4: {		// 4x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, qword ptr [esi ]
						movlps		xmm0, qword ptr [edi ]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						STORE4( 0, xmm0, xmm2 )
					}
					return;
				}
				case 6: {		// 6x4 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, qword ptr [esi+ 0]
						movlps		xmm0, qword ptr [edi+ 0]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm0, qword ptr [edi+16]
						mulps		xmm0, xmm6
						movlps		xmm7, qword ptr [esi+ 8]
						movlps		xmm2, qword ptr [edi+ 8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movhps		xmm2, qword ptr [edi+24]
						mulps		xmm2, xmm7
						movlps		xmm1, qword ptr [edi+32]
						movhps		xmm1, qword ptr [edi+48]
						mulps		xmm1, xmm6
						movlps		xmm3, qword ptr [edi+40]
						addps		xmm0, xmm2
						movhps		xmm3, qword ptr [edi+56]
						mulps		xmm3, xmm7
						movaps		xmm4, xmm0
						addps		xmm1, xmm3
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm4
						movlps		xmm1, qword ptr [edi+64]
						movhps		xmm1, qword ptr [edi+80]
						STORE4( 0, xmm0, xmm4 )
						mulps		xmm1, xmm6
						movlps		xmm2, qword ptr [edi+72]
						movhps		xmm2, qword ptr [edi+88]
						mulps		xmm2, xmm7
						addps		xmm1, xmm2
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm3, xmm1
						addps		xmm1, xmm3
						STORE2LO( 16, xmm1, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
						mPtr += 4;
					}
					return;
				}
			}
			break;
		}
		case 5: {
			switch( numRows ) {
				case 5: {		// 5x5 * 5x1
					// Gathers the five matrix columns into registers (lane
					// comments give the linear matrix element index).
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
						movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
						movhps		xmm5, [edi+14*4]						// xmm5 = 18, 19, 14, 15
						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
						movss		xmm7, [esi+0*4]
						shufps		xmm7, xmm7, 0
						mulps		xmm0, xmm7
						movss		xmm5, [esi+1*4]
						shufps		xmm5, xmm5, 0
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movss		xmm6, [esi+2*4]
						shufps		xmm6, xmm6, 0
						mulps		xmm2, xmm6
						addps		xmm0, xmm2
						movss		xmm1, [esi+3*4]
						shufps		xmm1, xmm1, 0
						mulps		xmm3, xmm1
						addps		xmm0, xmm3
						movss		xmm2, [esi+4*4]
						shufps		xmm2, xmm2, 0
						mulps		xmm4, xmm2
						addps		xmm0, xmm4
						// last row (index 4) computed with scalar ops
						mulss		xmm7, [edi+20*4]
						mulss		xmm5, [edi+21*4]
						addps		xmm7, xmm5
						mulss		xmm6, [edi+22*4]
						addps		xmm7, xmm6
						mulss		xmm1, [edi+23*4]
						addps		xmm7, xmm1
						mulss		xmm2, [edi+24*4]
						addps		xmm7, xmm2
						STORE4( 0, xmm0, xmm3 )
						STORE1( 16, xmm7, xmm4 )
					}
					return;
				}
				case 6: {		// 6x5 * 5x1
					// Lane comments give the linear matrix element index.
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm6, [esi]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
						movlps		xmm7, [esi+8]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
						movlps		xmm0, [edi]
						movhps		xmm3, [edi+8]
						movaps		xmm1, [edi+16]
						movlps		xmm2, [edi+32]
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
						mulps		xmm0, xmm6
						mulps		xmm3, xmm7
						movlps		xmm2, [edi+40]
						addps		xmm0, xmm3								// xmm0 + xmm1
						movhps		xmm5, [edi+40+8]
						movlps		xmm3, [edi+40+16]
						movhps		xmm3, [edi+40+24]
						movlps		xmm4, [edi+40+32]
						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
						mulps		xmm2, xmm6
						mulps		xmm5, xmm7
						addps		xmm2, xmm5								// xmm2 + xmm3
						movss		xmm5, [esi+16]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps		xmm4, xmm0
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
						addps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						STORE4( 0, xmm0, xmm2 )
						movlps		xmm4, [edi+80]
						movhps		xmm3, [edi+80+8]
						movaps		xmm1, [edi+80+16]
						movlps		xmm2, [edi+80+32]
						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
						mulps		xmm4, xmm6
						mulps		xmm3, xmm7
						mulps		xmm1, xmm5
						addps		xmm4, xmm3								// xmm4 + xmm1
						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
						addps		xmm4, xmm1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
						addps		xmm4, xmm1
						STORE2LO( 16, xmm4, xmm2 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
						mPtr += 5;
					}
					return;
				}
			}
			break;
		}
		case 6: {
			switch( numRows ) {
				case 1: {		// 1x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						mulss		xmm0, [edi]
						movss		xmm1, [esi+4]
						mulss		xmm1, [edi+4]
						movss		xmm2, [esi+8]
						addss		xmm0, xmm1
						mulss		xmm2, [edi+8]
						movss		xmm3, [esi+12]
						addss		xmm0, xmm2
						mulss		xmm3, [edi+12]
						movss		xmm4, [esi+16]
						addss		xmm0, xmm3
						mulss		xmm4, [edi+16]
						movss		xmm5, [esi+20]
						addss		xmm0, xmm4
						mulss		xmm5, [edi+20]
						movss		xmm6, [esi+24]
						addss		xmm0, xmm5
						mulss		xmm6, [edi+24]
						addss		xmm0, xmm6
						STORE1( 0, xmm0, xmm7 )
					}
					return;
				}
				case 2: {		// 2x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
					}
					return;
				}
				case 3: {		// 3x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
						movhlps		xmm0, xmm1
						addps		xmm0, xmm1
						STORE2LO( 0, xmm0, xmm3 )
						// row 2
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
						addss		xmm0, xmm1
						STORE1( 8, xmm0, xmm3 )
					}
					return;
				}
				case 4: {		// 4x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm4 )
					}
					return;
				}
				case 5: {		// 5x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// load idVecX
						movlps		xmm4, [esi]
						movhps		xmm4, [esi+8]
						movlps		xmm5, [esi+16]
						movlhps		xmm5, xmm4
						movhlps		xmm6, xmm4
						movlhps		xmm6, xmm5
						// row 0 and 1
						movaps		xmm0, [edi]
						movaps		xmm1, [edi+16]
						movaps		xmm2, [edi+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm7, xmm0
						movlhps		xmm7, xmm2
						addps		xmm7, xmm1
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm7, xmm0
						// row 2 and 3
						movaps		xmm0, [edi+48]
						movaps		xmm1, [edi+48+16]
						movaps		xmm2, [edi+48+32]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						mulps		xmm2, xmm6
						movhlps		xmm3, xmm0
						movlhps		xmm3, xmm2
						addps		xmm1, xmm3
						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
						addps		xmm1, xmm0
						// last 4 additions for the first 4 rows and store result
						movaps		xmm0, xmm7
						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
						addps		xmm0, xmm7
						STORE4( 0, xmm0, xmm3 )
						// row 4 (last row)
						movaps		xmm0, [edi+96]
						movaps		xmm1, [edi+96+16]
						mulps		xmm0, xmm4
						mulps		xmm1, xmm5
						addps		xmm0, xmm1
						movhlps		xmm1, xmm0
						addps		xmm0, xmm1
						movaps		xmm1, xmm0
						shufps		xmm1, xmm1, 0x01
						addss		xmm0, xmm1
						STORE1( 16, xmm0, xmm3 )
					}
					return;
				}
				case 6: {		// 6x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm7, qword ptr [esi]
						movlps		xmm6, qword ptr [esi+8]
						shufps		xmm7, xmm7, 0x44
						shufps		xmm6, xmm6, 0x44
						movlps		xmm0, qword ptr [edi    ]
						movhps		xmm0, qword ptr [edi+ 24]
						mulps		xmm0, xmm7
						movlps		xmm3, qword ptr [edi+  8]
						movhps		xmm3, qword ptr [edi+ 32]
						mulps		xmm3, xmm6
						movlps		xmm1, qword ptr [edi+ 48]
						movhps		xmm1, qword ptr [edi+ 72]
						mulps		xmm1, xmm7
						movlps		xmm2, qword ptr [edi+ 96]
						movhps		xmm2, qword ptr [edi+120]
						mulps		xmm2, xmm7
						movlps		xmm4, qword ptr [edi+ 56]
						movhps		xmm4, qword ptr [edi+ 80]
						movlps		xmm5, qword ptr [edi+104]
						movhps		xmm5, qword ptr [edi+128]
						mulps		xmm4, xmm6
						movlps		xmm7, qword ptr [esi+16]
						addps		xmm0, xmm3
						shufps		xmm7, xmm7, 0x44
						mulps		xmm5, xmm6
						addps		xmm1, xmm4
						movlps		xmm3, qword ptr [edi+ 16]
						movhps		xmm3, qword ptr [edi+ 40]
						addps		xmm2, xmm5
						movlps		xmm4, qword ptr [edi+ 64]
						movhps		xmm4, qword ptr [edi+ 88]
						mulps		xmm3, xmm7
						movlps		xmm5, qword ptr [edi+112]
						movhps		xmm5, qword ptr [edi+136]
						addps		xmm0, xmm3
						mulps		xmm4, xmm7
						mulps		xmm5, xmm7
						addps		xmm1, xmm4
						addps		xmm2, xmm5
						movaps		xmm6, xmm0
						shufps		xmm0, xmm1, 0x88
						shufps		xmm6, xmm1, 0xDD
						movaps		xmm7, xmm2
						shufps		xmm7, xmm2, 0x88
						shufps		xmm2, xmm2, 0xDD
						addps		xmm0, xmm6
						addps		xmm2, xmm7
						STORE4( 0, xmm0, xmm3 )
						STORE2LO( 16, xmm2, xmm4 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numRows; i++ ) {
						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
						mPtr += 6;
					}
					return;
				}
			}
			break;
		}
		default: {
			// Generic scalar fallback for any other column count.
			int numColumns = mat.GetNumColumns();
			for ( int i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( int j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] STOREC sum;
				mPtr += numColumns;
			}
			break;
		}
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
5861 
5862 /*
5863 ============
5864 idSIMD_SSE::MatX_MultiplySubVecX
5865 
5866 	optimizes the following matrix multiplications:
5867 
5868 	NxN * Nx1
5869 	Nx6 * 6x1
5870 	6xN * Nx1
5871 
5872 	with N in the range [1-6]
5873 ============
5874 */
MatX_MultiplySubVecX(idVecX & dst,const idMatX & mat,const idVecX & vec)5875 void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
5876 #define STORE1( offset, reg1, reg2 )		\
5877 	__asm movss		reg2, [eax+offset]		\
5878 	__asm subss		reg2, reg1				\
5879 	__asm movss		[eax+offset], reg2
5880 #define STORE2LO( offset, reg1, reg2 )		\
5881 	__asm movlps	reg2, [eax+offset]		\
5882 	__asm subps		reg2, reg1				\
5883 	__asm movlps	[eax+offset], reg2
5884 #define STORE2HI( offset, reg1, reg2 )		\
5885 	__asm movhps	reg2, [eax+offset]		\
5886 	__asm subps		reg2, reg1				\
5887 	__asm movhps	[eax+offset], reg2
5888 #define STORE4( offset, reg1, reg2 )		\
5889 	__asm movlps	reg2, [eax+offset]		\
5890 	__asm movhps	reg2, [eax+offset+8]	\
5891 	__asm subps		reg2, reg1				\
5892 	__asm movlps	[eax+offset], reg2		\
5893 	__asm movhps	[eax+offset+8], reg2
5894 #define STOREC		-=
5895 
5896 	int numRows;
5897 	const float *mPtr, *vPtr;
5898 	float *dstPtr;
5899 
5900 	assert( vec.GetSize() >= mat.GetNumColumns() );
5901 	assert( dst.GetSize() >= mat.GetNumRows() );
5902 
5903 	mPtr = mat.ToFloatPtr();
5904 	vPtr = vec.ToFloatPtr();
5905 	dstPtr = dst.ToFloatPtr();
5906 	numRows = mat.GetNumRows();
5907 	switch( mat.GetNumColumns() ) {
5908 		case 1: {
5909 			switch( numRows ) {
5910 				case 1: {		// 1x1 * 1x1
5911 					__asm {
5912 						mov			esi, vPtr
5913 						mov			edi, mPtr
5914 						mov			eax, dstPtr
5915 						movss		xmm0, [esi]
5916 						mulss		xmm0, [edi]
5917 						STORE1( 0, xmm0, xmm1 )
5918 					}
5919 					return;
5920 				}
5921 				case 6: {		// 6x1 * 1x1
5922 					__asm {
5923 						mov			esi, vPtr
5924 						mov			edi, mPtr
5925 						mov			eax, dstPtr
5926 						movss		xmm0, [esi]
5927 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
5928 						movaps		xmm1, xmm0
5929 						mulps		xmm0, [edi]
5930 						mulps		xmm1, [edi+16]
5931 						STORE4( 0, xmm0, xmm2 )
5932 						STORE2LO( 16, xmm1, xmm2 )
5933 					}
5934 					return;
5935 				}
5936 				default: {
5937 					for ( int i = 0; i < numRows; i++ ) {
5938 						dstPtr[i] STOREC mPtr[0] * vPtr[0];
5939 						mPtr++;
5940 					}
5941 					return;
5942 				}
5943 			}
5944 			break;
5945 		}
5946 		case 2: {
5947 			switch( numRows ) {
5948 				case 2: {		// 2x2 * 2x1
5949 					__asm {
5950 						mov			esi, vPtr
5951 						mov			edi, mPtr
5952 						mov			eax, dstPtr
5953 						movss		xmm0, [esi]
5954 						movss		xmm1, [esi+4]
5955 						movss		xmm2, [edi]
5956 						mulss		xmm2, xmm0
5957 						movss		xmm3, [edi+4]
5958 						mulss		xmm3, xmm1
5959 						addss		xmm2, xmm3
5960 						STORE1( 0, xmm2, xmm4 )
5961 						mulss		xmm0, [edi+8]
5962 						mulss		xmm1, [edi+8+4]
5963 						addss		xmm0, xmm1
5964 						STORE1( 4, xmm0, xmm4 )
5965 					}
5966 					return;
5967 				}
5968 				case 6: {		// 6x2 * 2x1
5969 					__asm {
5970 						mov			esi, vPtr
5971 						mov			edi, mPtr
5972 						mov			eax, dstPtr
5973 						movlps		xmm7, [esi]
5974 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
5975 						movaps		xmm0, [edi]
5976 						mulps		xmm0, xmm7
5977 						movaps		xmm1, [edi+16]
5978 						mulps		xmm1, xmm7
5979 						movaps		xmm2, xmm0
5980 						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
5981 						shufps		xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
5982 						movaps		xmm3, [edi+32]
5983 						addps		xmm0, xmm2
5984 						mulps		xmm3, xmm7
5985 						STORE4( 0, xmm0, xmm4 )
5986 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
5987 						movhlps		xmm1, xmm3
5988 						addps		xmm3, xmm1
5989 						STORE2LO( 16, xmm3, xmm4 )
5990 					}
5991 					return;
5992 				}
5993 				default: {
5994 					for ( int i = 0; i < numRows; i++ ) {
5995 						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
5996 						mPtr += 2;
5997 					}
5998 					return;
5999 				}
6000 			}
6001 			break;
6002 		}
6003 		case 3: {
6004 			switch( numRows ) {
6005 				case 3: {		// 3x3 * 3x1
6006 					__asm {
6007 						mov			esi, vPtr
6008 						mov			edi, mPtr
6009 						mov			eax, dstPtr
6010 						movss		xmm0, [esi]
6011 						movss		xmm4, [edi]
6012 						mulss		xmm4, xmm0
6013 						movss		xmm1, [esi+4]
6014 						movss		xmm5, [edi+4]
6015 						mulss		xmm5, xmm1
6016 						addss		xmm4, xmm5
6017 						movss		xmm2, [esi+8]
6018 						movss		xmm6, [edi+8]
6019 						mulss		xmm6, xmm2
6020 						addss		xmm4, xmm6
6021 						movss		xmm3, [edi+12]
6022 						mulss		xmm3, xmm0
6023 						STORE1( 0, xmm4, xmm7 );
6024 						movss		xmm5, [edi+12+4]
6025 						mulss		xmm5, xmm1
6026 						addss		xmm3, xmm5
6027 						movss		xmm6, [edi+12+8]
6028 						mulss		xmm6, xmm2
6029 						addss		xmm3, xmm6
6030 						mulss		xmm0, [edi+24]
6031 						mulss		xmm1, [edi+24+4]
6032 						STORE1( 4, xmm3, xmm7 );
6033 						addss		xmm0, xmm1
6034 						mulss		xmm2, [edi+24+8]
6035 						addss		xmm0, xmm2
6036 						STORE1( 8, xmm0, xmm7 );
6037 					}
6038 					return;
6039 				}
6040 				case 6: {		// 6x3 * 3x1
6041 					__asm {
6042 						mov			esi, vPtr
6043 						mov			edi, mPtr
6044 						mov			eax, dstPtr
6045 						movss		xmm5, [esi]
6046 						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
6047 						movss		xmm6, [esi+4]
6048 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6049 						movss		xmm7, [esi+8]
6050 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6051 						movaps		xmm0, [edi]								// xmm0 = 0, 1, 2, 3
6052 						movlps		xmm1, [edi+4*4]
6053 						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm1 = 4, 5, 1, 2
6054 						movlps		xmm2, [edi+6*4]
6055 						movhps		xmm2, [edi+8*4]							// xmm2 = 6, 7, 8, 9
6056 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 )	// xmm0 = 0, 3, 6, 9
6057 						mulps		xmm0, xmm5
6058 						movlps		xmm3, [edi+10*4]
6059 						shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 )	// xmm2 = 7, 8, 10, 11
6060 						movaps		xmm3, xmm1
6061 						shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 )	// xmm1 = 1, 4, 7, 10
6062 						mulps		xmm1, xmm6
6063 						shufps		xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 )	// xmm3 = 2, 5, 8, 11
6064 						mulps		xmm3, xmm7
6065 						addps		xmm0, xmm1
6066 						addps		xmm0, xmm3
6067 						STORE4( 0, xmm0, xmm4 )
6068 						movss		xmm1, [edi+12*4]
6069 						mulss		xmm1, xmm5
6070 						movss		xmm2, [edi+13*4]
6071 						mulss		xmm2, xmm6
6072 						movss		xmm3, [edi+14*4]
6073 						mulss		xmm3, xmm7
6074 						addss		xmm1, xmm2
6075 						addss		xmm1, xmm3
6076 						STORE1( 16, xmm1, xmm4 )
6077 						mulss		xmm5, [edi+15*4]
6078 						mulss		xmm6, [edi+16*4]
6079 						mulss		xmm7, [edi+17*4]
6080 						addss		xmm5, xmm6
6081 						addss		xmm5, xmm7
6082 						STORE1( 20, xmm5, xmm4 )
6083 					}
6084 					return;
6085 				}
6086 				default: {
6087 					for ( int i = 0; i < numRows; i++ ) {
6088 						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
6089 						mPtr += 3;
6090 					}
6091 					return;
6092 				}
6093 			}
6094 			break;
6095 		}
6096 		case 4: {
6097 			switch( numRows ) {
6098 				case 4: {		// 4x4 * 4x1
6099 					__asm {
6100 						mov			esi, vPtr
6101 						mov			edi, mPtr
6102 						mov			eax, dstPtr
6103 						movlps		xmm6, qword ptr [esi ]
6104 						movlps		xmm0, qword ptr [edi ]
6105 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6106 						movhps		xmm0, qword ptr [edi+16]
6107 						mulps		xmm0, xmm6
6108 						movlps		xmm7, qword ptr [esi+ 8]
6109 						movlps		xmm2, qword ptr [edi+ 8]
6110 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6111 						movhps		xmm2, qword ptr [edi+24]
6112 						mulps		xmm2, xmm7
6113 						movlps		xmm1, qword ptr [edi+32]
6114 						movhps		xmm1, qword ptr [edi+48]
6115 						mulps		xmm1, xmm6
6116 						movlps		xmm3, qword ptr [edi+40]
6117 						addps		xmm0, xmm2
6118 						movhps		xmm3, qword ptr [edi+56]
6119 						mulps		xmm3, xmm7
6120 						movaps		xmm4, xmm0
6121 						addps		xmm1, xmm3
6122 						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6123 						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6124 						addps		xmm0, xmm4
6125 						STORE4( 0, xmm0, xmm2 )
6126 					}
6127 					return;
6128 				}
6129 				case 6: {		// 6x4 * 4x1
6130 					__asm {
6131 						mov			esi, vPtr
6132 						mov			edi, mPtr
6133 						mov			eax, dstPtr
6134 						movlps		xmm6, qword ptr [esi+ 0]
6135 						movlps		xmm0, qword ptr [edi+ 0]
6136 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6137 						movhps		xmm0, qword ptr [edi+16]
6138 						mulps		xmm0, xmm6
6139 						movlps		xmm7, qword ptr [esi+ 8]
6140 						movlps		xmm2, qword ptr [edi+ 8]
6141 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6142 						movhps		xmm2, qword ptr [edi+24]
6143 						mulps		xmm2, xmm7
6144 						movlps		xmm1, qword ptr [edi+32]
6145 						movhps		xmm1, qword ptr [edi+48]
6146 						mulps		xmm1, xmm6
6147 						movlps		xmm3, qword ptr [edi+40]
6148 						addps		xmm0, xmm2
6149 						movhps		xmm3, qword ptr [edi+56]
6150 						mulps		xmm3, xmm7
6151 						movaps		xmm4, xmm0
6152 						addps		xmm1, xmm3
6153 						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6154 						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6155 						addps		xmm0, xmm4
6156 						movlps		xmm1, qword ptr [edi+64]
6157 						movhps		xmm1, qword ptr [edi+80]
6158 						STORE4( 0, xmm0, xmm4 )
6159 						mulps		xmm1, xmm6
6160 						movlps		xmm2, qword ptr [edi+72]
6161 						movhps		xmm2, qword ptr [edi+88]
6162 						mulps		xmm2, xmm7
6163 						addps		xmm1, xmm2
6164 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6165 						movhlps		xmm3, xmm1
6166 						addps		xmm1, xmm3
6167 						STORE2LO( 16, xmm1, xmm4 )
6168 					}
6169 					return;
6170 				}
6171 				default: {
6172 					for ( int i = 0; i < numRows; i++ ) {
6173 						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
6174 						mPtr += 4;
6175 					}
6176 					return;
6177 				}
6178 			}
6179 			break;
6180 		}
6181 		case 5: {
6182 			switch( numRows ) {
6183 				case 5: {		// 5x5 * 5x1
6184 					__asm {
6185 						mov			esi, vPtr
6186 						mov			edi, mPtr
6187 						mov			eax, dstPtr
6188 						movss		xmm0, [edi+5*4]							// xmm0 =  5,  X,  X,  X
6189 						movhps		xmm0, [edi+0*4]							// xmm0 =  5,  X,  0,  1
6190 						movss		xmm5, [edi+15*4]						// xmm5 = 15,  X,  X,  X
6191 						movhps		xmm5, [edi+10*4]						// xmm5 = 15,  X, 10, 11
6192 						movaps		xmm1, xmm0								// xmm1 =  5,  X,  0,  1
6193 						shufps		xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 =  0,  5, 10, 15
6194 						movlps		xmm1, [edi+6*4]							// xmm1 =  6,  7,  0,  1
6195 						movlps		xmm5, [edi+16*4]						// xmm5 = 16, 17, 10, 11
6196 						movaps		xmm2, xmm1								// xmm2 =  6,  7,  0,  1
6197 						shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 =  1,  6, 11, 16
6198 						movhps		xmm2, [edi+2*4]							// xmm2 =  6,  7,  2,  3
6199 						movhps		xmm5, [edi+12*4]						// xmm5 = 16, 17, 12, 13
6200 						movaps		xmm3, xmm2								// xmm3 =  6,  7,  2,  3
6201 						shufps		xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 =  2,  7, 12, 17
6202 						movlps		xmm3, [edi+8*4]							// xmm3 =  8,  9,  2,  3
6203 						movlps		xmm5, [edi+18*4]						// xmm5 = 18, 19, 12, 13
6204 						movss		xmm4, [edi+4*4]							// xmm4 =  4,  X,  X,  X
6205 						movlhps		xmm4, xmm3								// xmm4 =  4,  X,  8,  9
6206 						shufps		xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 =  3,  8, 13, 18
6207 						movhps		xmm5, [edi+14*4]						// xmm5 = 18, 19, 14, 15
6208 						shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 =  4,  9, 14, 19
6209 						movss		xmm7, [esi+0*4]
6210 						shufps		xmm7, xmm7, 0
6211 						mulps		xmm0, xmm7
6212 						movss		xmm5, [esi+1*4]
6213 						shufps		xmm5, xmm5, 0
6214 						mulps		xmm1, xmm5
6215 						addps		xmm0, xmm1
6216 						movss		xmm6, [esi+2*4]
6217 						shufps		xmm6, xmm6, 0
6218 						mulps		xmm2, xmm6
6219 						addps		xmm0, xmm2
6220 						movss		xmm1, [esi+3*4]
6221 						shufps		xmm1, xmm1, 0
6222 						mulps		xmm3, xmm1
6223 						addps		xmm0, xmm3
6224 						movss		xmm2, [esi+4*4]
6225 						shufps		xmm2, xmm2, 0
6226 						mulps		xmm4, xmm2
6227 						addps		xmm0, xmm4
6228 						mulss		xmm7, [edi+20*4]
6229 						mulss		xmm5, [edi+21*4]
6230 						addps		xmm7, xmm5
6231 						mulss		xmm6, [edi+22*4]
6232 						addps		xmm7, xmm6
6233 						mulss		xmm1, [edi+23*4]
6234 						addps		xmm7, xmm1
6235 						mulss		xmm2, [edi+24*4]
6236 						addps		xmm7, xmm2
6237 						STORE4( 0, xmm0, xmm3 )
6238 						STORE1( 16, xmm7, xmm4 )
6239 					}
6240 					return;
6241 				}
6242 				case 6: {		// 6x5 * 5x1
6243 					__asm {
6244 						mov			esi, vPtr
6245 						mov			edi, mPtr
6246 						mov			eax, dstPtr
6247 						movlps		xmm6, [esi]
6248 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
6249 						movlps		xmm7, [esi+8]
6250 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
6251 						movlps		xmm0, [edi]
6252 						movhps		xmm3, [edi+8]
6253 						movaps		xmm1, [edi+16]
6254 						movlps		xmm2, [edi+32]
6255 						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
6256 						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
6257 						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
6258 						mulps		xmm0, xmm6
6259 						mulps		xmm3, xmm7
6260 						movlps		xmm2, [edi+40]
6261 						addps		xmm0, xmm3								// xmm0 + xmm1
6262 						movhps		xmm5, [edi+40+8]
6263 						movlps		xmm3, [edi+40+16]
6264 						movhps		xmm3, [edi+40+24]
6265 						movlps		xmm4, [edi+40+32]
6266 						shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
6267 						shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
6268 						shufps		xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
6269 						mulps		xmm2, xmm6
6270 						mulps		xmm5, xmm7
6271 						addps		xmm2, xmm5								// xmm2 + xmm3
6272 						movss		xmm5, [esi+16]
6273 						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
6274 						movaps		xmm4, xmm0
6275 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
6276 						shufps		xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
6277 						shufps		xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
6278 						addps		xmm0, xmm4
6279 						mulps		xmm1, xmm5
6280 						addps		xmm0, xmm1
6281 						STORE4( 0, xmm0, xmm2 )
6282 						movlps		xmm4, [edi+80]
6283 						movhps		xmm3, [edi+80+8]
6284 						movaps		xmm1, [edi+80+16]
6285 						movlps		xmm2, [edi+80+32]
6286 						shufps		xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
6287 						shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
6288 						shufps		xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
6289 						mulps		xmm4, xmm6
6290 						mulps		xmm3, xmm7
6291 						mulps		xmm1, xmm5
6292 						addps		xmm4, xmm3								// xmm4 + xmm1
6293 						shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
6294 						shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
6295 						addps		xmm4, xmm1
6296 						shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
6297 						addps		xmm4, xmm1
6298 						STORE2LO( 16, xmm4, xmm2 )
6299 					}
6300 					return;
6301 				}
6302 				default: {
6303 					for ( int i = 0; i < numRows; i++ ) {
6304 						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
6305 						mPtr += 5;
6306 					}
6307 					return;
6308 				}
6309 			}
6310 			break;
6311 		}
6312 		case 6: {
6313 			switch( numRows ) {
6314 				case 1: {		// 1x6 * 6x1
6315 					__asm {
6316 						mov			esi, vPtr
6317 						mov			edi, mPtr
6318 						mov			eax, dstPtr
6319 						movss		xmm0, [esi]
6320 						mulss		xmm0, [edi]
6321 						movss		xmm1, [esi+4]
6322 						mulss		xmm1, [edi+4]
6323 						movss		xmm2, [esi+8]
6324 						addss		xmm0, xmm1
6325 						mulss		xmm2, [edi+8]
6326 						movss		xmm3, [esi+12]
6327 						addss		xmm0, xmm2
6328 						mulss		xmm3, [edi+12]
6329 						movss		xmm4, [esi+16]
6330 						addss		xmm0, xmm3
6331 						mulss		xmm4, [edi+16]
6332 						movss		xmm5, [esi+20]
6333 						addss		xmm0, xmm4
6334 						mulss		xmm5, [edi+20]
6335 						movss		xmm6, [esi+24]
6336 						addss		xmm0, xmm5
6337 						mulss		xmm6, [edi+24]
6338 						addss		xmm0, xmm6
6339 						STORE1( 0, xmm0, xmm7 )
6340 					}
6341 					return;
6342 				}
6343 				case 2: {		// 2x6 * 6x1
6344 					__asm {
6345 						mov			esi, vPtr
6346 						mov			edi, mPtr
6347 						mov			eax, dstPtr
6348 						// load idVecX
6349 						movlps		xmm4, [esi]
6350 						movhps		xmm4, [esi+8]
6351 						movlps		xmm5, [esi+16]
6352 						movlhps		xmm5, xmm4
6353 						movhlps		xmm6, xmm4
6354 						movlhps		xmm6, xmm5
6355 						// row 0 and 1
6356 						movaps		xmm0, [edi]
6357 						movaps		xmm1, [edi+16]
6358 						movaps		xmm2, [edi+32]
6359 						mulps		xmm0, xmm4
6360 						mulps		xmm1, xmm5
6361 						mulps		xmm2, xmm6
6362 						movhlps		xmm3, xmm0
6363 						movlhps		xmm3, xmm2
6364 						addps		xmm1, xmm3
6365 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6366 						addps		xmm1, xmm0
6367 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6368 						movhlps		xmm0, xmm1
6369 						addps		xmm0, xmm1
6370 						STORE2LO( 0, xmm0, xmm3 )
6371 					}
6372 					return;
6373 				}
6374 				case 3: {		// 3x6 * 6x1
6375 					__asm {
6376 						mov			esi, vPtr
6377 						mov			edi, mPtr
6378 						mov			eax, dstPtr
6379 						// load idVecX
6380 						movlps		xmm4, [esi]
6381 						movhps		xmm4, [esi+8]
6382 						movlps		xmm5, [esi+16]
6383 						movlhps		xmm5, xmm4
6384 						movhlps		xmm6, xmm4
6385 						movlhps		xmm6, xmm5
6386 						// row 0 and 1
6387 						movaps		xmm0, [edi]
6388 						movaps		xmm1, [edi+16]
6389 						movaps		xmm2, [edi+32]
6390 						mulps		xmm0, xmm4
6391 						mulps		xmm1, xmm5
6392 						mulps		xmm2, xmm6
6393 						movhlps		xmm3, xmm0
6394 						movlhps		xmm3, xmm2
6395 						addps		xmm1, xmm3
6396 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6397 						addps		xmm1, xmm0
6398 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
6399 						movhlps		xmm0, xmm1
6400 						addps		xmm0, xmm1
6401 						STORE2LO( 0, xmm0, xmm3 )
6402 						// row 2
6403 						movaps		xmm0, [edi+48]
6404 						movaps		xmm1, [edi+48+16]
6405 						mulps		xmm0, xmm4
6406 						mulps		xmm1, xmm5
6407 						addps		xmm0, xmm1
6408 						movhlps		xmm1, xmm0
6409 						addps		xmm0, xmm1
6410 						movaps		xmm1, xmm0
6411 						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
6412 						addss		xmm0, xmm1
6413 						STORE1( 8, xmm0, xmm3 )
6414 					}
6415 					return;
6416 				}
6417 				case 4: {		// 4x6 * 6x1
6418 					__asm {
6419 						mov			esi, vPtr
6420 						mov			edi, mPtr
6421 						mov			eax, dstPtr
6422 						// load idVecX
6423 						movlps		xmm4, [esi]
6424 						movhps		xmm4, [esi+8]
6425 						movlps		xmm5, [esi+16]
6426 						movlhps		xmm5, xmm4
6427 						movhlps		xmm6, xmm4
6428 						movlhps		xmm6, xmm5
6429 						// row 0 and 1
6430 						movaps		xmm0, [edi]
6431 						movaps		xmm1, [edi+16]
6432 						movaps		xmm2, [edi+32]
6433 						mulps		xmm0, xmm4
6434 						mulps		xmm1, xmm5
6435 						mulps		xmm2, xmm6
6436 						movhlps		xmm7, xmm0
6437 						movlhps		xmm7, xmm2
6438 						addps		xmm7, xmm1
6439 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6440 						addps		xmm7, xmm0
6441 						// row 2 and 3
6442 						movaps		xmm0, [edi+48]
6443 						movaps		xmm1, [edi+48+16]
6444 						movaps		xmm2, [edi+48+32]
6445 						mulps		xmm0, xmm4
6446 						mulps		xmm1, xmm5
6447 						mulps		xmm2, xmm6
6448 						movhlps		xmm3, xmm0
6449 						movlhps		xmm3, xmm2
6450 						addps		xmm1, xmm3
6451 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6452 						addps		xmm1, xmm0
6453 						// last 4 additions for the first 4 rows and store result
6454 						movaps		xmm0, xmm7
6455 						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6456 						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6457 						addps		xmm0, xmm7
6458 						STORE4( 0, xmm0, xmm4 )
6459 					}
6460 					return;
6461 				}
6462 				case 5: {		// 5x6 * 6x1
6463 					__asm {
6464 						mov			esi, vPtr
6465 						mov			edi, mPtr
6466 						mov			eax, dstPtr
6467 						// load idVecX
6468 						movlps		xmm4, [esi]
6469 						movhps		xmm4, [esi+8]
6470 						movlps		xmm5, [esi+16]
6471 						movlhps		xmm5, xmm4
6472 						movhlps		xmm6, xmm4
6473 						movlhps		xmm6, xmm5
6474 						// row 0 and 1
6475 						movaps		xmm0, [edi]
6476 						movaps		xmm1, [edi+16]
6477 						movaps		xmm2, [edi+32]
6478 						mulps		xmm0, xmm4
6479 						mulps		xmm1, xmm5
6480 						mulps		xmm2, xmm6
6481 						movhlps		xmm7, xmm0
6482 						movlhps		xmm7, xmm2
6483 						addps		xmm7, xmm1
6484 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6485 						addps		xmm7, xmm0
6486 						// row 2 and 3
6487 						movaps		xmm0, [edi+48]
6488 						movaps		xmm1, [edi+48+16]
6489 						movaps		xmm2, [edi+48+32]
6490 						mulps		xmm0, xmm4
6491 						mulps		xmm1, xmm5
6492 						mulps		xmm2, xmm6
6493 						movhlps		xmm3, xmm0
6494 						movlhps		xmm3, xmm2
6495 						addps		xmm1, xmm3
6496 						shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
6497 						addps		xmm1, xmm0
6498 						// last 4 additions for the first 4 rows and store result
6499 						movaps		xmm0, xmm7
6500 						shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
6501 						shufps		xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
6502 						addps		xmm0, xmm7
6503 						STORE4( 0, xmm0, xmm3 )
6504 						// row 4
6505 						movaps		xmm0, [edi+96]
6506 						movaps		xmm1, [edi+96+16]
6507 						mulps		xmm0, xmm4
6508 						mulps		xmm1, xmm5
6509 						addps		xmm0, xmm1
6510 						movhlps		xmm1, xmm0
6511 						addps		xmm0, xmm1
6512 						movaps		xmm1, xmm0
6513 						shufps		xmm1, xmm1, 0x01
6514 						addss		xmm0, xmm1
6515 						STORE1( 16, xmm0, xmm3 )
6516 					}
6517 					return;
6518 				}
6519 				case 6: {		// 6x6 * 6x1
6520 					__asm {
6521 						mov			esi, vPtr
6522 						mov			edi, mPtr
6523 						mov			eax, dstPtr
6524 						movlps		xmm7, qword ptr [esi]
6525 						movlps		xmm6, qword ptr [esi+8]
6526 						shufps		xmm7, xmm7, 0x44
6527 						shufps		xmm6, xmm6, 0x44
6528 						movlps		xmm0, qword ptr [edi    ]
6529 						movhps		xmm0, qword ptr [edi+ 24]
6530 						mulps		xmm0, xmm7
6531 						movlps		xmm3, qword ptr [edi+  8]
6532 						movhps		xmm3, qword ptr [edi+ 32]
6533 						mulps		xmm3, xmm6
6534 						movlps		xmm1, qword ptr [edi+ 48]
6535 						movhps		xmm1, qword ptr [edi+ 72]
6536 						mulps		xmm1, xmm7
6537 						movlps		xmm2, qword ptr [edi+ 96]
6538 						movhps		xmm2, qword ptr [edi+120]
6539 						mulps		xmm2, xmm7
6540 						movlps		xmm4, qword ptr [edi+ 56]
6541 						movhps		xmm4, qword ptr [edi+ 80]
6542 						movlps		xmm5, qword ptr [edi+104]
6543 						movhps		xmm5, qword ptr [edi+128]
6544 						mulps		xmm4, xmm6
6545 						movlps		xmm7, qword ptr [esi+16]
6546 						addps		xmm0, xmm3
6547 						shufps		xmm7, xmm7, 0x44
6548 						mulps		xmm5, xmm6
6549 						addps		xmm1, xmm4
6550 						movlps		xmm3, qword ptr [edi+ 16]
6551 						movhps		xmm3, qword ptr [edi+ 40]
6552 						addps		xmm2, xmm5
6553 						movlps		xmm4, qword ptr [edi+ 64]
6554 						movhps		xmm4, qword ptr [edi+ 88]
6555 						mulps		xmm3, xmm7
6556 						movlps		xmm5, qword ptr [edi+112]
6557 						movhps		xmm5, qword ptr [edi+136]
6558 						addps		xmm0, xmm3
6559 						mulps		xmm4, xmm7
6560 						mulps		xmm5, xmm7
6561 						addps		xmm1, xmm4
6562 						addps		xmm2, xmm5
6563 						movaps		xmm6, xmm0
6564 						shufps		xmm0, xmm1, 0x88
6565 						shufps		xmm6, xmm1, 0xDD
6566 						movaps		xmm7, xmm2
6567 						shufps		xmm7, xmm2, 0x88
6568 						shufps		xmm2, xmm2, 0xDD
6569 						addps		xmm0, xmm6
6570 						addps		xmm2, xmm7
6571 						STORE4( 0, xmm0, xmm3 )
6572 						STORE2LO( 16, xmm2, xmm4 )
6573 					}
6574 					return;
6575 				}
6576 				default: {
6577 					for ( int i = 0; i < numRows; i++ ) {
6578 						dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
6579 									mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
6580 						mPtr += 6;
6581 					}
6582 					return;
6583 				}
6584 			}
6585 			break;
6586 		}
6587 		default: {
6588 			int numColumns = mat.GetNumColumns();
6589 			for ( int i = 0; i < numRows; i++ ) {
6590 				float sum = mPtr[0] * vPtr[0];
6591 				for ( int j = 1; j < numColumns; j++ ) {
6592 					sum += mPtr[j] * vPtr[j];
6593 				}
6594 				dstPtr[i] STOREC sum;
6595 				mPtr += numColumns;
6596 			}
6597 			break;
6598 		}
6599 	}
6600 
6601 #undef STOREC
6602 #undef STORE4
6603 #undef STORE2HI
6604 #undef STORE2LO
6605 #undef STORE1
6606 }
6607 
6608 /*
6609 ============
6610 idSIMD_SSE::MatX_TransposeMultiplyVecX
6611 
6612 	optimizes the following matrix multiplications:
6613 
6614 	Nx6 * Nx1
6615 	6xN * 6x1
6616 
6617 	with N in the range [1-6]
6618 ============
6619 */
6620 void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
6621 #define STORE1( offset, reg1, reg2 )		\
6622 	__asm movss		[eax+offset], reg1
6623 #define STORE2LO( offset, reg1, reg2 )		\
6624 	__asm movlps	[eax+offset], reg1
6625 #define STORE2HI( offset, reg1, reg2 )		\
6626 	__asm movhps	[eax+offset], reg1
6627 #define STORE4( offset, reg1, reg2 )		\
6628 	__asm movlps	[eax+offset], reg1		\
6629 	__asm movhps	[eax+offset+8], reg1
6630 #define STOREC		=
6631 
6632 	int numColumns;
6633 	const float *mPtr, *vPtr;
6634 	float *dstPtr;
6635 
6636 	assert( vec.GetSize() >= mat.GetNumRows() );
6637 	assert( dst.GetSize() >= mat.GetNumColumns() );
6638 
6639 	mPtr = mat.ToFloatPtr();
6640 	vPtr = vec.ToFloatPtr();
6641 	dstPtr = dst.ToFloatPtr();
6642 	numColumns = mat.GetNumColumns();
6643 	switch( mat.GetNumRows() ) {
6644 		case 1:
6645 			switch( numColumns ) {
6646 				case 6: {		// 1x6 * 1x1
6647 					__asm {
6648 						mov			esi, vPtr
6649 						mov			edi, mPtr
6650 						mov			eax, dstPtr
6651 						movss		xmm0, [esi]
6652 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6653 						movaps		xmm1, xmm0
6654 						mulps		xmm0, [edi]
6655 						mulps		xmm1, [edi+16]
6656 						STORE4( 0, xmm0, xmm2 )
6657 						STORE2LO( 16, xmm1, xmm3 )
6658 					}
6659 					return;
6660 				}
6661 				default: {
6662 					for ( int i = 0; i < numColumns; i++ ) {
6663 						dstPtr[i] STOREC *(mPtr) * vPtr[0];
6664 						mPtr++;
6665 					}
6666 					return;
6667 				}
6668 			}
6669 			break;
6670 		case 2:
6671 			switch( numColumns ) {
6672 				case 6: {		// 2x6 * 2x1
6673 					__asm {
6674 						mov			esi, vPtr
6675 						mov			edi, mPtr
6676 						mov			eax, dstPtr
6677 						movlps		xmm0, [esi]
6678 						movaps		xmm1, xmm0
6679 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
6680 						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
6681 						movaps		xmm2, [edi]
6682 						mulps		xmm2, xmm0
6683 						movlps		xmm3, [edi+24]
6684 						movhps		xmm3, [edi+32]
6685 						mulps		xmm3, xmm1
6686 						addps		xmm2, xmm3
6687 						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6688 						movlps		xmm4, [edi+16]
6689 						movhps		xmm4, [edi+40]
6690 						mulps		xmm4, xmm0
6691 						movhlps		xmm3, xmm4
6692 						addps		xmm3, xmm4
6693 						STORE4( 0, xmm2, xmm5 )
6694 						STORE2LO( 16, xmm3, xmm6 )
6695 					}
6696 					return;
6697 				}
6698 				default: {
6699 					for ( int i = 0; i < numColumns; i++ ) {
6700 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
6701 						mPtr++;
6702 					}
6703 					return;
6704 				}
6705 			}
6706 			break;
6707 		case 3:
6708 			switch( numColumns ) {
6709 				case 6: {		// 3x6 * 3x1
6710 					__asm {
6711 						mov			esi, vPtr
6712 						mov			edi, mPtr
6713 						mov			eax, dstPtr
6714 						movlps		xmm0, [esi+0*4]
6715 						movss		xmm1, [esi+2*4]
6716 						movlps		xmm3, [edi+(0*6+0)*4]
6717 						movhps		xmm3, [edi+(0*6+2)*4]
6718 						movaps		xmm4, xmm0
6719 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6720 						mulps		xmm3, xmm4
6721 						movlps		xmm5, [edi+(1*6+0)*4]
6722 						movhps		xmm5, [edi+(1*6+2)*4]
6723 						movaps		xmm6, xmm0
6724 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6725 						mulps		xmm5, xmm6
6726 						addps		xmm3, xmm5
6727 						movlps		xmm4, [edi+(2*6+0)*4]
6728 						movhps		xmm4, [edi+(2*6+2)*4]
6729 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
6730 						mulps		xmm4, xmm1
6731 						addps		xmm3, xmm4
6732 						STORE4( 0, xmm3, xmm7 )
6733 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6734 						movlps		xmm3, [edi+(0*6+4)*4]
6735 						movhps		xmm3, [edi+(1*6+4)*4]
6736 						mulps		xmm3, xmm0
6737 						movhlps		xmm4, xmm3
6738 						addps		xmm3, xmm4
6739 						movlps		xmm5, [edi+(2*6+4)*4]
6740 						mulps		xmm5, xmm1
6741 						addps		xmm3, xmm5
6742 						STORE2LO( 16, xmm3, xmm7 )
6743 					}
6744 					return;
6745 				}
6746 				default: {
6747 					for ( int i = 0; i < numColumns; i++ ) {
6748 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
6749 						mPtr++;
6750 					}
6751 					return;
6752 				}
6753 			}
6754 			break;
6755 		case 4:
6756 			switch( numColumns ) {
6757 				case 6: {		// 4x6 * 4x1
6758 					__asm {
6759 						mov			esi, vPtr
6760 						mov			edi, mPtr
6761 						mov			eax, dstPtr
6762 						movlps		xmm0, [esi+0*4]
6763 						movlps		xmm1, [esi+2*4]
6764 						movaps		xmm3, xmm0
6765 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6766 						mulps		xmm3, [edi+(0*6+0)*4]
6767 						movlps		xmm5, [edi+(1*6+0)*4]
6768 						movhps		xmm5, [edi+(1*6+2)*4]
6769 						movaps		xmm6, xmm0
6770 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6771 						mulps		xmm5, xmm6
6772 						addps		xmm3, xmm5
6773 						movlps		xmm4, [edi+(2*6+0)*4]
6774 						movhps		xmm4, [edi+(2*6+2)*4]
6775 						movaps		xmm6, xmm1
6776 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6777 						mulps		xmm4, xmm6
6778 						addps		xmm3, xmm4
6779 						movlps		xmm5, [edi+(3*6+0)*4]
6780 						movhps		xmm5, [edi+(3*6+2)*4]
6781 						movaps		xmm6, xmm1
6782 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6783 						mulps		xmm5, xmm6
6784 						addps		xmm3, xmm5
6785 						STORE4( 0, xmm3, xmm7 )
6786 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6787 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6788 						movlps		xmm3, [edi+(0*6+4)*4]
6789 						movhps		xmm3, [edi+(1*6+4)*4]
6790 						mulps		xmm3, xmm0
6791 						movlps		xmm4, [edi+(2*6+4)*4]
6792 						movhps		xmm4, [edi+(3*6+4)*4]
6793 						mulps		xmm4, xmm1
6794 						addps		xmm3, xmm4
6795 						movhlps		xmm4, xmm3
6796 						addps		xmm3, xmm4
6797 						STORE2LO( 16, xmm3, xmm7 )
6798 					}
6799 					return;
6800 				}
6801 				default: {
6802 					for ( int i = 0; i < numColumns; i++ ) {
6803 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6804 								*(mPtr+3*numColumns) * vPtr[3];
6805 						mPtr++;
6806 					}
6807 					return;
6808 				}
6809 			}
6810 			break;
6811 		case 5:
6812 			switch( numColumns ) {
6813 				case 6: {		// 5x6 * 5x1
6814 					__asm {
6815 						mov			esi, vPtr
6816 						mov			edi, mPtr
6817 						mov			eax, dstPtr
6818 						movlps		xmm0, [esi+0*4]
6819 						movlps		xmm1, [esi+2*4]
6820 						movss		xmm2, [esi+4*4]
6821 						movaps		xmm3, xmm0
6822 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
6823 						mulps		xmm3, [edi+(0*6+0)*4]
6824 						movlps		xmm5, [edi+(1*6+0)*4]
6825 						movhps		xmm5, [edi+(1*6+2)*4]
6826 						movaps		xmm6, xmm0
6827 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6828 						mulps		xmm5, xmm6
6829 						addps		xmm3, xmm5
6830 						movaps		xmm6, xmm1
6831 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6832 						mulps		xmm6, [edi+(2*6+0)*4]
6833 						addps		xmm3, xmm6
6834 						movlps		xmm5, [edi+(3*6+0)*4]
6835 						movhps		xmm5, [edi+(3*6+2)*4]
6836 						movaps		xmm6, xmm1
6837 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
6838 						mulps		xmm5, xmm6
6839 						addps		xmm3, xmm5
6840 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
6841 						movaps		xmm4, xmm2
6842 						mulps		xmm4, [edi+(4*6+0)*4]
6843 						addps		xmm3, xmm4
6844 						STORE4( 0, xmm3, xmm7 )
6845 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6846 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6847 						movlps		xmm3, [edi+(0*6+4)*4]
6848 						movhps		xmm3, [edi+(1*6+4)*4]
6849 						mulps		xmm3, xmm0
6850 						movlps		xmm4, [edi+(2*6+4)*4]
6851 						movhps		xmm4, [edi+(3*6+4)*4]
6852 						mulps		xmm4, xmm1
6853 						addps		xmm3, xmm4
6854 						movhlps		xmm4, xmm3
6855 						addps		xmm3, xmm4
6856 						movlps		xmm5, [edi+(4*6+4)*4]
6857 						mulps		xmm5, xmm2
6858 						addps		xmm3, xmm5
6859 						STORE2LO( 16, xmm3, xmm7 )
6860 					}
6861 					return;
6862 				}
6863 				default: {
6864 					for ( int i = 0; i < numColumns; i++ ) {
6865 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
6866 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
6867 						mPtr++;
6868 					}
6869 					return;
6870 				}
6871 			}
6872 			break;
6873 		case 6:
6874 			switch( numColumns ) {
6875 				case 1: {		// 6x1 * 6x1
6876 					__asm {
6877 						mov			esi, vPtr
6878 						mov			edi, mPtr
6879 						mov			eax, dstPtr
6880 						movlps		xmm0, [esi]
6881 						movhps		xmm0, [esi+8]
6882 						movlps		xmm1, [esi+16]
6883 						mulps		xmm0, [edi]
6884 						mulps		xmm1, [edi+16]
6885 						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
6886 						addps		xmm0, xmm1
6887 						movhlps		xmm2, xmm0
6888 						addss		xmm2, xmm0
6889 						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
6890 						addss		xmm2, xmm0
6891 						STORE1( 0, xmm2, xmm3 )
6892 					}
6893 					return;
6894 				}
6895 				case 2: {		// 6x2 * 6x1
6896 					__asm {
6897 						mov			esi, vPtr
6898 						mov			edi, mPtr
6899 						mov			eax, dstPtr
6900 						movlps		xmm0, [esi+0*4]
6901 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
6902 						movaps		xmm6, [edi+0*4]
6903 						mulps		xmm6, xmm0
6904 						movlps		xmm1, [esi+2*4]
6905 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
6906 						movaps		xmm7, [edi+4*4]
6907 						mulps		xmm7, xmm1
6908 						addps		xmm6, xmm7
6909 						movlps		xmm2, [esi+4*4]
6910 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
6911 						movaps		xmm7, [edi+8*4]
6912 						mulps		xmm7, xmm2
6913 						addps		xmm6, xmm7
6914 						movhlps		xmm3, xmm6
6915 						addps		xmm3, xmm6
6916 						STORE2LO( 0, xmm3, xmm7 )
6917 					}
6918 					return;
6919 				}
6920 				case 3: {		// 6x3 * 6x1
6921 					__asm {
6922 						mov			esi, vPtr
6923 						mov			edi, mPtr
6924 						mov			eax, dstPtr
6925 						movss		xmm0, [edi+(0*3+2)*4]
6926 						movhps		xmm0, [edi+(0*3+0)*4]
6927 						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
6928 						movss		xmm6, [esi+0*4]
6929 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6930 						mulps		xmm6, xmm0
6931 						movss		xmm1, [edi+(1*3+0)*4]
6932 						movhps		xmm1, [edi+(1*3+1)*4]
6933 						movss		xmm7, [esi+1*4]
6934 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6935 						mulps		xmm7, xmm1
6936 						addps		xmm6, xmm7
6937 						movss		xmm2, [edi+(2*3+2)*4]
6938 						movhps		xmm2, [edi+(2*3+0)*4]
6939 						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
6940 						movss		xmm7, [esi+2*4]
6941 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6942 						mulps		xmm7, xmm2
6943 						addps		xmm6, xmm7
6944 						movss		xmm3, [edi+(3*3+0)*4]
6945 						movhps		xmm3, [edi+(3*3+1)*4]
6946 						movss		xmm7, [esi+3*4]
6947 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6948 						mulps		xmm7, xmm3
6949 						addps		xmm6, xmm7
6950 						movss		xmm4, [edi+(4*3+2)*4]
6951 						movhps		xmm4, [edi+(4*3+0)*4]
6952 						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
6953 						movss		xmm7, [esi+4*4]
6954 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6955 						mulps		xmm7, xmm4
6956 						addps		xmm6, xmm7
6957 						movss		xmm5, [edi+(5*3+0)*4]
6958 						movhps		xmm5, [edi+(5*3+1)*4]
6959 						movss		xmm7, [esi+5*4]
6960 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
6961 						mulps		xmm7, xmm5
6962 						addps		xmm6, xmm7
6963 						STORE1( 0, xmm6, xmm7 )
6964 						STORE2HI( 4, xmm6, xmm7 )
6965 					}
6966 					return;
6967 				}
6968 				case 4: {		// 6x4 * 6x1
6969 					__asm {
6970 						mov			esi, vPtr
6971 						mov			edi, mPtr
6972 						mov			eax, dstPtr
6973 						movlps		xmm3, [edi+(0*4+0)*4]
6974 						movhps		xmm3, [edi+(0*4+2)*4]
6975 						movss		xmm4, [esi+0*4]
6976 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
6977 						mulps		xmm3, xmm4
6978 						movlps		xmm5, [edi+(1*4+0)*4]
6979 						movhps		xmm5, [edi+(1*4+2)*4]
6980 						movss		xmm6, [esi+1*4]
6981 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6982 						mulps		xmm5, xmm6
6983 						addps		xmm3, xmm5
6984 						movlps		xmm4, [edi+(2*4+0)*4]
6985 						movhps		xmm4, [edi+(2*4+2)*4]
6986 						movss		xmm6, [esi+2*4]
6987 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6988 						mulps		xmm4, xmm6
6989 						addps		xmm3, xmm4
6990 						movlps		xmm5, [edi+(3*4+0)*4]
6991 						movhps		xmm5, [edi+(3*4+2)*4]
6992 						movss		xmm6, [esi+3*4]
6993 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
6994 						mulps		xmm5, xmm6
6995 						addps		xmm3, xmm5
6996 						movlps		xmm4, [edi+(4*4+0)*4]
6997 						movhps		xmm4, [edi+(4*4+2)*4]
6998 						movss		xmm6, [esi+4*4]
6999 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7000 						mulps		xmm4, xmm6
7001 						addps		xmm3, xmm4
7002 						movlps		xmm5, [edi+(5*4+0)*4]
7003 						movhps		xmm5, [edi+(5*4+2)*4]
7004 						movss		xmm6, [esi+5*4]
7005 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7006 						mulps		xmm5, xmm6
7007 						addps		xmm3, xmm5
7008 						STORE4( 0, xmm3, xmm7 )
7009 					}
7010 					return;
7011 				}
7012 				case 5: {		// 6x5 * 6x1
7013 					__asm {
7014 						mov			esi, vPtr
7015 						mov			edi, mPtr
7016 						mov			eax, dstPtr
7017 						movlps		xmm6, [edi+(0*5+0)*4]
7018 						movhps		xmm6, [edi+(0*5+2)*4]
7019 						movss		xmm0, [esi+0*4]
7020 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7021 						mulps		xmm6, xmm0
7022 						movlps		xmm7, [edi+(1*5+0)*4]
7023 						movhps		xmm7, [edi+(1*5+2)*4]
7024 						movss		xmm1, [esi+1*4]
7025 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7026 						mulps		xmm7, xmm1
7027 						addps		xmm6, xmm7
7028 						movlps		xmm7, [edi+(2*5+0)*4]
7029 						movhps		xmm7, [edi+(2*5+2)*4]
7030 						movss		xmm2, [esi+2*4]
7031 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7032 						mulps		xmm7, xmm2
7033 						addps		xmm6, xmm7
7034 						movlps		xmm7, [edi+(3*5+0)*4]
7035 						movhps		xmm7, [edi+(3*5+2)*4]
7036 						movss		xmm3, [esi+3*4]
7037 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7038 						mulps		xmm7, xmm3
7039 						addps		xmm6, xmm7
7040 						movlps		xmm7, [edi+(4*5+0)*4]
7041 						movhps		xmm7, [edi+(4*5+2)*4]
7042 						movss		xmm4, [esi+4*4]
7043 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7044 						mulps		xmm7, xmm4
7045 						addps		xmm6, xmm7
7046 						movlps		xmm7, [edi+(5*5+0)*4]
7047 						movhps		xmm7, [edi+(5*5+2)*4]
7048 						movss		xmm5, [esi+5*4]
7049 						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
7050 						mulps		xmm7, xmm5
7051 						addps		xmm6, xmm7
7052 						STORE4( 0, xmm6, xmm7 )
7053 						movss		xmm6, [edi+(0*5+4)*4]
7054 						mulss		xmm6, xmm0
7055 						movss		xmm7, [edi+(1*5+4)*4]
7056 						mulss		xmm7, xmm1
7057 						addss		xmm6, xmm7
7058 						movss		xmm7, [edi+(2*5+4)*4]
7059 						mulss		xmm7, xmm2
7060 						addss		xmm6, xmm7
7061 						movss		xmm7, [edi+(3*5+4)*4]
7062 						mulss		xmm7, xmm3
7063 						addss		xmm6, xmm7
7064 						movss		xmm7, [edi+(4*5+4)*4]
7065 						mulss		xmm7, xmm4
7066 						addss		xmm6, xmm7
7067 						movss		xmm7, [edi+(5*5+4)*4]
7068 						mulss		xmm7, xmm5
7069 						addss		xmm6, xmm7
7070 						STORE1( 16, xmm6, xmm7 )
7071 					}
7072 					return;
7073 				}
7074 				case 6: {		// 6x6 * 6x1
7075 					__asm {
7076 						mov			esi, vPtr
7077 						mov			edi, mPtr
7078 						mov			eax, dstPtr
7079 						movlps		xmm0, [esi+0*4]
7080 						movlps		xmm1, [esi+2*4]
7081 						movlps		xmm2, [esi+4*4]
7082 						movaps		xmm3, xmm0
7083 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7084 						mulps		xmm3, [edi+(0*6+0)*4]
7085 						movlps		xmm5, [edi+(1*6+0)*4]
7086 						movhps		xmm5, [edi+(1*6+2)*4]
7087 						movaps		xmm6, xmm0
7088 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7089 						mulps		xmm5, xmm6
7090 						addps		xmm3, xmm5
7091 						movaps		xmm6, xmm1
7092 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7093 						mulps		xmm6, [edi+(2*6+0)*4]
7094 						addps		xmm3, xmm6
7095 						movaps		xmm6, xmm1
7096 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7097 						movlps		xmm5, [edi+(3*6+0)*4]
7098 						movhps		xmm5, [edi+(3*6+2)*4]
7099 						mulps		xmm5, xmm6
7100 						addps		xmm3, xmm5
7101 						movaps		xmm6, xmm2
7102 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7103 						mulps		xmm6, [edi+(4*6+0)*4]
7104 						addps		xmm3, xmm6
7105 						movaps		xmm6, xmm2
7106 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7107 						movlps		xmm5, [edi+(5*6+0)*4]
7108 						movhps		xmm5, [edi+(5*6+2)*4]
7109 						mulps		xmm5, xmm6
7110 						addps		xmm3, xmm5
7111 						STORE4( 0, xmm3, xmm7 )
7112 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7113 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7114 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7115 						movlps		xmm3, [edi+(0*6+4)*4]
7116 						movhps		xmm3, [edi+(1*6+4)*4]
7117 						mulps		xmm3, xmm0
7118 						movlps		xmm4, [edi+(2*6+4)*4]
7119 						movhps		xmm4, [edi+(3*6+4)*4]
7120 						mulps		xmm4, xmm1
7121 						addps		xmm3, xmm4
7122 						movlps		xmm5, [edi+(4*6+4)*4]
7123 						movhps		xmm5, [edi+(5*6+4)*4]
7124 						mulps		xmm5, xmm2
7125 						addps		xmm3, xmm5
7126 						movhlps		xmm4, xmm3
7127 						addps		xmm3, xmm4
7128 						STORE2LO( 16, xmm3, xmm7 )
7129 					}
7130 					return;
7131 				}
7132 				default: {
7133 					for ( int i = 0; i < numColumns; i++ ) {
7134 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7135 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
7136 						mPtr++;
7137 					}
7138 					return;
7139 				}
7140 			}
7141 			break;
7142 		default:
7143 			int numRows = mat.GetNumRows();
7144 			for ( int i = 0; i < numColumns; i++ ) {
7145 				mPtr = mat.ToFloatPtr() + i;
7146 				float sum = mPtr[0] * vPtr[0];
7147 				for ( int j = 1; j < numRows; j++ ) {
7148 					mPtr += numColumns;
7149 					sum += mPtr[0] * vPtr[j];
7150 				}
7151 				dstPtr[i] STOREC sum;
7152 			}
7153 			break;
7154 	}
7155 
7156 #undef STOREC
7157 #undef STORE4
7158 #undef STORE2HI
7159 #undef STORE2LO
7160 #undef STORE1
7161 }
7162 
7163 /*
7164 ============
7165 idSIMD_SSE::MatX_TransposeMultiplyAddVecX
7166 
7167 	optimizes the following matrix multiplications:
7168 
7169 	Nx6 * Nx1
7170 	6xN * 6x1
7171 
7172 	with N in the range [1-6]
7173 ============
7174 */
MatX_TransposeMultiplyAddVecX(idVecX & dst,const idMatX & mat,const idVecX & vec)7175 void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
7176 #define STORE1( offset, reg1, reg2 )		\
7177 	__asm movss		reg2, [eax+offset]		\
7178 	__asm addss		reg2, reg1				\
7179 	__asm movss		[eax+offset], reg2
7180 #define STORE2LO( offset, reg1, reg2 )		\
7181 	__asm movlps	reg2, [eax+offset]		\
7182 	__asm addps		reg2, reg1				\
7183 	__asm movlps	[eax+offset], reg2
7184 #define STORE2HI( offset, reg1, reg2 )		\
7185 	__asm movhps	reg2, [eax+offset]		\
7186 	__asm addps		reg2, reg1				\
7187 	__asm movhps	[eax+offset], reg2
7188 #define STORE4( offset, reg1, reg2 )		\
7189 	__asm movlps	reg2, [eax+offset]		\
7190 	__asm movhps	reg2, [eax+offset+8]	\
7191 	__asm addps		reg2, reg1				\
7192 	__asm movlps	[eax+offset], reg2		\
7193 	__asm movhps	[eax+offset+8], reg2
7194 #define STOREC		+=
7195 
7196 	int numColumns;
7197 	const float *mPtr, *vPtr;
7198 	float *dstPtr;
7199 
7200 	assert( vec.GetSize() >= mat.GetNumRows() );
7201 	assert( dst.GetSize() >= mat.GetNumColumns() );
7202 
7203 	mPtr = mat.ToFloatPtr();
7204 	vPtr = vec.ToFloatPtr();
7205 	dstPtr = dst.ToFloatPtr();
7206 	numColumns = mat.GetNumColumns();
7207 	switch( mat.GetNumRows() ) {
7208 		case 1:
7209 			switch( numColumns ) {
7210 				case 6: {		// 1x6 * 1x1
7211 					__asm {
7212 						mov			esi, vPtr
7213 						mov			edi, mPtr
7214 						mov			eax, dstPtr
7215 						movss		xmm0, [esi]
7216 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7217 						movaps		xmm1, xmm0
7218 						mulps		xmm0, [edi]
7219 						mulps		xmm1, [edi+16]
7220 						STORE4( 0, xmm0, xmm2 )
7221 						STORE2LO( 16, xmm1, xmm3 )
7222 					}
7223 					return;
7224 				}
7225 				default: {
7226 					for ( int i = 0; i < numColumns; i++ ) {
7227 						dstPtr[i] STOREC *(mPtr) * vPtr[0];
7228 						mPtr++;
7229 					}
7230 					return;
7231 				}
7232 			}
7233 			break;
7234 		case 2:
7235 			switch( numColumns ) {
7236 				case 6: {		// 2x6 * 2x1
7237 					__asm {
7238 						mov			esi, vPtr
7239 						mov			edi, mPtr
7240 						mov			eax, dstPtr
7241 						movlps		xmm0, [esi]
7242 						movaps		xmm1, xmm0
7243 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7244 						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
7245 						movaps		xmm2, [edi]
7246 						mulps		xmm2, xmm0
7247 						movlps		xmm3, [edi+24]
7248 						movhps		xmm3, [edi+32]
7249 						mulps		xmm3, xmm1
7250 						addps		xmm2, xmm3
7251 						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7252 						movlps		xmm4, [edi+16]
7253 						movhps		xmm4, [edi+40]
7254 						mulps		xmm4, xmm0
7255 						movhlps		xmm3, xmm4
7256 						addps		xmm3, xmm4
7257 						STORE4( 0, xmm2, xmm5 )
7258 						STORE2LO( 16, xmm3, xmm6 )
7259 					}
7260 					return;
7261 				}
7262 				default: {
7263 					for ( int i = 0; i < numColumns; i++ ) {
7264 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
7265 						mPtr++;
7266 					}
7267 					return;
7268 				}
7269 			}
7270 			break;
7271 		case 3:
7272 			switch( numColumns ) {
7273 				case 6: {		// 3x6 * 3x1
7274 					__asm {
7275 						mov			esi, vPtr
7276 						mov			edi, mPtr
7277 						mov			eax, dstPtr
7278 						movlps		xmm0, [esi+0*4]
7279 						movss		xmm1, [esi+2*4]
7280 						movlps		xmm3, [edi+(0*6+0)*4]
7281 						movhps		xmm3, [edi+(0*6+2)*4]
7282 						movaps		xmm4, xmm0
7283 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7284 						mulps		xmm3, xmm4
7285 						movlps		xmm5, [edi+(1*6+0)*4]
7286 						movhps		xmm5, [edi+(1*6+2)*4]
7287 						movaps		xmm6, xmm0
7288 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7289 						mulps		xmm5, xmm6
7290 						addps		xmm3, xmm5
7291 						movlps		xmm4, [edi+(2*6+0)*4]
7292 						movhps		xmm4, [edi+(2*6+2)*4]
7293 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7294 						mulps		xmm4, xmm1
7295 						addps		xmm3, xmm4
7296 						STORE4( 0, xmm3, xmm7 )
7297 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7298 						movlps		xmm3, [edi+(0*6+4)*4]
7299 						movhps		xmm3, [edi+(1*6+4)*4]
7300 						mulps		xmm3, xmm0
7301 						movhlps		xmm4, xmm3
7302 						addps		xmm3, xmm4
7303 						movlps		xmm5, [edi+(2*6+4)*4]
7304 						mulps		xmm5, xmm1
7305 						addps		xmm3, xmm5
7306 						STORE2LO( 16, xmm3, xmm7 )
7307 					}
7308 					return;
7309 				}
7310 				default: {
7311 					for ( int i = 0; i < numColumns; i++ ) {
7312 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
7313 						mPtr++;
7314 					}
7315 					return;
7316 				}
7317 			}
7318 			break;
7319 		case 4:
7320 			switch( numColumns ) {
7321 				case 6: {		// 4x6 * 4x1
7322 					__asm {
7323 						mov			esi, vPtr
7324 						mov			edi, mPtr
7325 						mov			eax, dstPtr
7326 						movlps		xmm0, [esi+0*4]
7327 						movlps		xmm1, [esi+2*4]
7328 						movaps		xmm3, xmm0
7329 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7330 						mulps		xmm3, [edi+(0*6+0)*4]
7331 						movlps		xmm5, [edi+(1*6+0)*4]
7332 						movhps		xmm5, [edi+(1*6+2)*4]
7333 						movaps		xmm6, xmm0
7334 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7335 						mulps		xmm5, xmm6
7336 						addps		xmm3, xmm5
7337 						movlps		xmm4, [edi+(2*6+0)*4]
7338 						movhps		xmm4, [edi+(2*6+2)*4]
7339 						movaps		xmm6, xmm1
7340 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7341 						mulps		xmm4, xmm6
7342 						addps		xmm3, xmm4
7343 						movlps		xmm5, [edi+(3*6+0)*4]
7344 						movhps		xmm5, [edi+(3*6+2)*4]
7345 						movaps		xmm6, xmm1
7346 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7347 						mulps		xmm5, xmm6
7348 						addps		xmm3, xmm5
7349 						STORE4( 0, xmm3, xmm7 )
7350 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7351 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7352 						movlps		xmm3, [edi+(0*6+4)*4]
7353 						movhps		xmm3, [edi+(1*6+4)*4]
7354 						mulps		xmm3, xmm0
7355 						movlps		xmm4, [edi+(2*6+4)*4]
7356 						movhps		xmm4, [edi+(3*6+4)*4]
7357 						mulps		xmm4, xmm1
7358 						addps		xmm3, xmm4
7359 						movhlps		xmm4, xmm3
7360 						addps		xmm3, xmm4
7361 						STORE2LO( 16, xmm3, xmm7 )
7362 					}
7363 					return;
7364 				}
7365 				default: {
7366 					for ( int i = 0; i < numColumns; i++ ) {
7367 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7368 								*(mPtr+3*numColumns) * vPtr[3];
7369 						mPtr++;
7370 					}
7371 					return;
7372 				}
7373 			}
7374 			break;
7375 		case 5:
7376 			switch( numColumns ) {
7377 				case 6: {		// 5x6 * 5x1
7378 					__asm {
7379 						mov			esi, vPtr
7380 						mov			edi, mPtr
7381 						mov			eax, dstPtr
7382 						movlps		xmm0, [esi+0*4]
7383 						movlps		xmm1, [esi+2*4]
7384 						movss		xmm2, [esi+4*4]
7385 						movaps		xmm3, xmm0
7386 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7387 						mulps		xmm3, [edi+(0*6+0)*4]
7388 						movlps		xmm5, [edi+(1*6+0)*4]
7389 						movhps		xmm5, [edi+(1*6+2)*4]
7390 						movaps		xmm6, xmm0
7391 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7392 						mulps		xmm5, xmm6
7393 						addps		xmm3, xmm5
7394 						movaps		xmm6, xmm1
7395 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7396 						mulps		xmm6, [edi+(2*6+0)*4]
7397 						addps		xmm3, xmm6
7398 						movlps		xmm5, [edi+(3*6+0)*4]
7399 						movhps		xmm5, [edi+(3*6+2)*4]
7400 						movaps		xmm6, xmm1
7401 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7402 						mulps		xmm5, xmm6
7403 						addps		xmm3, xmm5
7404 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7405 						movaps		xmm4, xmm2
7406 						mulps		xmm4, [edi+(4*6+0)*4]
7407 						addps		xmm3, xmm4
7408 						STORE4( 0, xmm3, xmm7 )
7409 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7410 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7411 						movlps		xmm3, [edi+(0*6+4)*4]
7412 						movhps		xmm3, [edi+(1*6+4)*4]
7413 						mulps		xmm3, xmm0
7414 						movlps		xmm4, [edi+(2*6+4)*4]
7415 						movhps		xmm4, [edi+(3*6+4)*4]
7416 						mulps		xmm4, xmm1
7417 						addps		xmm3, xmm4
7418 						movhlps		xmm4, xmm3
7419 						addps		xmm3, xmm4
7420 						movlps		xmm5, [edi+(4*6+4)*4]
7421 						mulps		xmm5, xmm2
7422 						addps		xmm3, xmm5
7423 						STORE2LO( 16, xmm3, xmm7 )
7424 					}
7425 					return;
7426 				}
7427 				default: {
7428 					for ( int i = 0; i < numColumns; i++ ) {
7429 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7430 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
7431 						mPtr++;
7432 					}
7433 					return;
7434 				}
7435 			}
7436 			break;
7437 		case 6:
7438 			switch( numColumns ) {
7439 				case 1: {		// 6x1 * 6x1
7440 					__asm {
7441 						mov			esi, vPtr
7442 						mov			edi, mPtr
7443 						mov			eax, dstPtr
7444 						movlps		xmm0, [esi]
7445 						movhps		xmm0, [esi+8]
7446 						movlps		xmm1, [esi+16]
7447 						mulps		xmm0, [edi]
7448 						mulps		xmm1, [edi+16]
7449 						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
7450 						addps		xmm0, xmm1
7451 						movhlps		xmm2, xmm0
7452 						addss		xmm2, xmm0
7453 						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
7454 						addss		xmm2, xmm0
7455 						STORE1( 0, xmm2, xmm3 )
7456 					}
7457 					return;
7458 				}
7459 				case 2: {		// 6x2 * 6x1
7460 					__asm {
7461 						mov			esi, vPtr
7462 						mov			edi, mPtr
7463 						mov			eax, dstPtr
7464 						movlps		xmm0, [esi+0*4]
7465 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7466 						movaps		xmm6, [edi+0*4]
7467 						mulps		xmm6, xmm0
7468 						movlps		xmm1, [esi+2*4]
7469 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7470 						movaps		xmm7, [edi+4*4]
7471 						mulps		xmm7, xmm1
7472 						addps		xmm6, xmm7
7473 						movlps		xmm2, [esi+4*4]
7474 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7475 						movaps		xmm7, [edi+8*4]
7476 						mulps		xmm7, xmm2
7477 						addps		xmm6, xmm7
7478 						movhlps		xmm3, xmm6
7479 						addps		xmm3, xmm6
7480 						STORE2LO( 0, xmm3, xmm7 )
7481 					}
7482 					return;
7483 				}
7484 				case 3: {		// 6x3 * 6x1
7485 					__asm {
7486 						mov			esi, vPtr
7487 						mov			edi, mPtr
7488 						mov			eax, dstPtr
7489 						movss		xmm0, [edi+(0*3+2)*4]
7490 						movhps		xmm0, [edi+(0*3+0)*4]
7491 						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
7492 						movss		xmm6, [esi+0*4]
7493 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7494 						mulps		xmm6, xmm0
7495 						movss		xmm1, [edi+(1*3+0)*4]
7496 						movhps		xmm1, [edi+(1*3+1)*4]
7497 						movss		xmm7, [esi+1*4]
7498 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7499 						mulps		xmm7, xmm1
7500 						addps		xmm6, xmm7
7501 						movss		xmm2, [edi+(2*3+2)*4]
7502 						movhps		xmm2, [edi+(2*3+0)*4]
7503 						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
7504 						movss		xmm7, [esi+2*4]
7505 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7506 						mulps		xmm7, xmm2
7507 						addps		xmm6, xmm7
7508 						movss		xmm3, [edi+(3*3+0)*4]
7509 						movhps		xmm3, [edi+(3*3+1)*4]
7510 						movss		xmm7, [esi+3*4]
7511 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7512 						mulps		xmm7, xmm3
7513 						addps		xmm6, xmm7
7514 						movss		xmm4, [edi+(4*3+2)*4]
7515 						movhps		xmm4, [edi+(4*3+0)*4]
7516 						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
7517 						movss		xmm7, [esi+4*4]
7518 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7519 						mulps		xmm7, xmm4
7520 						addps		xmm6, xmm7
7521 						movss		xmm5, [edi+(5*3+0)*4]
7522 						movhps		xmm5, [edi+(5*3+1)*4]
7523 						movss		xmm7, [esi+5*4]
7524 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
7525 						mulps		xmm7, xmm5
7526 						addps		xmm6, xmm7
7527 						STORE1( 0, xmm6, xmm7 )
7528 						STORE2HI( 4, xmm6, xmm7 )
7529 					}
7530 					return;
7531 				}
7532 				case 4: {		// 6x4 * 6x1
7533 					__asm {
7534 						mov			esi, vPtr
7535 						mov			edi, mPtr
7536 						mov			eax, dstPtr
7537 						movlps		xmm3, [edi+(0*4+0)*4]
7538 						movhps		xmm3, [edi+(0*4+2)*4]
7539 						movss		xmm4, [esi+0*4]
7540 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7541 						mulps		xmm3, xmm4
7542 						movlps		xmm5, [edi+(1*4+0)*4]
7543 						movhps		xmm5, [edi+(1*4+2)*4]
7544 						movss		xmm6, [esi+1*4]
7545 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7546 						mulps		xmm5, xmm6
7547 						addps		xmm3, xmm5
7548 						movlps		xmm4, [edi+(2*4+0)*4]
7549 						movhps		xmm4, [edi+(2*4+2)*4]
7550 						movss		xmm6, [esi+2*4]
7551 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7552 						mulps		xmm4, xmm6
7553 						addps		xmm3, xmm4
7554 						movlps		xmm5, [edi+(3*4+0)*4]
7555 						movhps		xmm5, [edi+(3*4+2)*4]
7556 						movss		xmm6, [esi+3*4]
7557 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7558 						mulps		xmm5, xmm6
7559 						addps		xmm3, xmm5
7560 						movlps		xmm4, [edi+(4*4+0)*4]
7561 						movhps		xmm4, [edi+(4*4+2)*4]
7562 						movss		xmm6, [esi+4*4]
7563 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7564 						mulps		xmm4, xmm6
7565 						addps		xmm3, xmm4
7566 						movlps		xmm5, [edi+(5*4+0)*4]
7567 						movhps		xmm5, [edi+(5*4+2)*4]
7568 						movss		xmm6, [esi+5*4]
7569 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7570 						mulps		xmm5, xmm6
7571 						addps		xmm3, xmm5
7572 						STORE4( 0, xmm3, xmm7 )
7573 					}
7574 					return;
7575 				}
7576 				case 5: {		// 6x5 * 6x1
7577 					__asm {
7578 						mov			esi, vPtr
7579 						mov			edi, mPtr
7580 						mov			eax, dstPtr
7581 						movlps		xmm6, [edi+(0*5+0)*4]
7582 						movhps		xmm6, [edi+(0*5+2)*4]
7583 						movss		xmm0, [esi+0*4]
7584 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7585 						mulps		xmm6, xmm0
7586 						movlps		xmm7, [edi+(1*5+0)*4]
7587 						movhps		xmm7, [edi+(1*5+2)*4]
7588 						movss		xmm1, [esi+1*4]
7589 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7590 						mulps		xmm7, xmm1
7591 						addps		xmm6, xmm7
7592 						movlps		xmm7, [edi+(2*5+0)*4]
7593 						movhps		xmm7, [edi+(2*5+2)*4]
7594 						movss		xmm2, [esi+2*4]
7595 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7596 						mulps		xmm7, xmm2
7597 						addps		xmm6, xmm7
7598 						movlps		xmm7, [edi+(3*5+0)*4]
7599 						movhps		xmm7, [edi+(3*5+2)*4]
7600 						movss		xmm3, [esi+3*4]
7601 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7602 						mulps		xmm7, xmm3
7603 						addps		xmm6, xmm7
7604 						movlps		xmm7, [edi+(4*5+0)*4]
7605 						movhps		xmm7, [edi+(4*5+2)*4]
7606 						movss		xmm4, [esi+4*4]
7607 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7608 						mulps		xmm7, xmm4
7609 						addps		xmm6, xmm7
7610 						movlps		xmm7, [edi+(5*5+0)*4]
7611 						movhps		xmm7, [edi+(5*5+2)*4]
7612 						movss		xmm5, [esi+5*4]
7613 						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
7614 						mulps		xmm7, xmm5
7615 						addps		xmm6, xmm7
7616 						STORE4( 0, xmm6, xmm7 )
7617 						movss		xmm6, [edi+(0*5+4)*4]
7618 						mulss		xmm6, xmm0
7619 						movss		xmm7, [edi+(1*5+4)*4]
7620 						mulss		xmm7, xmm1
7621 						addss		xmm6, xmm7
7622 						movss		xmm7, [edi+(2*5+4)*4]
7623 						mulss		xmm7, xmm2
7624 						addss		xmm6, xmm7
7625 						movss		xmm7, [edi+(3*5+4)*4]
7626 						mulss		xmm7, xmm3
7627 						addss		xmm6, xmm7
7628 						movss		xmm7, [edi+(4*5+4)*4]
7629 						mulss		xmm7, xmm4
7630 						addss		xmm6, xmm7
7631 						movss		xmm7, [edi+(5*5+4)*4]
7632 						mulss		xmm7, xmm5
7633 						addss		xmm6, xmm7
7634 						STORE1( 16, xmm6, xmm7 )
7635 					}
7636 					return;
7637 				}
7638 				case 6: {		// 6x6 * 6x1
7639 					__asm {
7640 						mov			esi, vPtr
7641 						mov			edi, mPtr
7642 						mov			eax, dstPtr
7643 						movlps		xmm0, [esi+0*4]
7644 						movlps		xmm1, [esi+2*4]
7645 						movlps		xmm2, [esi+4*4]
7646 						movaps		xmm3, xmm0
7647 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7648 						mulps		xmm3, [edi+(0*6+0)*4]
7649 						movlps		xmm5, [edi+(1*6+0)*4]
7650 						movhps		xmm5, [edi+(1*6+2)*4]
7651 						movaps		xmm6, xmm0
7652 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7653 						mulps		xmm5, xmm6
7654 						addps		xmm3, xmm5
7655 						movaps		xmm6, xmm1
7656 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7657 						mulps		xmm6, [edi+(2*6+0)*4]
7658 						addps		xmm3, xmm6
7659 						movaps		xmm6, xmm1
7660 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7661 						movlps		xmm5, [edi+(3*6+0)*4]
7662 						movhps		xmm5, [edi+(3*6+2)*4]
7663 						mulps		xmm5, xmm6
7664 						addps		xmm3, xmm5
7665 						movaps		xmm6, xmm2
7666 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7667 						mulps		xmm6, [edi+(4*6+0)*4]
7668 						addps		xmm3, xmm6
7669 						movaps		xmm6, xmm2
7670 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7671 						movlps		xmm5, [edi+(5*6+0)*4]
7672 						movhps		xmm5, [edi+(5*6+2)*4]
7673 						mulps		xmm5, xmm6
7674 						addps		xmm3, xmm5
7675 						STORE4( 0, xmm3, xmm7 )
7676 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7677 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7678 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
7679 						movlps		xmm3, [edi+(0*6+4)*4]
7680 						movhps		xmm3, [edi+(1*6+4)*4]
7681 						mulps		xmm3, xmm0
7682 						movlps		xmm4, [edi+(2*6+4)*4]
7683 						movhps		xmm4, [edi+(3*6+4)*4]
7684 						mulps		xmm4, xmm1
7685 						addps		xmm3, xmm4
7686 						movlps		xmm5, [edi+(4*6+4)*4]
7687 						movhps		xmm5, [edi+(5*6+4)*4]
7688 						mulps		xmm5, xmm2
7689 						addps		xmm3, xmm5
7690 						movhlps		xmm4, xmm3
7691 						addps		xmm3, xmm4
7692 						STORE2LO( 16, xmm3, xmm7 )
7693 					}
7694 					return;
7695 				}
7696 				default: {
7697 					for ( int i = 0; i < numColumns; i++ ) {
7698 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7699 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
7700 						mPtr++;
7701 					}
7702 					return;
7703 				}
7704 			}
7705 			break;
7706 		default:
7707 			int numRows = mat.GetNumRows();
7708 			for ( int i = 0; i < numColumns; i++ ) {
7709 				mPtr = mat.ToFloatPtr() + i;
7710 				float sum = mPtr[0] * vPtr[0];
7711 				for ( int j = 1; j < numRows; j++ ) {
7712 					mPtr += numColumns;
7713 					sum += mPtr[0] * vPtr[j];
7714 				}
7715 				dstPtr[i] STOREC sum;
7716 			}
7717 			break;
7718 	}
7719 
7720 #undef STOREC
7721 #undef STORE4
7722 #undef STORE2HI
7723 #undef STORE2LO
7724 #undef STORE1
7725 }
7726 
7727 /*
7728 ============
idSIMD_SSE::MatX_TransposeMultiplySubVecX
7730 
7731 	optimizes the following matrix multiplications:
7732 
7733 	Nx6 * Nx1
7734 	6xN * 6x1
7735 
7736 	with N in the range [1-6]
7737 ============
7738 */
MatX_TransposeMultiplySubVecX(idVecX & dst,const idMatX & mat,const idVecX & vec)7739 void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
7740 #define STORE1( offset, reg1, reg2 )		\
7741 	__asm movss		reg2, [eax+offset]		\
7742 	__asm subss		reg2, reg1				\
7743 	__asm movss		[eax+offset], reg2
7744 #define STORE2LO( offset, reg1, reg2 )		\
7745 	__asm movlps	reg2, [eax+offset]		\
7746 	__asm subps		reg2, reg1				\
7747 	__asm movlps	[eax+offset], reg2
7748 #define STORE2HI( offset, reg1, reg2 )		\
7749 	__asm movhps	reg2, [eax+offset]		\
7750 	__asm subps		reg2, reg1				\
7751 	__asm movhps	[eax+offset], reg2
7752 #define STORE4( offset, reg1, reg2 )		\
7753 	__asm movlps	reg2, [eax+offset]		\
7754 	__asm movhps	reg2, [eax+offset+8]	\
7755 	__asm subps		reg2, reg1				\
7756 	__asm movlps	[eax+offset], reg2		\
7757 	__asm movhps	[eax+offset+8], reg2
7758 #define STOREC		-=
7759 
7760 	int numColumns;
7761 	const float *mPtr, *vPtr;
7762 	float *dstPtr;
7763 
7764 	assert( vec.GetSize() >= mat.GetNumRows() );
7765 	assert( dst.GetSize() >= mat.GetNumColumns() );
7766 
7767 	mPtr = mat.ToFloatPtr();
7768 	vPtr = vec.ToFloatPtr();
7769 	dstPtr = dst.ToFloatPtr();
7770 	numColumns = mat.GetNumColumns();
7771 	switch( mat.GetNumRows() ) {
7772 		case 1:
7773 			switch( numColumns ) {
7774 				case 6: {		// 1x6 * 1x1
7775 					__asm {
7776 						mov			esi, vPtr
7777 						mov			edi, mPtr
7778 						mov			eax, dstPtr
7779 						movss		xmm0, [esi]
7780 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7781 						movaps		xmm1, xmm0
7782 						mulps		xmm0, [edi]
7783 						mulps		xmm1, [edi+16]
7784 						STORE4( 0, xmm0, xmm2 )
7785 						STORE2LO( 16, xmm1, xmm3 )
7786 					}
7787 					return;
7788 				}
7789 				default: {
7790 					for ( int i = 0; i < numColumns; i++ ) {
7791 						dstPtr[i] STOREC *(mPtr) * vPtr[0];
7792 						mPtr++;
7793 					}
7794 					return;
7795 				}
7796 			}
7797 			break;
7798 		case 2:
7799 			switch( numColumns ) {
7800 				case 6: {		// 2x6 * 2x1
7801 					__asm {
7802 						mov			esi, vPtr
7803 						mov			edi, mPtr
7804 						mov			eax, dstPtr
7805 						movlps		xmm0, [esi]
7806 						movaps		xmm1, xmm0
7807 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
7808 						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
7809 						movaps		xmm2, [edi]
7810 						mulps		xmm2, xmm0
7811 						movlps		xmm3, [edi+24]
7812 						movhps		xmm3, [edi+32]
7813 						mulps		xmm3, xmm1
7814 						addps		xmm2, xmm3
7815 						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7816 						movlps		xmm4, [edi+16]
7817 						movhps		xmm4, [edi+40]
7818 						mulps		xmm4, xmm0
7819 						movhlps		xmm3, xmm4
7820 						addps		xmm3, xmm4
7821 						STORE4( 0, xmm2, xmm5 )
7822 						STORE2LO( 16, xmm3, xmm6 )
7823 					}
7824 					return;
7825 				}
7826 				default: {
7827 					for ( int i = 0; i < numColumns; i++ ) {
7828 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
7829 						mPtr++;
7830 					}
7831 					return;
7832 				}
7833 			}
7834 			break;
7835 		case 3:
7836 			switch( numColumns ) {
7837 				case 6: {		// 3x6 * 3x1
7838 					__asm {
7839 						mov			esi, vPtr
7840 						mov			edi, mPtr
7841 						mov			eax, dstPtr
7842 						movlps		xmm0, [esi+0*4]
7843 						movss		xmm1, [esi+2*4]
7844 						movlps		xmm3, [edi+(0*6+0)*4]
7845 						movhps		xmm3, [edi+(0*6+2)*4]
7846 						movaps		xmm4, xmm0
7847 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
7848 						mulps		xmm3, xmm4
7849 						movlps		xmm5, [edi+(1*6+0)*4]
7850 						movhps		xmm5, [edi+(1*6+2)*4]
7851 						movaps		xmm6, xmm0
7852 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7853 						mulps		xmm5, xmm6
7854 						addps		xmm3, xmm5
7855 						movlps		xmm4, [edi+(2*6+0)*4]
7856 						movhps		xmm4, [edi+(2*6+2)*4]
7857 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
7858 						mulps		xmm4, xmm1
7859 						addps		xmm3, xmm4
7860 						STORE4( 0, xmm3, xmm7 )
7861 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7862 						movlps		xmm3, [edi+(0*6+4)*4]
7863 						movhps		xmm3, [edi+(1*6+4)*4]
7864 						mulps		xmm3, xmm0
7865 						movhlps		xmm4, xmm3
7866 						addps		xmm3, xmm4
7867 						movlps		xmm5, [edi+(2*6+4)*4]
7868 						mulps		xmm5, xmm1
7869 						addps		xmm3, xmm5
7870 						STORE2LO( 16, xmm3, xmm7 )
7871 					}
7872 					return;
7873 				}
7874 				default: {
7875 					for ( int i = 0; i < numColumns; i++ ) {
7876 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
7877 						mPtr++;
7878 					}
7879 					return;
7880 				}
7881 			}
7882 			break;
7883 		case 4:
7884 			switch( numColumns ) {
7885 				case 6: {		// 4x6 * 4x1
7886 					__asm {
7887 						mov			esi, vPtr
7888 						mov			edi, mPtr
7889 						mov			eax, dstPtr
7890 						movlps		xmm0, [esi+0*4]
7891 						movlps		xmm1, [esi+2*4]
7892 						movaps		xmm3, xmm0
7893 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7894 						mulps		xmm3, [edi+(0*6+0)*4]
7895 						movlps		xmm5, [edi+(1*6+0)*4]
7896 						movhps		xmm5, [edi+(1*6+2)*4]
7897 						movaps		xmm6, xmm0
7898 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7899 						mulps		xmm5, xmm6
7900 						addps		xmm3, xmm5
7901 						movlps		xmm4, [edi+(2*6+0)*4]
7902 						movhps		xmm4, [edi+(2*6+2)*4]
7903 						movaps		xmm6, xmm1
7904 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7905 						mulps		xmm4, xmm6
7906 						addps		xmm3, xmm4
7907 						movlps		xmm5, [edi+(3*6+0)*4]
7908 						movhps		xmm5, [edi+(3*6+2)*4]
7909 						movaps		xmm6, xmm1
7910 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7911 						mulps		xmm5, xmm6
7912 						addps		xmm3, xmm5
7913 						STORE4( 0, xmm3, xmm7 )
7914 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7915 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7916 						movlps		xmm3, [edi+(0*6+4)*4]
7917 						movhps		xmm3, [edi+(1*6+4)*4]
7918 						mulps		xmm3, xmm0
7919 						movlps		xmm4, [edi+(2*6+4)*4]
7920 						movhps		xmm4, [edi+(3*6+4)*4]
7921 						mulps		xmm4, xmm1
7922 						addps		xmm3, xmm4
7923 						movhlps		xmm4, xmm3
7924 						addps		xmm3, xmm4
7925 						STORE2LO( 16, xmm3, xmm7 )
7926 					}
7927 					return;
7928 				}
7929 				default: {
7930 					for ( int i = 0; i < numColumns; i++ ) {
7931 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7932 								*(mPtr+3*numColumns) * vPtr[3];
7933 						mPtr++;
7934 					}
7935 					return;
7936 				}
7937 			}
7938 			break;
7939 		case 5:
7940 			switch( numColumns ) {
7941 				case 6: {		// 5x6 * 5x1
7942 					__asm {
7943 						mov			esi, vPtr
7944 						mov			edi, mPtr
7945 						mov			eax, dstPtr
7946 						movlps		xmm0, [esi+0*4]
7947 						movlps		xmm1, [esi+2*4]
7948 						movss		xmm2, [esi+4*4]
7949 						movaps		xmm3, xmm0
7950 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
7951 						mulps		xmm3, [edi+(0*6+0)*4]
7952 						movlps		xmm5, [edi+(1*6+0)*4]
7953 						movhps		xmm5, [edi+(1*6+2)*4]
7954 						movaps		xmm6, xmm0
7955 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7956 						mulps		xmm5, xmm6
7957 						addps		xmm3, xmm5
7958 						movaps		xmm6, xmm1
7959 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
7960 						mulps		xmm6, [edi+(2*6+0)*4]
7961 						addps		xmm3, xmm6
7962 						movlps		xmm5, [edi+(3*6+0)*4]
7963 						movhps		xmm5, [edi+(3*6+2)*4]
7964 						movaps		xmm6, xmm1
7965 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
7966 						mulps		xmm5, xmm6
7967 						addps		xmm3, xmm5
7968 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
7969 						movaps		xmm4, xmm2
7970 						mulps		xmm4, [edi+(4*6+0)*4]
7971 						addps		xmm3, xmm4
7972 						STORE4( 0, xmm3, xmm7 )
7973 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
7974 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
7975 						movlps		xmm3, [edi+(0*6+4)*4]
7976 						movhps		xmm3, [edi+(1*6+4)*4]
7977 						mulps		xmm3, xmm0
7978 						movlps		xmm4, [edi+(2*6+4)*4]
7979 						movhps		xmm4, [edi+(3*6+4)*4]
7980 						mulps		xmm4, xmm1
7981 						addps		xmm3, xmm4
7982 						movhlps		xmm4, xmm3
7983 						addps		xmm3, xmm4
7984 						movlps		xmm5, [edi+(4*6+4)*4]
7985 						mulps		xmm5, xmm2
7986 						addps		xmm3, xmm5
7987 						STORE2LO( 16, xmm3, xmm7 )
7988 					}
7989 					return;
7990 				}
7991 				default: {
7992 					for ( int i = 0; i < numColumns; i++ ) {
7993 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
7994 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
7995 						mPtr++;
7996 					}
7997 					return;
7998 				}
7999 			}
8000 			break;
8001 		case 6:
8002 			switch( numColumns ) {
8003 				case 1: {		// 6x1 * 6x1
8004 					__asm {
8005 						mov			esi, vPtr
8006 						mov			edi, mPtr
8007 						mov			eax, dstPtr
8008 						movlps		xmm0, [esi]
8009 						movhps		xmm0, [esi+8]
8010 						movlps		xmm1, [esi+16]
8011 						mulps		xmm0, [edi]
8012 						mulps		xmm1, [edi+16]
8013 						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
8014 						addps		xmm0, xmm1
8015 						movhlps		xmm2, xmm0
8016 						addss		xmm2, xmm0
8017 						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
8018 						addss		xmm2, xmm0
8019 						STORE1( 0, xmm2, xmm3 )
8020 					}
8021 					return;
8022 				}
8023 				case 2: {		// 6x2 * 6x1
8024 					__asm {
8025 						mov			esi, vPtr
8026 						mov			edi, mPtr
8027 						mov			eax, dstPtr
8028 						movlps		xmm0, [esi+0*4]
8029 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8030 						movaps		xmm6, [edi+0*4]
8031 						mulps		xmm6, xmm0
8032 						movlps		xmm1, [esi+2*4]
8033 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8034 						movaps		xmm7, [edi+4*4]
8035 						mulps		xmm7, xmm1
8036 						addps		xmm6, xmm7
8037 						movlps		xmm2, [esi+4*4]
8038 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8039 						movaps		xmm7, [edi+8*4]
8040 						mulps		xmm7, xmm2
8041 						addps		xmm6, xmm7
8042 						movhlps		xmm3, xmm6
8043 						addps		xmm3, xmm6
8044 						STORE2LO( 0, xmm3, xmm7 )
8045 					}
8046 					return;
8047 				}
8048 				case 3: {		// 6x3 * 6x1
8049 					__asm {
8050 						mov			esi, vPtr
8051 						mov			edi, mPtr
8052 						mov			eax, dstPtr
8053 						movss		xmm0, [edi+(0*3+2)*4]
8054 						movhps		xmm0, [edi+(0*3+0)*4]
8055 						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
8056 						movss		xmm6, [esi+0*4]
8057 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8058 						mulps		xmm6, xmm0
8059 						movss		xmm1, [edi+(1*3+0)*4]
8060 						movhps		xmm1, [edi+(1*3+1)*4]
8061 						movss		xmm7, [esi+1*4]
8062 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8063 						mulps		xmm7, xmm1
8064 						addps		xmm6, xmm7
8065 						movss		xmm2, [edi+(2*3+2)*4]
8066 						movhps		xmm2, [edi+(2*3+0)*4]
8067 						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
8068 						movss		xmm7, [esi+2*4]
8069 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8070 						mulps		xmm7, xmm2
8071 						addps		xmm6, xmm7
8072 						movss		xmm3, [edi+(3*3+0)*4]
8073 						movhps		xmm3, [edi+(3*3+1)*4]
8074 						movss		xmm7, [esi+3*4]
8075 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8076 						mulps		xmm7, xmm3
8077 						addps		xmm6, xmm7
8078 						movss		xmm4, [edi+(4*3+2)*4]
8079 						movhps		xmm4, [edi+(4*3+0)*4]
8080 						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
8081 						movss		xmm7, [esi+4*4]
8082 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8083 						mulps		xmm7, xmm4
8084 						addps		xmm6, xmm7
8085 						movss		xmm5, [edi+(5*3+0)*4]
8086 						movhps		xmm5, [edi+(5*3+1)*4]
8087 						movss		xmm7, [esi+5*4]
8088 						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
8089 						mulps		xmm7, xmm5
8090 						addps		xmm6, xmm7
8091 						STORE1( 0, xmm6, xmm7 )
8092 						STORE2HI( 4, xmm6, xmm7 )
8093 					}
8094 					return;
8095 				}
8096 				case 4: {		// 6x4 * 6x1
8097 					__asm {
8098 						mov			esi, vPtr
8099 						mov			edi, mPtr
8100 						mov			eax, dstPtr
8101 						movlps		xmm3, [edi+(0*4+0)*4]
8102 						movhps		xmm3, [edi+(0*4+2)*4]
8103 						movss		xmm4, [esi+0*4]
8104 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8105 						mulps		xmm3, xmm4
8106 						movlps		xmm5, [edi+(1*4+0)*4]
8107 						movhps		xmm5, [edi+(1*4+2)*4]
8108 						movss		xmm6, [esi+1*4]
8109 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8110 						mulps		xmm5, xmm6
8111 						addps		xmm3, xmm5
8112 						movlps		xmm4, [edi+(2*4+0)*4]
8113 						movhps		xmm4, [edi+(2*4+2)*4]
8114 						movss		xmm6, [esi+2*4]
8115 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8116 						mulps		xmm4, xmm6
8117 						addps		xmm3, xmm4
8118 						movlps		xmm5, [edi+(3*4+0)*4]
8119 						movhps		xmm5, [edi+(3*4+2)*4]
8120 						movss		xmm6, [esi+3*4]
8121 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8122 						mulps		xmm5, xmm6
8123 						addps		xmm3, xmm5
8124 						movlps		xmm4, [edi+(4*4+0)*4]
8125 						movhps		xmm4, [edi+(4*4+2)*4]
8126 						movss		xmm6, [esi+4*4]
8127 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8128 						mulps		xmm4, xmm6
8129 						addps		xmm3, xmm4
8130 						movlps		xmm5, [edi+(5*4+0)*4]
8131 						movhps		xmm5, [edi+(5*4+2)*4]
8132 						movss		xmm6, [esi+5*4]
8133 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8134 						mulps		xmm5, xmm6
8135 						addps		xmm3, xmm5
8136 						STORE4( 0, xmm3, xmm7 )
8137 					}
8138 					return;
8139 				}
8140 				case 5: {		// 6x5 * 6x1
8141 					__asm {
8142 						mov			esi, vPtr
8143 						mov			edi, mPtr
8144 						mov			eax, dstPtr
8145 						movlps		xmm6, [edi+(0*5+0)*4]
8146 						movhps		xmm6, [edi+(0*5+2)*4]
8147 						movss		xmm0, [esi+0*4]
8148 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8149 						mulps		xmm6, xmm0
8150 						movlps		xmm7, [edi+(1*5+0)*4]
8151 						movhps		xmm7, [edi+(1*5+2)*4]
8152 						movss		xmm1, [esi+1*4]
8153 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
8154 						mulps		xmm7, xmm1
8155 						addps		xmm6, xmm7
8156 						movlps		xmm7, [edi+(2*5+0)*4]
8157 						movhps		xmm7, [edi+(2*5+2)*4]
8158 						movss		xmm2, [esi+2*4]
8159 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
8160 						mulps		xmm7, xmm2
8161 						addps		xmm6, xmm7
8162 						movlps		xmm7, [edi+(3*5+0)*4]
8163 						movhps		xmm7, [edi+(3*5+2)*4]
8164 						movss		xmm3, [esi+3*4]
8165 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8166 						mulps		xmm7, xmm3
8167 						addps		xmm6, xmm7
8168 						movlps		xmm7, [edi+(4*5+0)*4]
8169 						movhps		xmm7, [edi+(4*5+2)*4]
8170 						movss		xmm4, [esi+4*4]
8171 						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8172 						mulps		xmm7, xmm4
8173 						addps		xmm6, xmm7
8174 						movlps		xmm7, [edi+(5*5+0)*4]
8175 						movhps		xmm7, [edi+(5*5+2)*4]
8176 						movss		xmm5, [esi+5*4]
8177 						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
8178 						mulps		xmm7, xmm5
8179 						addps		xmm6, xmm7
8180 						STORE4( 0, xmm6, xmm7 )
8181 						movss		xmm6, [edi+(0*5+4)*4]
8182 						mulss		xmm6, xmm0
8183 						movss		xmm7, [edi+(1*5+4)*4]
8184 						mulss		xmm7, xmm1
8185 						addss		xmm6, xmm7
8186 						movss		xmm7, [edi+(2*5+4)*4]
8187 						mulss		xmm7, xmm2
8188 						addss		xmm6, xmm7
8189 						movss		xmm7, [edi+(3*5+4)*4]
8190 						mulss		xmm7, xmm3
8191 						addss		xmm6, xmm7
8192 						movss		xmm7, [edi+(4*5+4)*4]
8193 						mulss		xmm7, xmm4
8194 						addss		xmm6, xmm7
8195 						movss		xmm7, [edi+(5*5+4)*4]
8196 						mulss		xmm7, xmm5
8197 						addss		xmm6, xmm7
8198 						STORE1( 16, xmm6, xmm7 )
8199 					}
8200 					return;
8201 				}
8202 				case 6: {		// 6x6 * 6x1
8203 					__asm {
8204 						mov			esi, vPtr
8205 						mov			edi, mPtr
8206 						mov			eax, dstPtr
8207 						movlps		xmm0, [esi+0*4]
8208 						movlps		xmm1, [esi+2*4]
8209 						movlps		xmm2, [esi+4*4]
8210 						movaps		xmm3, xmm0
8211 						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
8212 						mulps		xmm3, [edi+(0*6+0)*4]
8213 						movlps		xmm5, [edi+(1*6+0)*4]
8214 						movhps		xmm5, [edi+(1*6+2)*4]
8215 						movaps		xmm6, xmm0
8216 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8217 						mulps		xmm5, xmm6
8218 						addps		xmm3, xmm5
8219 						movaps		xmm6, xmm1
8220 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8221 						mulps		xmm6, [edi+(2*6+0)*4]
8222 						addps		xmm3, xmm6
8223 						movaps		xmm6, xmm1
8224 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8225 						movlps		xmm5, [edi+(3*6+0)*4]
8226 						movhps		xmm5, [edi+(3*6+2)*4]
8227 						mulps		xmm5, xmm6
8228 						addps		xmm3, xmm5
8229 						movaps		xmm6, xmm2
8230 						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
8231 						mulps		xmm6, [edi+(4*6+0)*4]
8232 						addps		xmm3, xmm6
8233 						movaps		xmm6, xmm2
8234 						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8235 						movlps		xmm5, [edi+(5*6+0)*4]
8236 						movhps		xmm5, [edi+(5*6+2)*4]
8237 						mulps		xmm5, xmm6
8238 						addps		xmm3, xmm5
8239 						STORE4( 0, xmm3, xmm7 )
8240 						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
8241 						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
8242 						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
8243 						movlps		xmm3, [edi+(0*6+4)*4]
8244 						movhps		xmm3, [edi+(1*6+4)*4]
8245 						mulps		xmm3, xmm0
8246 						movlps		xmm4, [edi+(2*6+4)*4]
8247 						movhps		xmm4, [edi+(3*6+4)*4]
8248 						mulps		xmm4, xmm1
8249 						addps		xmm3, xmm4
8250 						movlps		xmm5, [edi+(4*6+4)*4]
8251 						movhps		xmm5, [edi+(5*6+4)*4]
8252 						mulps		xmm5, xmm2
8253 						addps		xmm3, xmm5
8254 						movhlps		xmm4, xmm3
8255 						addps		xmm3, xmm4
8256 						STORE2LO( 16, xmm3, xmm7 )
8257 					}
8258 					return;
8259 				}
8260 				default: {
8261 					for ( int i = 0; i < numColumns; i++ ) {
8262 						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
8263 								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
8264 						mPtr++;
8265 					}
8266 					return;
8267 				}
8268 			}
8269 			break;
8270 		default:
8271 			int numRows = mat.GetNumRows();
8272 			for ( int i = 0; i < numColumns; i++ ) {
8273 				mPtr = mat.ToFloatPtr() + i;
8274 				float sum = mPtr[0] * vPtr[0];
8275 				for ( int j = 1; j < numRows; j++ ) {
8276 					mPtr += numColumns;
8277 					sum += mPtr[0] * vPtr[j];
8278 				}
8279 				dstPtr[i] STOREC sum;
8280 			}
8281 			break;
8282 	}
8283 
8284 #undef STOREC
8285 #undef STORE4
8286 #undef STORE2HI
8287 #undef STORE2LO
8288 #undef STORE1
8289 }
8290 
8291 /*
8292 ============
8293 idSIMD_SSE::MatX_MultiplyMatX
8294 
8295 	optimizes the following matrix multiplications:
8296 
8297 	NxN * Nx6
8298 	6xN * Nx6
8299 	Nx6 * 6xN
8300 	6x6 * 6xN
8301 
8302 	with N in the range [1-6].
8303 
8304 	The hot cache clock cycle counts are generally better for the SIMD version than the
8305 	FPU version. At times up to 40% fewer clock cycles on a P3. In practice, however,
8306 	the results are poor, probably due to memory access.
8307 ============
8308 */
MatX_MultiplyMatX(idMatX & dst,const idMatX & m1,const idMatX & m2)8309 void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
8310 	int i, j, k, l, n;
8311 	float *dstPtr;
8312 	const float *m1Ptr, *m2Ptr;
8313 	double sum;
8314 
8315 	assert( m1.GetNumColumns() == m2.GetNumRows() );
8316 
8317 	dstPtr = dst.ToFloatPtr();
8318 	m1Ptr = m1.ToFloatPtr();
8319 	m2Ptr = m2.ToFloatPtr();
8320 	k = m1.GetNumRows();
8321 	l = m2.GetNumColumns();
8322 	n = m1.GetNumColumns();
8323 
8324 	switch( n ) {
8325 		case 1: {
8326 			if ( !(l^6) ) {
8327 				switch( k ) {
8328 					case 1:	{			// 1x1 * 1x6, no precision loss compared to FPU version
8329 						__asm {
8330 							mov			esi, m2Ptr
8331 							mov			edi, m1Ptr
8332 							mov			eax, dstPtr
8333 							movss		xmm0, [edi]
8334 							shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
8335 							movaps		xmm1, [esi]
8336 							mulps		xmm1, xmm0
8337 							movaps		[eax], xmm1
8338 							movlps		xmm2, [esi+16]
8339 							mulps		xmm2, xmm0
8340 							movlps		[eax+16], xmm2
8341 						}
8342 						return;
8343 					}
8344 					case 6: {			// 6x1 * 1x6, no precision loss compared to FPU version
8345 						__asm {
8346 							mov			esi, m2Ptr
8347 							mov			edi, m1Ptr
8348 							mov			eax, dstPtr
8349 							xorps		xmm1, xmm1
8350 							movaps		xmm0, [edi]
8351 							movlps		xmm1, [edi+16]
8352 							movlhps		xmm1, xmm0
8353 							movhlps		xmm2, xmm0
8354 							movlhps		xmm2, xmm1
8355 							// row 0 and 1
8356 							movaps		xmm3, [esi]
8357 							movaps		xmm4, xmm3
8358 							shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8359 							movaps		xmm5, xmm3
8360 							shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8361 							movaps		xmm6, xmm3
8362 							shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
8363 							mulps		xmm4, xmm0
8364 							mulps		xmm5, xmm1
8365 							mulps		xmm6, xmm2
8366 							movaps		[eax], xmm4
8367 							movaps		[eax+16], xmm5
8368 							movaps		[eax+32], xmm6
8369 							// row 2 and 3
8370 							movaps		xmm4, xmm3
8371 							shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
8372 							movaps		xmm5, xmm3
8373 							shufps		xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
8374 							shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
8375 							mulps		xmm4, xmm0
8376 							mulps		xmm5, xmm1
8377 							mulps		xmm3, xmm2
8378 							movaps		[eax+48], xmm4
8379 							movaps		[eax+64], xmm5
8380 							movaps		[eax+80], xmm3
8381 							// row 4 and 5
8382 							movlps		xmm3, [esi+16]
8383 							movaps		xmm4, xmm3
8384 							shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
8385 							movaps		xmm5, xmm3
8386 							shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
8387 							shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
8388 							mulps		xmm4, xmm0
8389 							mulps		xmm5, xmm1
8390 							mulps		xmm3, xmm2
8391 							movaps		[eax+96], xmm4
8392 							movaps		[eax+112], xmm5
8393 							movaps		[eax+128], xmm3
8394 						}
8395 						return;
8396 					}
8397 				}
8398 			}
8399 			for ( i = 0; i < k; i++ ) {
8400 				m2Ptr = m2.ToFloatPtr();
8401 				for ( j = 0; j < l; j++ ) {
8402 					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
8403 					m2Ptr++;
8404 				}
8405 				m1Ptr++;
8406 			}
8407 			break;
8408 		}
8409 		case 2: {
8410 			if ( !(l^6) ) {
8411 				switch( k ) {
8412 					case 2: {			// 2x2 * 2x6
8413 
8414 						#define MUL_Nx2_2x6_INIT								\
8415 						__asm mov		esi, m2Ptr								\
8416 						__asm mov		edi, m1Ptr								\
8417 						__asm mov		eax, dstPtr								\
8418 						__asm movaps	xmm0, [esi]								\
8419 						__asm movlps	xmm1, [esi+16]							\
8420 						__asm movhps	xmm1, [esi+40]							\
8421 						__asm movlps	xmm2, [esi+24]							\
8422 						__asm movhps	xmm2, [esi+32]
8423 
8424 						#define MUL_Nx2_2x6_ROW2( row )							\
8425 						__asm movaps	xmm3, [edi+row*16]						\
8426 						__asm movaps	xmm5, xmm0								\
8427 						__asm movaps	xmm4, xmm3								\
8428 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8429 						__asm mulps		xmm5, xmm4								\
8430 						__asm movaps	xmm4, xmm3								\
8431 						__asm movaps	xmm6, xmm2								\
8432 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 )	\
8433 						__asm mulps		xmm6, xmm4								\
8434 						__asm addps		xmm5, xmm6								\
8435 						__asm movaps	[eax+row*48], xmm5						\
8436 						__asm movaps	xmm4, xmm3								\
8437 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 )	\
8438 						__asm movaps	xmm7, xmm1								\
8439 						__asm mulps		xmm7, xmm4								\
8440 						__asm movaps	xmm4, xmm3								\
8441 						__asm movaps	xmm5, xmm0								\
8442 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )	\
8443 						__asm mulps		xmm5, xmm4								\
8444 						__asm movaps	xmm4, xmm3								\
8445 						__asm movaps	xmm6, xmm2								\
8446 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 )	\
8447 						__asm mulps		xmm6, xmm4								\
8448 						__asm addps		xmm5, xmm6								\
8449 						__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )	\
8450 						__asm movaps	xmm6, xmm1								\
8451 						__asm mulps		xmm6, xmm3								\
8452 						__asm movaps	xmm4, xmm7								\
8453 						__asm movlhps	xmm7, xmm6								\
8454 						__asm movhlps	xmm6, xmm4								\
8455 						__asm addps		xmm6, xmm7								\
8456 						__asm movlps	[eax+row*48+16], xmm6					\
8457 						__asm movlps	[eax+row*48+24], xmm5					\
8458 						__asm movhps	[eax+row*48+32], xmm5					\
8459 						__asm movhps	[eax+row*48+40], xmm6
8460 
8461 						MUL_Nx2_2x6_INIT
8462 						MUL_Nx2_2x6_ROW2( 0 )
8463 
8464 						return;
8465 					}
8466 					case 6: {			// 6x2 * 2x6
8467 
8468 						MUL_Nx2_2x6_INIT
8469 						MUL_Nx2_2x6_ROW2( 0 )
8470 						MUL_Nx2_2x6_ROW2( 1 )
8471 						MUL_Nx2_2x6_ROW2( 2 )
8472 
8473 						return;
8474 					}
8475 				}
8476 			}
8477 			for ( i = 0; i < k; i++ ) {
8478 				m2Ptr = m2.ToFloatPtr();
8479 				for ( j = 0; j < l; j++ ) {
8480 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
8481 					m2Ptr++;
8482 				}
8483 				m1Ptr += 2;
8484 			}
8485 			break;
8486 		}
8487 		case 3: {
8488 			if ( !(l^6) ) {
8489 				switch( k ) {
8490 					case 3: {			// 3x3 * 3x6
8491 						__asm {
8492 							mov		esi, m2Ptr
8493 							mov		edi, m1Ptr
8494 							mov		eax, dstPtr
8495 							movaps	xmm5, xmmword ptr [esi]
8496 							movlps	xmm6, qword ptr [esi+24]
8497 							movhps	xmm6, qword ptr [esi+32]
8498 							movaps	xmm7, xmmword ptr [esi+48]
8499 							movss	xmm0, dword ptr [edi]
8500 							shufps	xmm0, xmm0, 0
8501 							mulps	xmm0, xmm5
8502 							movss	xmm1, dword ptr [edi+4]
8503 							shufps	xmm1, xmm1, 0
8504 							mulps	xmm1, xmm6
8505 							movss	xmm2, dword ptr [edi+8]
8506 							shufps	xmm2, xmm2, 0
8507 							mulps	xmm2, xmm7
8508 							addps	xmm0, xmm1
8509 							addps	xmm0, xmm2
8510 							movaps	xmmword ptr [eax], xmm0
8511 							movss	xmm3, dword ptr [edi+12]
8512 							shufps	xmm3, xmm3, 0
8513 							mulps	xmm3, xmm5
8514 							movss	xmm4, dword ptr [edi+16]
8515 							shufps	xmm4, xmm4, 0
8516 							mulps	xmm4, xmm6
8517 							movss	xmm0, dword ptr [edi+20]
8518 							shufps	xmm0, xmm0, 0
8519 							mulps	xmm0, xmm7
8520 							addps	xmm3, xmm4
8521 							addps	xmm0, xmm3
8522 							movlps	qword ptr [eax+24], xmm0
8523 							movhps	qword ptr [eax+32], xmm0
8524 							movss	xmm1, dword ptr [edi+24]
8525 							shufps	xmm1, xmm1, 0
8526 							mulps	xmm1, xmm5
8527 							movss	xmm2, dword ptr [edi+28]
8528 							shufps	xmm2, xmm2, 0
8529 							mulps	xmm2, xmm6
8530 							movss	xmm3, dword ptr [edi+32]
8531 							shufps	xmm3, xmm3, 0
8532 							mulps	xmm3, xmm7
8533 							addps	xmm1, xmm2
8534 							addps	xmm1, xmm3
8535 							movaps	xmmword ptr [eax+48], xmm1
8536 							movlps	xmm5, qword ptr [esi+16]
8537 							movlps	xmm6, qword ptr [esi+40]
8538 							movlps	xmm7, qword ptr [esi+64]
8539 							shufps	xmm5, xmm5, 0x44
8540 							shufps	xmm6, xmm6, 0x44
8541 							shufps	xmm7, xmm7, 0x44
8542 							movaps	xmm3, xmmword ptr [edi]
8543 							movlps	xmm4, qword ptr [edi+16]
8544 							movaps	xmm0, xmm3
8545 							shufps	xmm0, xmm0, 0xF0
8546 							mulps	xmm0, xmm5
8547 							movaps	xmm1, xmm3
8548 							shufps	xmm1, xmm4, 0x05
8549 							mulps	xmm1, xmm6
8550 							shufps	xmm3, xmm4, 0x5A
8551 							mulps	xmm3, xmm7
8552 							addps	xmm1, xmm0
8553 							addps	xmm1, xmm3
8554 							movlps	qword ptr [eax+16], xmm1
8555 							movhps	qword ptr [eax+40], xmm1
8556 							movss	xmm0, dword ptr [edi+24]
8557 							shufps	xmm0, xmm0, 0
8558 							mulps	xmm0, xmm5
8559 							movss	xmm2, dword ptr [edi+28]
8560 							shufps	xmm2, xmm2, 0
8561 							mulps	xmm2, xmm6
8562 							movss	xmm4, dword ptr [edi+32]
8563 							shufps	xmm4, xmm4, 0
8564 							mulps	xmm4, xmm7
8565 							addps	xmm0, xmm2
8566 							addps	xmm0, xmm4
8567 							movlps	qword ptr [eax+64], xmm0
8568 						}
8569 						return;
8570 					}
8571 					case 6: {			// 6x3 * 3x6
8572 						#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT						\
8573 						__asm mov			esi, m2Ptr								\
8574 						__asm mov			edi, m1Ptr								\
8575 						__asm mov			eax, dstPtr								\
8576 						__asm movlps		xmm0, [esi+ 0*4]						\
8577 						__asm movhps		xmm0, [esi+ 2*4]						\
8578 						__asm movlps		xmm1, [esi+ 6*4]						\
8579 						__asm movhps		xmm1, [esi+ 8*4]						\
8580 						__asm movlps		xmm2, [esi+12*4]						\
8581 						__asm movhps		xmm2, [esi+14*4]
8582 
8583 						#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row )				\
8584 						__asm movss			xmm3, [edi+(row*3+0)*4]					\
8585 						__asm shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8586 						__asm mulps			xmm3, xmm0								\
8587 						__asm movss			xmm4, [edi+(row*3+1)*4]					\
8588 						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8589 						__asm mulps			xmm4, xmm1								\
8590 						__asm addps			xmm3, xmm4								\
8591 						__asm movss			xmm5, [edi+(row*3+2)*4]					\
8592 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8593 						__asm mulps			xmm5, xmm2								\
8594 						__asm addps			xmm3, xmm5								\
8595 						__asm movlps		[eax+(row*6+0)*4], xmm3					\
8596 						__asm movhps		[eax+(row*6+2)*4], xmm3
8597 
8598 						#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6						\
8599 						__asm movlps		xmm0, [esi+ 4*4]						\
8600 						__asm movlps		xmm1, [esi+10*4]						\
8601 						__asm movlps		xmm2, [esi+16*4]						\
8602 						__asm shufps		xmm0, xmm0, 0x44						\
8603 						__asm shufps		xmm1, xmm1, 0x44						\
8604 						__asm shufps		xmm2, xmm2, 0x44						\
8605 						__asm movlps		xmm3, [edi+0*4]							\
8606 						__asm movhps		xmm3, [edi+2*4]							\
8607 						__asm movaps		xmm4, xmm3								\
8608 						__asm movaps		xmm5, xmm3								\
8609 						__asm shufps		xmm3, xmm3, 0xF0						\
8610 						__asm mulps			xmm3, xmm0								\
8611 						__asm movlps		xmm6, [edi+4*4]							\
8612 						__asm movhps		xmm6, [edi+6*4]							\
8613 						__asm shufps		xmm4, xmm6, 0x05						\
8614 						__asm mulps			xmm4, xmm1								\
8615 						__asm addps			xmm3, xmm4								\
8616 						__asm shufps		xmm5, xmm6, 0x5A						\
8617 						__asm mulps			xmm5, xmm2								\
8618 						__asm addps			xmm3, xmm5								\
8619 						__asm movlps		[eax+4*4], xmm3							\
8620 						__asm movhps		[eax+10*4], xmm3						\
8621 						__asm movaps		xmm5, xmm6								\
8622 						__asm movlps		xmm3, [edi+8*4]							\
8623 						__asm movhps		xmm3, [edi+10*4]						\
8624 						__asm movaps		xmm4, xmm3								\
8625 						__asm shufps		xmm5, xmm3, 0x5A						\
8626 						__asm mulps			xmm5, xmm0								\
8627 						__asm shufps		xmm6, xmm3, 0xAF						\
8628 						__asm mulps			xmm6, xmm1								\
8629 						__asm addps			xmm5, xmm6								\
8630 						__asm shufps		xmm4, xmm4, 0xF0						\
8631 						__asm mulps			xmm4, xmm2								\
8632 						__asm addps			xmm4, xmm5								\
8633 						__asm movlps		[eax+16*4], xmm4						\
8634 						__asm movhps		[eax+22*4], xmm4						\
8635 						__asm movlps		xmm6, [edi+12*4]						\
8636 						__asm movhps		xmm6, [edi+14*4]						\
8637 						__asm movaps		xmm5, xmm6								\
8638 						__asm movaps		xmm4, xmm6								\
8639 						__asm shufps		xmm6, xmm6, 0xF0						\
8640 						__asm mulps			xmm6, xmm0								\
8641 						__asm movlps		xmm3, [edi+16*4]						\
8642 						__asm shufps		xmm5, xmm3, 0x05						\
8643 						__asm mulps			xmm5, xmm1								\
8644 						__asm addps			xmm5, xmm6								\
8645 						__asm shufps		xmm4, xmm3, 0x5A						\
8646 						__asm mulps			xmm4, xmm2								\
8647 						__asm addps			xmm4, xmm5								\
8648 						__asm movlps		[eax+28*4], xmm4						\
8649 						__asm movhps		[eax+34*4], xmm4
8650 
8651 						MUL_Nx3_3x6_FIRST4COLUMNS_INIT
8652 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
8653 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
8654 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
8655 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
8656 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
8657 						MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
8658 						MUL_Nx3_3x6_LAST2COLUMNS_ROW6
8659 
8660 						return;
8661 					}
8662 				}
8663 			}
8664 			for ( i = 0; i < k; i++ ) {
8665 				m2Ptr = m2.ToFloatPtr();
8666 				for ( j = 0; j < l; j++ ) {
8667 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
8668 					m2Ptr++;
8669 				}
8670 				m1Ptr += 3;
8671 			}
8672 			break;
8673 		}
8674 		case 4: {
8675 			if ( !(l^6) ) {
8676 				switch( k ) {
8677 					case 4: {			// 4x4 * 4x6
8678 
8679 						#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT						\
8680 						__asm mov			esi, m2Ptr								\
8681 						__asm mov			edi, m1Ptr								\
8682 						__asm mov			eax, dstPtr								\
8683 						__asm movlps		xmm0, [esi+ 0*4]						\
8684 						__asm movhps		xmm0, [esi+ 2*4]						\
8685 						__asm movlps		xmm1, [esi+ 6*4]						\
8686 						__asm movhps		xmm1, [esi+ 8*4]						\
8687 						__asm movlps		xmm2, [esi+12*4]						\
8688 						__asm movhps		xmm2, [esi+14*4]						\
8689 						__asm movlps		xmm3, [esi+18*4]						\
8690 						__asm movhps		xmm3, [esi+20*4]
8691 
8692 						#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row )				\
8693 						__asm movss			xmm4, [edi+row*16+0*4]					\
8694 						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8695 						__asm mulps			xmm4, xmm0								\
8696 						__asm movss			xmm5, [edi+row*16+1*4]					\
8697 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8698 						__asm mulps			xmm5, xmm1								\
8699 						__asm addps			xmm4, xmm5								\
8700 						__asm movss			xmm6, [edi+row*16+2*4]					\
8701 						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8702 						__asm mulps			xmm6, xmm2								\
8703 						__asm addps			xmm4, xmm6								\
8704 						__asm movss			xmm7, [edi+row*16+3*4]					\
8705 						__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8706 						__asm mulps			xmm7, xmm3								\
8707 						__asm addps			xmm4, xmm7								\
8708 						__asm movlps		[eax+row*24+0], xmm4					\
8709 						__asm movhps		[eax+row*24+8], xmm4
8710 
8711 						#define MUL_Nx4_4x6_LAST2COLUMNS_INIT						\
8712 						__asm movlps		xmm0, [esi+ 4*4]						\
8713 						__asm movlps		xmm1, [esi+10*4]						\
8714 						__asm movlps		xmm2, [esi+16*4]						\
8715 						__asm movlps		xmm3, [esi+22*4]						\
8716 						__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8717 						__asm shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8718 						__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8719 						__asm shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
8720 
8721 						#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row )				\
8722 						__asm movlps		xmm7, [edi+row*32+ 0*4]					\
8723 						__asm movhps		xmm7, [edi+row*32+ 4*4]					\
8724 						__asm movaps		xmm6, xmm7								\
8725 						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 )	\
8726 						__asm mulps			xmm6, xmm0								\
8727 						__asm shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 )	\
8728 						__asm mulps			xmm7, xmm1								\
8729 						__asm addps			xmm6, xmm7								\
8730 						__asm movlps		xmm4, [edi+row*32+ 2*4]					\
8731 						__asm movhps		xmm4, [edi+row*32+ 6*4]					\
8732 						__asm movaps		xmm5, xmm4								\
8733 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 )	\
8734 						__asm mulps			xmm5, xmm2								\
8735 						__asm addps			xmm6, xmm5								\
8736 						__asm shufps		xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 )	\
8737 						__asm mulps			xmm4, xmm3								\
8738 						__asm addps			xmm6, xmm4								\
8739 						__asm movlps		[eax+row*48+ 4*4], xmm6					\
8740 						__asm movhps		[eax+row*48+10*4], xmm6
8741 
8742 						MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8743 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8744 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8745 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8746 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8747 						MUL_Nx4_4x6_LAST2COLUMNS_INIT
8748 						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8749 						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8750 
8751 						return;
8752 					}
8753 					case 6: {			// 6x4 * 4x6
8754 
8755 						MUL_Nx4_4x6_FIRST4COLUMNS_INIT
8756 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
8757 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
8758 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
8759 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
8760 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
8761 						MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
8762 						MUL_Nx4_4x6_LAST2COLUMNS_INIT
8763 						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
8764 						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
8765 						MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
8766 
8767 						return;
8768 					}
8769 				}
8770 			}
8771 			for ( i = 0; i < k; i++ ) {
8772 				m2Ptr = m2.ToFloatPtr();
8773 				for ( j = 0; j < l; j++ ) {
8774 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8775 									 m1Ptr[3] * m2Ptr[3*l];
8776 					m2Ptr++;
8777 				}
8778 				m1Ptr += 4;
8779 			}
8780 			break;
8781 		}
8782 		case 5: {
8783 			if ( !(l^6) ) {
8784 				switch( k ) {
8785 					case 5: {			// 5x5 * 5x6
8786 
8787 						#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT						\
8788 						__asm mov			esi, m2Ptr								\
8789 						__asm mov			edi, m1Ptr								\
8790 						__asm mov			eax, dstPtr								\
8791 						__asm movlps		xmm0, [esi+ 0*4]						\
8792 						__asm movhps		xmm0, [esi+ 2*4]						\
8793 						__asm movlps		xmm1, [esi+ 6*4]						\
8794 						__asm movhps		xmm1, [esi+ 8*4]						\
8795 						__asm movlps		xmm2, [esi+12*4]						\
8796 						__asm movhps		xmm2, [esi+14*4]						\
8797 						__asm movlps		xmm3, [esi+18*4]						\
8798 						__asm movhps		xmm3, [esi+20*4]						\
8799 						__asm movlps		xmm4, [esi+24*4]						\
8800 						__asm movhps		xmm4, [esi+26*4]
8801 
8802 						#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row )				\
8803 						__asm movss			xmm6, [edi+row*20+0*4]					\
8804 						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8805 						__asm mulps			xmm6, xmm0								\
8806 						__asm movss			xmm5, [edi+row*20+1*4]					\
8807 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8808 						__asm mulps			xmm5, xmm1								\
8809 						__asm addps			xmm6, xmm5								\
8810 						__asm movss			xmm5, [edi+row*20+2*4]					\
8811 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8812 						__asm mulps			xmm5, xmm2								\
8813 						__asm addps			xmm6, xmm5								\
8814 						__asm movss			xmm5, [edi+row*20+3*4]					\
8815 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8816 						__asm mulps			xmm5, xmm3								\
8817 						__asm addps			xmm6, xmm5								\
8818 						__asm movss			xmm5, [edi+row*20+4*4]					\
8819 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
8820 						__asm mulps			xmm5, xmm4								\
8821 						__asm addps			xmm6, xmm5								\
8822 						__asm movlps		[eax+row*24+0], xmm6					\
8823 						__asm movhps		[eax+row*24+8], xmm6
8824 
8825 						#define MUL_Nx5_5x6_LAST2COLUMNS_INIT						\
8826 						__asm movlps		xmm0, [esi+ 4*4]						\
8827 						__asm movlps		xmm1, [esi+10*4]						\
8828 						__asm movlps		xmm2, [esi+16*4]						\
8829 						__asm movlps		xmm3, [esi+22*4]						\
8830 						__asm movlps		xmm4, [esi+28*4]						\
8831 						__asm shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8832 						__asm shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8833 						__asm shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8834 						__asm shufps		xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
8835 						__asm shufps		xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
8836 
8837 						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row )				\
8838 						__asm movlps		xmm7, [edi+row*40+ 0*4]					\
8839 						__asm movhps		xmm7, [edi+row*40+ 6*4]					\
8840 						__asm movaps		xmm6, xmm7								\
8841 						__asm shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 )	\
8842 						__asm mulps			xmm6, xmm0								\
8843 						__asm movaps		xmm5, xmm7								\
8844 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
8845 						__asm mulps			xmm5, xmm1								\
8846 						__asm addps			xmm6, xmm5								\
8847 						__asm movlps		xmm7, [edi+row*40+ 2*4]					\
8848 						__asm movhps		xmm7, [edi+row*40+ 8*4]					\
8849 						__asm movaps		xmm5, xmm7								\
8850 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 )	\
8851 						__asm mulps			xmm5, xmm2								\
8852 						__asm addps			xmm6, xmm5								\
8853 						__asm movaps		xmm5, xmm7								\
8854 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 )	\
8855 						__asm mulps			xmm5, xmm3								\
8856 						__asm addps			xmm6, xmm5								\
8857 						__asm movlps		xmm5, [edi+row*40+ 4*4]					\
8858 						__asm shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )	\
8859 						__asm mulps			xmm5, xmm4								\
8860 						__asm addps			xmm6, xmm5								\
8861 						__asm movlps		[eax+row*48+ 4*4], xmm6					\
8862 						__asm movhps		[eax+row*48+10*4], xmm6
8863 
8864 						#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row )					\
8865 						__asm movlps		xmm6, [edi+20*4+0*4]					\
8866 						__asm unpcklps		xmm6, xmm6								\
8867 						__asm mulps			xmm6, xmm0								\
8868 						__asm movlps		xmm5, [edi+20*4+2*4]					\
8869 						__asm unpcklps		xmm5, xmm5								\
8870 						__asm mulps			xmm5, xmm2								\
8871 						__asm addps			xmm6, xmm5								\
8872 						__asm movss			xmm5, [edi+20*4+4*4]					\
8873 						__asm unpcklps		xmm5, xmm5								\
8874 						__asm mulps			xmm5, xmm4								\
8875 						__asm addps			xmm6, xmm5								\
8876 						__asm movhlps		xmm7, xmm6								\
8877 						__asm addps			xmm6, xmm7								\
8878 						__asm movlps		[eax+row*24+4*4], xmm6
8879 
8880 						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8881 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8882 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8883 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8884 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8885 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8886 						MUL_Nx5_5x6_LAST2COLUMNS_INIT
8887 						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8888 						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8889 						MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
8890 
8891 						return;
8892 					}
8893 					case 6: {			// 6x5 * 5x6
8894 
8895 						MUL_Nx5_5x6_FIRST4COLUMNS_INIT
8896 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
8897 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
8898 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
8899 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
8900 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
8901 						MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
8902 						MUL_Nx5_5x6_LAST2COLUMNS_INIT
8903 						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
8904 						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
8905 						MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
8906 
8907 						return;
8908 					}
8909 				}
8910 			}
8911 			for ( i = 0; i < k; i++ ) {
8912 				m2Ptr = m2.ToFloatPtr();
8913 				for ( j = 0; j < l; j++ ) {
8914 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
8915 									 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
8916 					m2Ptr++;
8917 				}
8918 				m1Ptr += 5;
8919 			}
8920 			break;
8921 		}
8922 		case 6: {
8923 			switch( k ) {
8924 				case 1: {
8925 					if ( !(l^1) ) {		// 1x6 * 6x1
8926 						dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
8927 									 m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
8928 						return;
8929 					}
8930 					break;
8931 				}
8932 				case 2: {
8933 					if ( !(l^2) ) {		// 2x6 * 6x2
8934 
8935 						#define MUL_Nx6_6x2_INIT								\
8936 						__asm mov		esi, m2Ptr								\
8937 						__asm mov		edi, m1Ptr								\
8938 						__asm mov		eax, dstPtr								\
8939 						__asm movaps	xmm0, [esi]								\
8940 						__asm movaps	xmm1, [esi+16]							\
8941 						__asm movaps	xmm2, [esi+32]
8942 
8943 						#define MUL_Nx6_6x2_ROW2( row )							\
8944 						__asm movaps	xmm7, [edi+row*48+0*4]					\
8945 						__asm movaps	xmm6, xmm7								\
8946 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
8947 						__asm mulps		xmm7, xmm0								\
8948 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 )	\
8949 						__asm mulps		xmm6, xmm1								\
8950 						__asm addps		xmm7, xmm6								\
8951 						__asm movaps	xmm6, [edi+row*48+4*4]					\
8952 						__asm movaps	xmm5, xmm6								\
8953 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
8954 						__asm mulps		xmm6, xmm2								\
8955 						__asm addps		xmm7, xmm6								\
8956 						__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )	\
8957 						__asm mulps		xmm5, xmm0								\
8958 						__asm movaps	xmm6, [edi+row*48+24+2*4]				\
8959 						__asm movaps	xmm4, xmm6								\
8960 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
8961 						__asm mulps		xmm6, xmm1								\
8962 						__asm addps		xmm5, xmm6								\
8963 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 )	\
8964 						__asm mulps		xmm4, xmm2								\
8965 						__asm addps		xmm5, xmm4								\
8966 						__asm movaps	xmm4, xmm5								\
8967 						__asm movhlps	xmm5, xmm7								\
8968 						__asm movlhps	xmm7, xmm4								\
8969 						__asm addps		xmm7, xmm5								\
8970 						__asm movaps	[eax+row*16], xmm7
8971 
8972 						MUL_Nx6_6x2_INIT
8973 						MUL_Nx6_6x2_ROW2( 0 )
8974 
8975 						return;
8976 					}
8977 					break;
8978 				}
8979 				case 3: {
8980 					if ( !(l^3) ) {		// 3x6 * 6x3
8981 
8982 						#define MUL_Nx6_6x3_INIT								\
8983 						__asm mov		esi, m2Ptr								\
8984 						__asm mov		edi, m1Ptr								\
8985 						__asm mov		eax, dstPtr								\
8986 						__asm movss		xmm0, [esi+ 0*4]						\
8987 						__asm movhps	xmm0, [esi+ 1*4]						\
8988 						__asm movss		xmm1, [esi+ 3*4]						\
8989 						__asm movhps	xmm1, [esi+ 4*4]						\
8990 						__asm movss		xmm2, [esi+ 6*4]						\
8991 						__asm movhps	xmm2, [esi+ 7*4]						\
8992 						__asm movss		xmm3, [esi+ 9*4]						\
8993 						__asm movhps	xmm3, [esi+10*4]						\
8994 						__asm movss		xmm4, [esi+12*4]						\
8995 						__asm movhps	xmm4, [esi+13*4]						\
8996 						__asm movss		xmm5, [esi+15*4]						\
8997 						__asm movhps	xmm5, [esi+16*4]
8998 
8999 						#define MUL_Nx6_6x3_ROW( row )							\
9000 						__asm movss		xmm7, [edi+row*24+0]					\
9001 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9002 						__asm mulps		xmm7, xmm0								\
9003 						__asm movss		xmm6, [edi+row*24+4]					\
9004 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9005 						__asm mulps		xmm6, xmm1								\
9006 						__asm addps		xmm7, xmm6								\
9007 						__asm movss		xmm6, [edi+row*24+8]					\
9008 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9009 						__asm mulps		xmm6, xmm2								\
9010 						__asm addps		xmm7, xmm6								\
9011 						__asm movss		xmm6, [edi+row*24+12]					\
9012 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9013 						__asm mulps		xmm6, xmm3								\
9014 						__asm addps		xmm7, xmm6								\
9015 						__asm movss		xmm6, [edi+row*24+16]					\
9016 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9017 						__asm mulps		xmm6, xmm4								\
9018 						__asm addps		xmm7, xmm6								\
9019 						__asm movss		xmm6, [edi+row*24+20]					\
9020 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9021 						__asm mulps		xmm6, xmm5								\
9022 						__asm addps		xmm7, xmm6								\
9023 						__asm movss		[eax+row*12+0], xmm7					\
9024 						__asm movhps	[eax+row*12+4], xmm7
9025 
9026 						MUL_Nx6_6x3_INIT
9027 						MUL_Nx6_6x3_ROW( 0 )
9028 						MUL_Nx6_6x3_ROW( 1 )
9029 						MUL_Nx6_6x3_ROW( 2 )
9030 
9031 						return;
9032 					}
9033 					break;
9034 				}
9035 				case 4: {
9036 					if ( !(l^4) ) {		// 4x6 * 6x4
9037 
9038 						#define MUL_Nx6_6x4_INIT								\
9039 						__asm mov		esi, m2Ptr								\
9040 						__asm mov		edi, m1Ptr								\
9041 						__asm mov		eax, dstPtr								\
9042 						__asm movaps	xmm0, [esi]								\
9043 						__asm movaps	xmm1, [esi+16]							\
9044 						__asm movaps	xmm2, [esi+32]							\
9045 						__asm movaps	xmm3, [esi+48]							\
9046 						__asm movaps	xmm4, [esi+64]							\
9047 						__asm movaps	xmm5, [esi+80]
9048 
9049 						#define MUL_Nx6_6x4_ROW( row )							\
9050 						__asm movss		xmm7, [edi+row*24+0]					\
9051 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9052 						__asm mulps		xmm7, xmm0								\
9053 						__asm movss		xmm6, [edi+row*24+4]					\
9054 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9055 						__asm mulps		xmm6, xmm1								\
9056 						__asm addps		xmm7, xmm6								\
9057 						__asm movss		xmm6, [edi+row*24+8]					\
9058 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9059 						__asm mulps		xmm6, xmm2								\
9060 						__asm addps		xmm7, xmm6								\
9061 						__asm movss		xmm6, [edi+row*24+12]					\
9062 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9063 						__asm mulps		xmm6, xmm3								\
9064 						__asm addps		xmm7, xmm6								\
9065 						__asm movss		xmm6, [edi+row*24+16]					\
9066 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9067 						__asm mulps		xmm6, xmm4								\
9068 						__asm addps		xmm7, xmm6								\
9069 						__asm movss		xmm6, [edi+row*24+20]					\
9070 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9071 						__asm mulps		xmm6, xmm5								\
9072 						__asm addps		xmm7, xmm6								\
9073 						__asm movaps	[eax+row*16], xmm7
9074 
9075 						MUL_Nx6_6x4_INIT
9076 						MUL_Nx6_6x4_ROW( 0 )
9077 						MUL_Nx6_6x4_ROW( 1 )
9078 						MUL_Nx6_6x4_ROW( 2 )
9079 						MUL_Nx6_6x4_ROW( 3 )
9080 
9081 						return;
9082 					}
9083 					break;
9084 				}
9085 				case 5: {
9086 					if ( !(l^5) ) {		// 5x6 * 6x5
9087 
9088 						#define MUL_Nx6_6x5_INIT								\
9089 						__asm mov		esi, m2Ptr								\
9090 						__asm mov		edi, m1Ptr								\
9091 						__asm mov		eax, dstPtr								\
9092 						__asm movaps	xmm0, [esi]								\
9093 						__asm movlps	xmm1, [esi+20]							\
9094 						__asm movhps	xmm1, [esi+28]							\
9095 						__asm movlps	xmm2, [esi+40]							\
9096 						__asm movhps	xmm2, [esi+48]							\
9097 						__asm movlps	xmm3, [esi+60]							\
9098 						__asm movhps	xmm3, [esi+68]							\
9099 						__asm movaps	xmm4, [esi+80]							\
9100 						__asm movlps	xmm5, [esi+100]							\
9101 						__asm movhps	xmm5, [esi+108]
9102 
9103 						#define MUL_Nx6_6x5_ROW( row )							\
9104 						__asm movss		xmm7, [edi+row*24+0]					\
9105 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9106 						__asm mulps		xmm7, xmm0								\
9107 						__asm fld		dword ptr [edi+(row*6+0)*4]				\
9108 						__asm fmul		dword ptr [esi+(4+0*5)*4]				\
9109 						__asm movss		xmm6, [edi+row*24+4]					\
9110 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9111 						__asm mulps		xmm6, xmm1								\
9112 						__asm addps		xmm7, xmm6								\
9113 						__asm fld		dword ptr [edi+(row*6+1)*4]				\
9114 						__asm fmul		dword ptr [esi+(4+1*5)*4]				\
9115 						__asm faddp		st(1),st								\
9116 						__asm movss		xmm6, [edi+row*24+8]					\
9117 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9118 						__asm mulps		xmm6, xmm2								\
9119 						__asm addps		xmm7, xmm6								\
9120 						__asm fld		dword ptr [edi+(row*6+2)*4]				\
9121 						__asm fmul		dword ptr [esi+(4+2*5)*4]				\
9122 						__asm faddp		st(1),st								\
9123 						__asm movss		xmm6, [edi+row*24+12]					\
9124 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9125 						__asm mulps		xmm6, xmm3								\
9126 						__asm addps		xmm7, xmm6								\
9127 						__asm fld		dword ptr [edi+(row*6+3)*4]				\
9128 						__asm fmul		dword ptr [esi+(4+3*5)*4]				\
9129 						__asm faddp		st(1),st								\
9130 						__asm movss		xmm6, [edi+row*24+16]					\
9131 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9132 						__asm mulps		xmm6, xmm4								\
9133 						__asm addps		xmm7, xmm6								\
9134 						__asm fld		dword ptr [edi+(row*6+4)*4]				\
9135 						__asm fmul		dword ptr [esi+(4+4*5)*4]				\
9136 						__asm faddp		st(1),st								\
9137 						__asm movss		xmm6, [edi+row*24+20]					\
9138 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9139 						__asm mulps		xmm6, xmm5								\
9140 						__asm addps		xmm7, xmm6								\
9141 						__asm fld		dword ptr [edi+(row*6+5)*4]				\
9142 						__asm fmul		dword ptr [esi+(4+5*5)*4]				\
9143 						__asm faddp		st(1),st								\
9144 						__asm fstp		dword ptr [eax+(row*5+4)*4]				\
9145 						__asm movlps	[eax+row*20], xmm7						\
9146 						__asm movhps	[eax+row*20+8], xmm7
9147 
9148 						MUL_Nx6_6x5_INIT
9149 						MUL_Nx6_6x5_ROW( 0 )
9150 						MUL_Nx6_6x5_ROW( 1 )
9151 						MUL_Nx6_6x5_ROW( 2 )
9152 						MUL_Nx6_6x5_ROW( 3 )
9153 						MUL_Nx6_6x5_ROW( 4 )
9154 
9155 						return;
9156 					}
9157 					break;
9158 				}
9159 				case 6: {
9160 					switch( l ) {
9161 						case 1: {		// 6x6 * 6x1
9162 							__asm {
9163 								mov			esi, m2Ptr
9164 								mov			edi, m1Ptr
9165 								mov			eax, dstPtr
9166 								movlps		xmm7, qword ptr [esi]
9167 								movlps		xmm6, qword ptr [esi+8]
9168 								shufps		xmm7, xmm7, 0x44
9169 								shufps		xmm6, xmm6, 0x44
9170 								movlps		xmm0, qword ptr [edi    ]
9171 								movhps		xmm0, qword ptr [edi+ 24]
9172 								mulps		xmm0, xmm7
9173 								movlps		xmm3, qword ptr [edi+  8]
9174 								movhps		xmm3, qword ptr [edi+ 32]
9175 								mulps		xmm3, xmm6
9176 								movlps		xmm1, qword ptr [edi+ 48]
9177 								movhps		xmm1, qword ptr [edi+ 72]
9178 								mulps		xmm1, xmm7
9179 								movlps		xmm2, qword ptr [edi+ 96]
9180 								movhps		xmm2, qword ptr [edi+120]
9181 								mulps		xmm2, xmm7
9182 								movlps		xmm4, qword ptr [edi+ 56]
9183 								movhps		xmm4, qword ptr [edi+ 80]
9184 								movlps		xmm5, qword ptr [edi+104]
9185 								movhps		xmm5, qword ptr [edi+128]
9186 								mulps		xmm4, xmm6
9187 								movlps		xmm7, qword ptr [esi+16]
9188 								addps		xmm0, xmm3
9189 								shufps		xmm7, xmm7, 0x44
9190 								mulps		xmm5, xmm6
9191 								addps		xmm1, xmm4
9192 								movlps		xmm3, qword ptr [edi+ 16]
9193 								movhps		xmm3, qword ptr [edi+ 40]
9194 								addps		xmm2, xmm5
9195 								movlps		xmm4, qword ptr [edi+ 64]
9196 								movhps		xmm4, qword ptr [edi+ 88]
9197 								mulps		xmm3, xmm7
9198 								movlps		xmm5, qword ptr [edi+112]
9199 								movhps		xmm5, qword ptr [edi+136]
9200 								addps		xmm0, xmm3
9201 								mulps		xmm4, xmm7
9202 								mulps		xmm5, xmm7
9203 								addps		xmm1, xmm4
9204 								addps		xmm2, xmm5
9205 								movaps		xmm6, xmm0
9206 								shufps		xmm0, xmm1, 0x88
9207 								shufps		xmm6, xmm1, 0xDD
9208 								movaps		xmm7, xmm2
9209 								shufps		xmm7, xmm2, 0x88
9210 								shufps		xmm2, xmm2, 0xDD
9211 								addps		xmm0, xmm6
9212 								addps		xmm2, xmm7
9213 								movlps		[eax], xmm0
9214 								movhps		[eax+8], xmm0
9215 								movlps		[eax+16], xmm2
9216 							}
9217 							return;
9218 						}
9219 						case 2: {		// 6x6 * 6x2
9220 
9221 							MUL_Nx6_6x2_INIT
9222 							MUL_Nx6_6x2_ROW2( 0 )
9223 							MUL_Nx6_6x2_ROW2( 1 )
9224 							MUL_Nx6_6x2_ROW2( 2 )
9225 
9226 							return;
9227 						}
9228 						case 3: {		// 6x6 * 6x3
9229 
9230 							MUL_Nx6_6x3_INIT
9231 							MUL_Nx6_6x3_ROW( 0 )
9232 							MUL_Nx6_6x3_ROW( 1 )
9233 							MUL_Nx6_6x3_ROW( 2 )
9234 							MUL_Nx6_6x3_ROW( 3 )
9235 							MUL_Nx6_6x3_ROW( 4 )
9236 							MUL_Nx6_6x3_ROW( 5 )
9237 
9238 							return;
9239 						}
9240 						case 4: {		// 6x6 * 6x4
9241 
9242 							MUL_Nx6_6x4_INIT
9243 							MUL_Nx6_6x4_ROW( 0 )
9244 							MUL_Nx6_6x4_ROW( 1 )
9245 							MUL_Nx6_6x4_ROW( 2 )
9246 							MUL_Nx6_6x4_ROW( 3 )
9247 							MUL_Nx6_6x4_ROW( 4 )
9248 							MUL_Nx6_6x4_ROW( 5 )
9249 
9250 							return;
9251 						}
9252 						case 5: {		// 6x6 * 6x5
9253 
9254 							MUL_Nx6_6x5_INIT
9255 							MUL_Nx6_6x5_ROW( 0 )
9256 							MUL_Nx6_6x5_ROW( 1 )
9257 							MUL_Nx6_6x5_ROW( 2 )
9258 							MUL_Nx6_6x5_ROW( 3 )
9259 							MUL_Nx6_6x5_ROW( 4 )
9260 							MUL_Nx6_6x5_ROW( 5 )
9261 
9262 							return;
9263 						}
9264 						case 6: {		// 6x6 * 6x6
9265 							__asm {
9266 								mov			ecx, dword ptr m2Ptr
9267 								movlps		xmm3, qword ptr [ecx+72]
9268 								mov			edx, dword ptr m1Ptr
9269 								// Loading first 4 columns (upper 4 rows) of m2Ptr.
9270 								movaps		xmm0, xmmword ptr [ecx]
9271 								movlps		xmm1, qword ptr [ecx+24]
9272 								movhps		xmm1, qword ptr [ecx+32]
9273 								movaps		xmm2, xmmword ptr [ecx+48]
9274 								movhps		xmm3, qword ptr [ecx+80]
9275 								// Calculating first 4 elements in the first row of the destination matrix.
9276 								movss		xmm4, dword ptr [edx]
9277 								movss		xmm5, dword ptr [edx+4]
9278 								mov			eax, dword ptr dstPtr
9279 								shufps		xmm4, xmm4, 0
9280 								movss		xmm6, dword ptr [edx+8]
9281 								shufps		xmm5, xmm5, 0
9282 								movss		xmm7, dword ptr [edx+12]
9283 								mulps		xmm4, xmm0
9284 								shufps		xmm6, xmm6, 0
9285 								shufps		xmm7, xmm7, 0
9286 								mulps		xmm5, xmm1
9287 								mulps		xmm6, xmm2
9288 								addps		xmm5, xmm4
9289 								mulps		xmm7, xmm3
9290 								addps		xmm6, xmm5
9291 								addps		xmm7, xmm6
9292 								movaps		xmmword ptr [eax], xmm7
9293 								// Calculating first 4 elements in the second row of the destination matrix.
9294 								movss		xmm4, dword ptr [edx+24]
9295 								shufps		xmm4, xmm4, 0
9296 								mulps		xmm4, xmm0
9297 								movss		xmm5, dword ptr [edx+28]
9298 								shufps		xmm5, xmm5, 0
9299 								mulps		xmm5, xmm1
9300 								movss		xmm6, dword ptr [edx+32]
9301 								shufps		xmm6, xmm6, 0
9302 								movss		xmm7, dword ptr [edx+36]
9303 								shufps		xmm7, xmm7, 0
9304 								mulps		xmm6, xmm2
9305 								mulps		xmm7, xmm3
9306 								addps		xmm7, xmm6
9307 								addps		xmm5, xmm4
9308 								addps		xmm7, xmm5
9309 								// Calculating first 4 elements in the third row of the destination matrix.
9310 								movss		xmm4, dword ptr [edx+48]
9311 								movss		xmm5, dword ptr [edx+52]
9312 								movlps		qword ptr [eax+24], xmm7 ; save 2nd
9313 								movhps		qword ptr [eax+32], xmm7 ; row
9314 								movss		xmm6, dword ptr [edx+56]
9315 								movss		xmm7, dword ptr [edx+60]
9316 								shufps		xmm4, xmm4, 0
9317 								shufps		xmm5, xmm5, 0
9318 								shufps		xmm6, xmm6, 0
9319 								shufps		xmm7, xmm7, 0
9320 								mulps		xmm4, xmm0
9321 								mulps		xmm5, xmm1
9322 								mulps		xmm6, xmm2
9323 								mulps		xmm7, xmm3
9324 								addps		xmm5, xmm4
9325 								addps		xmm7, xmm6
9326 								addps		xmm7, xmm5
9327 								movaps		xmmword ptr [eax+48], xmm7
9328 								// Calculating first 4 elements in the fourth row of the destination matrix.
9329 								movss		xmm4, dword ptr [edx+72]
9330 								movss		xmm5, dword ptr [edx+76]
9331 								movss		xmm6, dword ptr [edx+80]
9332 								movss		xmm7, dword ptr [edx+84]
9333 								shufps		xmm4, xmm4, 0
9334 								shufps		xmm5, xmm5, 0
9335 								shufps		xmm6, xmm6, 0
9336 								shufps		xmm7, xmm7, 0
9337 								mulps		xmm4, xmm0
9338 								mulps		xmm5, xmm1
9339 								mulps		xmm6, xmm2
9340 								mulps		xmm7, xmm3
9341 								addps		xmm4, xmm5
9342 								addps		xmm6, xmm4
9343 								addps		xmm7, xmm6
9344 								movlps		qword ptr [eax+72], xmm7
9345 								movhps		qword ptr [eax+80], xmm7
9346 								// Calculating first 4 elements in the fifth row of the destination matrix.
9347 								movss		xmm4, dword ptr [edx+96]
9348 								movss		xmm5, dword ptr [edx+100]
9349 								movss		xmm6, dword ptr [edx+104]
9350 								movss		xmm7, dword ptr [edx+108]
9351 								shufps		xmm4, xmm4, 0
9352 								shufps		xmm5, xmm5, 0
9353 								shufps		xmm6, xmm6, 0
9354 								shufps		xmm7, xmm7, 0
9355 								mulps		xmm4, xmm0
9356 								mulps		xmm5, xmm1
9357 								mulps		xmm6, xmm2
9358 								mulps		xmm7, xmm3
9359 								addps		xmm5, xmm4
9360 								addps		xmm7, xmm6
9361 								addps		xmm7, xmm5
9362 								movaps		xmmword ptr [eax+96], xmm7
9363 								// Calculating first 4 elements in the sixth row of the destination matrix.
9364 								movss		xmm4, dword ptr [edx+120]
9365 								movss		xmm5, dword ptr [edx+124]
9366 								movss		xmm6, dword ptr [edx+128]
9367 								movss		xmm7, dword ptr [edx+132]
9368 								shufps		xmm4, xmm4, 0
9369 								shufps		xmm5, xmm5, 0
9370 								shufps		xmm6, xmm6, 0
9371 								shufps		xmm7, xmm7, 0
9372 								mulps		xmm4, xmm0
9373 								mulps		xmm5, xmm1
9374 								mulps		xmm6, xmm2
9375 								mulps		xmm7, xmm3
9376 								addps		xmm4, xmm5
9377 								addps		xmm6, xmm4
9378 								addps		xmm7, xmm6
9379 								movhps		qword ptr [eax+128], xmm7
9380 								movlps		qword ptr [eax+120], xmm7
9381 								// Loading first 4 columns (lower 2 rows) of m2Ptr.
9382 								movlps		xmm0, qword ptr [ecx+96]
9383 								movhps		xmm0, qword ptr [ecx+104]
9384 								movlps		xmm1, qword ptr [ecx+120]
9385 								movhps		xmm1, qword ptr [ecx+128]
9386 								// Calculating first 4 elements in the first row of the destination matrix.
9387 								movss		xmm2, dword ptr [edx+16]
9388 								shufps		xmm2, xmm2, 0
9389 								movss		xmm4, dword ptr [edx+40]
9390 								movss		xmm3, dword ptr [edx+20]
9391 								movss		xmm5, dword ptr [edx+44]
9392 								movaps		xmm6, xmmword ptr [eax]
9393 								movlps		xmm7, qword ptr [eax+24]
9394 								shufps		xmm3, xmm3, 0
9395 								shufps		xmm5, xmm5, 0
9396 								movhps		xmm7, qword ptr [eax+32]
9397 								shufps		xmm4, xmm4, 0
9398 								mulps		xmm5, xmm1
9399 								mulps		xmm2, xmm0
9400 								mulps		xmm3, xmm1
9401 								mulps		xmm4, xmm0
9402 								addps		xmm6, xmm2
9403 								addps		xmm7, xmm4
9404 								addps		xmm7, xmm5
9405 								addps		xmm6, xmm3
9406 								movlps		qword ptr [eax+24], xmm7
9407 								movaps		xmmword ptr [eax], xmm6
9408 								movhps		qword ptr [eax+32], xmm7
9409 								// Calculating first 4 elements in the third row of the destination matrix.
9410 								movss		xmm2, dword ptr [edx+64]
9411 								movss		xmm4, dword ptr [edx+88]
9412 								movss		xmm5, dword ptr [edx+92]
9413 								movss		xmm3, dword ptr [edx+68]
9414 								movaps		xmm6, xmmword ptr [eax+48]
9415 								movlps		xmm7, qword ptr [eax+72]
9416 								movhps		xmm7, qword ptr [eax+80]
9417 								shufps		xmm2, xmm2, 0
9418 								shufps		xmm4, xmm4, 0
9419 								shufps		xmm5, xmm5, 0
9420 								shufps		xmm3, xmm3, 0
9421 								mulps		xmm2, xmm0
9422 								mulps		xmm4, xmm0
9423 								mulps		xmm5, xmm1
9424 								mulps		xmm3, xmm1
9425 								addps		xmm6, xmm2
9426 								addps		xmm6, xmm3
9427 								addps		xmm7, xmm4
9428 								addps		xmm7, xmm5
9429 								movlps		qword ptr [eax+72], xmm7
9430 								movaps		xmmword ptr [eax+48], xmm6
9431 								movhps		qword ptr [eax+80], xmm7
9432 								// Calculating first 4 elements in the fifth row of the destination matrix.
9433 								movss		xmm2, dword ptr [edx+112]
9434 								movss		xmm3, dword ptr [edx+116]
9435 								movaps		xmm6, xmmword ptr [eax+96]
9436 								shufps		xmm2, xmm2, 0
9437 								shufps		xmm3, xmm3, 0
9438 								mulps		xmm2, xmm0
9439 								mulps		xmm3, xmm1
9440 								addps		xmm6, xmm2
9441 								addps		xmm6, xmm3
9442 								movaps		xmmword ptr [eax+96], xmm6
9443 								// Calculating first 4 elements in the sixth row of the destination matrix.
9444 								movss		xmm4, dword ptr [edx+136]
9445 								movss		xmm5, dword ptr [edx+140]
9446 								movhps		xmm7, qword ptr [eax+128]
9447 								movlps		xmm7, qword ptr [eax+120]
9448 								shufps		xmm4, xmm4, 0
9449 								shufps		xmm5, xmm5, 0
9450 								mulps		xmm4, xmm0
9451 								mulps		xmm5, xmm1
9452 								addps		xmm7, xmm4
9453 								addps		xmm7, xmm5
9454 								// Calculating last 2 columns of the destination matrix.
9455 								movlps		xmm0, qword ptr [ecx+16]
9456 								movhps		xmm0, qword ptr [ecx+40]
9457 								movhps		qword ptr [eax+128], xmm7
9458 								movlps		qword ptr [eax+120], xmm7
9459 								movlps		xmm2, qword ptr [ecx+64]
9460 								movhps		xmm2, qword ptr [ecx+88]
9461 								movaps		xmm3, xmm2
9462 								shufps		xmm3, xmm3, 4Eh
9463 								movlps		xmm4, qword ptr [ecx+112]
9464 								movhps		xmm4, qword ptr [ecx+136]
9465 								movaps		xmm5, xmm4
9466 								shufps		xmm5, xmm5, 4Eh
9467 								movlps		xmm6, qword ptr [edx]
9468 								movhps		xmm6, qword ptr [edx+24]
9469 								movaps		xmm7, xmm6
9470 								shufps		xmm7, xmm7, 0F0h
9471 								mulps		xmm7, xmm0
9472 								shufps		xmm6, xmm6, 0A5h
9473 								movaps		xmm1, xmm0
9474 								shufps		xmm1, xmm1, 4Eh
9475 								mulps		xmm1, xmm6
9476 								addps		xmm7, xmm1
9477 								movlps		xmm6, qword ptr [edx+8]
9478 								movhps		xmm6, qword ptr [edx+32]
9479 								movaps		xmm1, xmm6
9480 								shufps		xmm1, xmm1, 0F0h
9481 								shufps		xmm6, xmm6, 0A5h
9482 								mulps		xmm1, xmm2
9483 								mulps		xmm6, xmm3
9484 								addps		xmm7, xmm1
9485 								addps		xmm7, xmm6
9486 								movhps		xmm6, qword ptr [edx+40]
9487 								movlps		xmm6, qword ptr [edx+16]
9488 								movaps		xmm1, xmm6
9489 								shufps		xmm1, xmm1, 0F0h
9490 								shufps		xmm6, xmm6, 0A5h
9491 								mulps		xmm1, xmm4
9492 								mulps		xmm6, xmm5
9493 								addps		xmm7, xmm1
9494 								addps		xmm7, xmm6
9495 								movlps		qword ptr [eax+16], xmm7
9496 								movhps		qword ptr [eax+40], xmm7
9497 								movlps		xmm6, qword ptr [edx+48]
9498 								movhps		xmm6, qword ptr [edx+72]
9499 								movaps		xmm7, xmm6
9500 								shufps		xmm7, xmm7, 0F0h
9501 								mulps		xmm7, xmm0
9502 								shufps		xmm6, xmm6, 0A5h
9503 								movaps		xmm1, xmm0
9504 								shufps		xmm1, xmm1, 4Eh
9505 								mulps		xmm1, xmm6
9506 								addps		xmm7, xmm1
9507 								movhps		xmm6, qword ptr [edx+80]
9508 								movlps		xmm6, qword ptr [edx+56]
9509 								movaps		xmm1, xmm6
9510 								shufps		xmm1, xmm1, 0F0h
9511 								shufps		xmm6, xmm6, 0A5h
9512 								mulps		xmm1, xmm2
9513 								mulps		xmm6, xmm3
9514 								addps		xmm7, xmm1
9515 								addps		xmm7, xmm6
9516 								movlps		xmm6, qword ptr [edx+64]
9517 								movhps		xmm6, qword ptr [edx+88]
9518 								movaps		xmm1, xmm6
9519 								shufps		xmm1, xmm1, 0F0h
9520 								shufps		xmm6, xmm6, 0A5h
9521 								mulps		xmm1, xmm4
9522 								mulps		xmm6, xmm5
9523 								addps		xmm7, xmm1
9524 								addps		xmm7, xmm6
9525 								movlps		qword ptr [eax+64], xmm7
9526 								movhps		qword ptr [eax+88], xmm7
9527 								movlps		xmm6, qword ptr [edx+96]
9528 								movhps		xmm6, qword ptr [edx+120]
9529 								movaps		xmm7, xmm6
9530 								shufps		xmm7, xmm7, 0F0h
9531 								mulps		xmm7, xmm0
9532 								shufps		xmm6, xmm6, 0A5h
9533 								movaps		xmm1, xmm0
9534 								shufps		xmm1, xmm1, 4Eh
9535 								mulps		xmm1, xmm6
9536 								addps		xmm7, xmm1
9537 								movlps		xmm6, qword ptr [edx+104]
9538 								movhps		xmm6, qword ptr [edx+128]
9539 								movaps		xmm1, xmm6
9540 								shufps		xmm1, xmm1, 0F0h
9541 								shufps		xmm6, xmm6, 0A5h
9542 								mulps		xmm1, xmm2
9543 								mulps		xmm6, xmm3
9544 								addps		xmm7, xmm1
9545 								addps		xmm7, xmm6
9546 								movlps		xmm6, qword ptr [edx+112]
9547 								movhps		xmm6, qword ptr [edx+136]
9548 								movaps		xmm1, xmm6
9549 								shufps		xmm1, xmm1, 0F0h
9550 								shufps		xmm6, xmm6, 0A5h
9551 								mulps		xmm1, xmm4
9552 								mulps		xmm6, xmm5
9553 								addps		xmm7, xmm1
9554 								addps		xmm7, xmm6
9555 								movlps		qword ptr [eax+112], xmm7
9556 								movhps		qword ptr [eax+136], xmm7
9557 							}
9558 							return;
9559 						}
9560 					}
9561 				}
9562 			}
9563 			for ( i = 0; i < k; i++ ) {
9564 				m2Ptr = m2.ToFloatPtr();
9565 				for ( j = 0; j < l; j++ ) {
9566 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
9567 									 m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
9568 					m2Ptr++;
9569 				}
9570 				m1Ptr += 6;
9571 			}
9572 			break;
9573 		}
9574 		default: {
9575 			for ( i = 0; i < k; i++ ) {
9576 				for ( j = 0; j < l; j++ ) {
9577 					m2Ptr = m2.ToFloatPtr() + j;
9578 					sum = m1Ptr[0] * m2Ptr[0];
9579 					for ( n = 1; n < m1.GetNumColumns(); n++ ) {
9580 						m2Ptr += l;
9581 						sum += m1Ptr[n] * m2Ptr[0];
9582 					}
9583 					*dstPtr++ = sum;
9584 				}
9585 				m1Ptr += m1.GetNumColumns();
9586 			}
9587 			break;
9588 		}
9589 	}
9590 }
9591 
9592 /*
9593 ============
9594 idSIMD_SSE::MatX_TransposeMultiplyMatX
9595 
9596 	optimizes the following transpose matrix multiplications:
9597 
9598 	Nx6 * NxN
9599 	6xN * 6x6
9600 
9601 	with N in the range [1-6].
9602 ============
9603 */
MatX_TransposeMultiplyMatX(idMatX & dst,const idMatX & m1,const idMatX & m2)9604 void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
9605 	int i, j, k, l, n;
9606 	float *dstPtr;
9607 	const float *m1Ptr, *m2Ptr;
9608 	double sum;
9609 
9610 	assert( m1.GetNumRows() == m2.GetNumRows() );
9611 
9612 	m1Ptr = m1.ToFloatPtr();
9613 	m2Ptr = m2.ToFloatPtr();
9614 	dstPtr = dst.ToFloatPtr();
9615 	k = m1.GetNumColumns();
9616 	l = m2.GetNumColumns();
9617 
9618 	switch( m1.GetNumRows() ) {
9619 		case 1:
9620 			if ( !((k^6)|(l^1)) ) {			// 1x6 * 1x1
9621 				__asm {
9622 					mov		esi, m2Ptr
9623 					mov		edi, m1Ptr
9624 					mov		eax, dstPtr
9625 					movss	xmm0, [esi]
9626 					shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
9627 					movaps	xmm1, xmm0
9628 					mulps	xmm0, [edi]
9629 					mulps	xmm1, [edi+16]
9630 					movaps	[eax], xmm0
9631 					movlps	[eax+16], xmm1
9632 				}
9633 				return;
9634 			}
9635 			for ( i = 0; i < k; i++ ) {
9636 				m2Ptr = m2.ToFloatPtr();
9637 				for ( j = 0; j < l; j++ ) {
9638 					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
9639 					m2Ptr++;
9640 				}
9641 				m1Ptr++;
9642 			}
9643 			break;
9644 		case 2:
9645 			if ( !((k^6)|(l^2)) ) {			// 2x6 * 2x2
9646 				#define MUL_2xN_2x2_INIT								\
9647 				__asm mov		esi, m2Ptr								\
9648 				__asm mov		edi, m1Ptr								\
9649 				__asm mov		eax, dstPtr								\
9650 				__asm movlps	xmm0, [esi]								\
9651 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9652 				__asm movlps	xmm1, [esi+8]							\
9653 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
9654 
9655 				#define MUL_2xN_2x2_ROW2( N, row )						\
9656 				__asm movlps	xmm6, [edi+(row+0*N)*4]					\
9657 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9658 				__asm movlps	xmm7, [edi+(row+1*N)*4]					\
9659 				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9660 				__asm mulps		xmm6, xmm0								\
9661 				__asm mulps		xmm7, xmm1								\
9662 				__asm addps		xmm6, xmm7								\
9663 				__asm movaps	[eax+(row*2)*4], xmm6
9664 
9665 				MUL_2xN_2x2_INIT
9666 				MUL_2xN_2x2_ROW2( 6, 0 )
9667 				MUL_2xN_2x2_ROW2( 6, 2 )
9668 				MUL_2xN_2x2_ROW2( 6, 4 )
9669 
9670 				return;
9671 			}
9672 			for ( i = 0; i < k; i++ ) {
9673 				m2Ptr = m2.ToFloatPtr();
9674 				for ( j = 0; j < l; j++ ) {
9675 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
9676 					m2Ptr++;
9677 				}
9678 				m1Ptr++;
9679 			}
9680 			break;
9681 		case 3:
9682 			if ( !((k^6)|(l^3)) ) {			// 3x6 * 3x3
9683 
9684 				#define MUL_3xN_3x3_INIT								\
9685 				__asm mov		esi, m2Ptr								\
9686 				__asm mov		edi, m1Ptr								\
9687 				__asm mov		eax, dstPtr								\
9688 				__asm movss		xmm0, [esi+(0*3+0)*4]					\
9689 				__asm movhps	xmm0, [esi+(0*3+1)*4]					\
9690 				__asm movss		xmm1, [esi+(1*3+0)*4]					\
9691 				__asm movhps	xmm1, [esi+(1*3+1)*4]					\
9692 				__asm movss		xmm2, [esi+(2*3+0)*4]					\
9693 				__asm movhps	xmm2, [esi+(2*3+1)*4]
9694 
9695 				#define MUL_3xN_3x3_INIT_ROW4							\
9696 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 )	\
9697 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 )	\
9698 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
9699 
9700 				#define MUL_3xN_3x3_ROW4( N, row )						\
9701 				__asm movlps	xmm3, [edi+(row+0*N+0)*4]				\
9702 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 )	\
9703 				__asm movlps	xmm4, [edi+(row+1*N+0)*4]				\
9704 				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 )	\
9705 				__asm movlps	xmm5, [edi+(row+2*N+0)*4]				\
9706 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 )	\
9707 				__asm mulps		xmm3, xmm0								\
9708 				__asm mulps		xmm4, xmm1								\
9709 				__asm mulps		xmm5, xmm2								\
9710 				__asm addps		xmm3, xmm4								\
9711 				__asm addps		xmm3, xmm5								\
9712 				__asm movaps	[eax+(row*3+0)*4], xmm3					\
9713 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9714 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9715 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9716 				__asm movlps	xmm3, [edi+(row+0*N+1)*4]				\
9717 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9718 				__asm movlps	xmm4, [edi+(row+1*N+1)*4]				\
9719 				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9720 				__asm movlps	xmm5, [edi+(row+2*N+1)*4]				\
9721 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9722 				__asm mulps		xmm3, xmm0								\
9723 				__asm mulps		xmm4, xmm1								\
9724 				__asm mulps		xmm5, xmm2								\
9725 				__asm addps		xmm3, xmm4								\
9726 				__asm addps		xmm3, xmm5								\
9727 				__asm movaps	[eax+(row*3+4)*4], xmm3					\
9728 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9729 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9730 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 )	\
9731 				__asm movlps	xmm3, [edi+(row+0*N+2)*4]				\
9732 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 )	\
9733 				__asm movlps	xmm4, [edi+(row+1*N+2)*4]				\
9734 				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 )	\
9735 				__asm movlps	xmm5, [edi+(row+2*N+2)*4]				\
9736 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 )	\
9737 				__asm mulps		xmm3, xmm0								\
9738 				__asm mulps		xmm4, xmm1								\
9739 				__asm mulps		xmm5, xmm2								\
9740 				__asm addps		xmm3, xmm4								\
9741 				__asm addps		xmm3, xmm5								\
9742 				__asm movaps	[eax+(row*3+8)*4], xmm3
9743 
9744 				#define MUL_3xN_3x3_INIT_ROW4_ROW4						\
9745 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )	\
9746 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )	\
9747 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
9748 
9749 				#define MUL_3xN_3x3_INIT_ROW4_ROW						\
9750 				__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 )	\
9751 				__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 )	\
9752 				__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
9753 
9754 				#define MUL_3xN_3x3_ROW( N, row )						\
9755 				__asm movss		xmm3, [edi+(row+0*N)*4]					\
9756 				__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9757 				__asm movss		xmm4, [edi+(row+1*N)*4]					\
9758 				__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9759 				__asm movss		xmm5, [edi+(row+2*N)*4]					\
9760 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9761 				__asm mulps		xmm3, xmm0								\
9762 				__asm mulps		xmm4, xmm1								\
9763 				__asm mulps		xmm5, xmm2								\
9764 				__asm addps		xmm3, xmm4								\
9765 				__asm addps		xmm3, xmm5								\
9766 				__asm movss		[eax+(row*3+0)*4], xmm3					\
9767 				__asm movhps	[eax+(row*3+1)*4], xmm3
9768 
9769 				MUL_3xN_3x3_INIT
9770 				MUL_3xN_3x3_INIT_ROW4
9771 				MUL_3xN_3x3_ROW4( 6, 0 )
9772 				MUL_3xN_3x3_INIT_ROW4_ROW
9773 				MUL_3xN_3x3_ROW( 6, 4 )
9774 				MUL_3xN_3x3_ROW( 6, 5 )
9775 
9776 				return;
9777 			}
9778 			for ( i = 0; i < k; i++ ) {
9779 				m2Ptr = m2.ToFloatPtr();
9780 				for ( j = 0; j < l; j++ ) {
9781 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
9782 					m2Ptr++;
9783 				}
9784 				m1Ptr++;
9785 			}
9786 			break;
9787 		case 4:
9788 			if ( !((k^6)|(l^4)) ) {			// 4x6 * 4x4
9789 
9790 				#define MUL_4xN_4x4_INIT								\
9791 				__asm mov		esi, m2Ptr								\
9792 				__asm mov		edi, m1Ptr								\
9793 				__asm mov		eax, dstPtr								\
9794 				__asm movaps	xmm0, [esi]								\
9795 				__asm movaps	xmm1, [esi+16]							\
9796 				__asm movaps	xmm2, [esi+32]							\
9797 				__asm movaps	xmm3, [esi+48]
9798 
9799 				#define MUL_4xN_4x4_ROW( N, row )						\
9800 				__asm movss		xmm7, [edi+(row+0*N)*4]					\
9801 				__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9802 				__asm mulps		xmm7, xmm0								\
9803 				__asm movss		xmm6, [edi+(row+1*N)*4]					\
9804 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9805 				__asm mulps		xmm6, xmm1								\
9806 				__asm addps		xmm7, xmm6								\
9807 				__asm movss		xmm6, [edi+(row+2*N)*4]					\
9808 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9809 				__asm mulps		xmm6, xmm2								\
9810 				__asm addps		xmm7, xmm6								\
9811 				__asm movss		xmm6, [edi+(row+3*N)*4]					\
9812 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9813 				__asm mulps		xmm6, xmm3								\
9814 				__asm addps		xmm7, xmm6								\
9815 				__asm movaps	[eax+row*16], xmm7
9816 
9817 				MUL_4xN_4x4_INIT
9818 				MUL_4xN_4x4_ROW( 6, 0 )
9819 				MUL_4xN_4x4_ROW( 6, 1 )
9820 				MUL_4xN_4x4_ROW( 6, 2 )
9821 				MUL_4xN_4x4_ROW( 6, 3 )
9822 				MUL_4xN_4x4_ROW( 6, 4 )
9823 				MUL_4xN_4x4_ROW( 6, 5 )
9824 
9825 				return;
9826 			}
9827 			for ( i = 0; i < k; i++ ) {
9828 				m2Ptr = m2.ToFloatPtr();
9829 				for ( j = 0; j < l; j++ ) {
9830 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9831 									m1Ptr[3*k] * m2Ptr[3*l];
9832 					m2Ptr++;
9833 				}
9834 				m1Ptr++;
9835 			}
9836 			break;
9837 		case 5:
9838 			if ( !((k^6)|(l^5)) ) {			// 5x6 * 5x5
9839 
9840 				#define MUL_5xN_5x5_INIT								\
9841 				__asm mov		esi, m2Ptr								\
9842 				__asm mov		edi, m1Ptr								\
9843 				__asm mov		eax, dstPtr								\
9844 				__asm movlps	xmm0, [esi+ 0*4]						\
9845 				__asm movhps	xmm0, [esi+ 2*4]						\
9846 				__asm movlps	xmm1, [esi+ 5*4]						\
9847 				__asm movhps	xmm1, [esi+ 7*4]						\
9848 				__asm movlps	xmm2, [esi+10*4]						\
9849 				__asm movhps	xmm2, [esi+12*4]						\
9850 				__asm movlps	xmm3, [esi+15*4]						\
9851 				__asm movhps	xmm3, [esi+17*4]						\
9852 				__asm movlps	xmm4, [esi+20*4]						\
9853 				__asm movhps	xmm4, [esi+22*4]
9854 
9855 				#define MUL_5xN_5x5_ROW( N, row )						\
9856 				__asm movss		xmm6, [edi+(row+0*N)*4]					\
9857 				__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9858 				__asm mulps		xmm6, xmm0								\
9859 				__asm fld		dword ptr [edi+(row+0*N)*4]				\
9860 				__asm fmul		dword ptr [esi+ 4*4]					\
9861 				__asm movss		xmm5, [edi+(row+1*N)*4]					\
9862 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9863 				__asm mulps		xmm5, xmm1								\
9864 				__asm addps		xmm6, xmm5								\
9865 				__asm fld		dword ptr [edi+(row+1*N)*4]				\
9866 				__asm fmul		dword ptr [esi+ 9*4]					\
9867 				__asm faddp		st(1),st								\
9868 				__asm movss		xmm5, [edi+(row+2*N)*4]					\
9869 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9870 				__asm mulps		xmm5, xmm2								\
9871 				__asm addps		xmm6, xmm5								\
9872 				__asm fld		dword ptr [edi+(row+2*N)*4]				\
9873 				__asm fmul		dword ptr [esi+14*4]					\
9874 				__asm faddp		st(1),st								\
9875 				__asm movss		xmm5, [edi+(row+3*N)*4]					\
9876 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9877 				__asm mulps		xmm5, xmm3								\
9878 				__asm addps		xmm6, xmm5								\
9879 				__asm fld		dword ptr [edi+(row+3*N)*4]				\
9880 				__asm fmul		dword ptr [esi+19*4]					\
9881 				__asm faddp		st(1),st								\
9882 				__asm movss		xmm5, [edi+(row+4*N)*4]					\
9883 				__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9884 				__asm mulps		xmm5, xmm4								\
9885 				__asm addps		xmm6, xmm5								\
9886 				__asm fld		dword ptr [edi+(row+4*N)*4]				\
9887 				__asm fmul		dword ptr [esi+24*4]					\
9888 				__asm faddp		st(1),st								\
9889 				__asm fstp		dword ptr [eax+(row*5+4)*4]				\
9890 				__asm movlps	[eax+(row*5+0)*4], xmm6					\
9891 				__asm movhps	[eax+(row*5+2)*4], xmm6
9892 
9893 				MUL_5xN_5x5_INIT
9894 				MUL_5xN_5x5_ROW( 6, 0 )
9895 				MUL_5xN_5x5_ROW( 6, 1 )
9896 				MUL_5xN_5x5_ROW( 6, 2 )
9897 				MUL_5xN_5x5_ROW( 6, 3 )
9898 				MUL_5xN_5x5_ROW( 6, 4 )
9899 				MUL_5xN_5x5_ROW( 6, 5 )
9900 
9901 				return;
9902 			}
9903 			for ( i = 0; i < k; i++ ) {
9904 				m2Ptr = m2.ToFloatPtr();
9905 				for ( j = 0; j < l; j++ ) {
9906 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
9907 									m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
9908 					m2Ptr++;
9909 				}
9910 				m1Ptr++;
9911 			}
9912 			break;
9913 		case 6:
9914 			if ( !(l^6) ) {
9915 				switch( k ) {
9916 					case 1: {					// 6x1 * 6x6
9917 						#define MUL_6xN_6x6_FIRST4COLUMNS_INIT					\
9918 						__asm mov		esi, m2Ptr								\
9919 						__asm mov		edi, m1Ptr								\
9920 						__asm mov		eax, dstPtr								\
9921 						__asm movlps	xmm0, [esi+ 0*4]						\
9922 						__asm movhps	xmm0, [esi+ 2*4]						\
9923 						__asm movlps	xmm1, [esi+ 6*4]						\
9924 						__asm movhps	xmm1, [esi+ 8*4]						\
9925 						__asm movlps	xmm2, [esi+12*4]						\
9926 						__asm movhps	xmm2, [esi+14*4]						\
9927 						__asm movlps	xmm3, [esi+18*4]						\
9928 						__asm movhps	xmm3, [esi+20*4]						\
9929 						__asm movlps	xmm4, [esi+24*4]						\
9930 						__asm movhps	xmm4, [esi+26*4]						\
9931 						__asm movlps	xmm5, [esi+30*4]						\
9932 						__asm movhps	xmm5, [esi+32*4]
9933 
9934 						#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row )			\
9935 						__asm movss		xmm7, [edi+(row+0*N)*4]					\
9936 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9937 						__asm mulps		xmm7, xmm0								\
9938 						__asm movss		xmm6, [edi+(row+1*N)*4]					\
9939 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9940 						__asm mulps		xmm6, xmm1								\
9941 						__asm addps		xmm7, xmm6								\
9942 						__asm movss		xmm6, [edi+(row+2*N)*4]					\
9943 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9944 						__asm mulps		xmm6, xmm2								\
9945 						__asm addps		xmm7, xmm6								\
9946 						__asm movss		xmm6, [edi+(row+3*N)*4]					\
9947 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9948 						__asm mulps		xmm6, xmm3								\
9949 						__asm addps		xmm7, xmm6								\
9950 						__asm movss		xmm6, [edi+(row+4*N)*4]					\
9951 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9952 						__asm mulps		xmm6, xmm4								\
9953 						__asm addps		xmm7, xmm6								\
9954 						__asm movss		xmm6, [edi+(row+5*N)*4]					\
9955 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
9956 						__asm mulps		xmm6, xmm5								\
9957 						__asm addps		xmm7, xmm6								\
9958 						__asm movlps	[eax+(row*6+0)*4], xmm7					\
9959 						__asm movhps	[eax+(row*6+2)*4], xmm7
9960 
9961 						#define MUL_6xN_6x6_LAST2COLUMNS_INIT					\
9962 						__asm movlps	xmm0, [esi+ 4*4]						\
9963 						__asm movlps	xmm1, [esi+10*4]						\
9964 						__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9965 						__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9966 						__asm movlps	xmm2, [esi+16*4]						\
9967 						__asm movlps	xmm3, [esi+22*4]						\
9968 						__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9969 						__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9970 						__asm movlps	xmm4, [esi+28*4]						\
9971 						__asm movlps	xmm5, [esi+34*4]						\
9972 						__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
9973 						__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
9974 
9975 						#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row )			\
9976 						__asm movlps	xmm7, [edi+(row*2+0*N)*4]				\
9977 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9978 						__asm mulps		xmm7, xmm0								\
9979 						__asm movlps	xmm6, [edi+(row*2+1*N)*4]				\
9980 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9981 						__asm mulps		xmm6, xmm1								\
9982 						__asm addps		xmm7, xmm6								\
9983 						__asm movlps	xmm6, [edi+(row*2+2*N)*4]				\
9984 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9985 						__asm mulps		xmm6, xmm2								\
9986 						__asm addps		xmm7, xmm6								\
9987 						__asm movlps	xmm6, [edi+(row*2+3*N)*4]				\
9988 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9989 						__asm mulps		xmm6, xmm3								\
9990 						__asm addps		xmm7, xmm6								\
9991 						__asm movlps	xmm6, [edi+(row*2+4*N)*4]				\
9992 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9993 						__asm mulps		xmm6, xmm4								\
9994 						__asm addps		xmm7, xmm6								\
9995 						__asm movlps	xmm6, [edi+(row*2+5*N)*4]				\
9996 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
9997 						__asm mulps		xmm6, xmm5								\
9998 						__asm addps		xmm7, xmm6								\
9999 						__asm movlps	[eax+(row*12+ 4)*4], xmm7				\
10000 						__asm movhps	[eax+(row*12+10)*4], xmm7
10001 
10002 						#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row )			\
10003 						__asm movss		xmm7, [edi+(1*N-1)*4]					\
10004 						__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10005 						__asm mulps		xmm7, xmm0								\
10006 						__asm movss		xmm6, [edi+(2*N-1)*4]					\
10007 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10008 						__asm mulps		xmm6, xmm1								\
10009 						__asm addps		xmm7, xmm6								\
10010 						__asm movss		xmm6, [edi+(3*N-1)*4]					\
10011 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10012 						__asm mulps		xmm6, xmm2								\
10013 						__asm addps		xmm7, xmm6								\
10014 						__asm movss		xmm6, [edi+(4*N-1)*4]					\
10015 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10016 						__asm mulps		xmm6, xmm3								\
10017 						__asm addps		xmm7, xmm6								\
10018 						__asm movss		xmm6, [edi+(5*N-1)*4]					\
10019 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10020 						__asm mulps		xmm6, xmm4								\
10021 						__asm addps		xmm7, xmm6								\
10022 						__asm movss		xmm6, [edi+(6*N-1)*4]					\
10023 						__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
10024 						__asm mulps		xmm6, xmm5								\
10025 						__asm addps		xmm7, xmm6								\
10026 						__asm movlps	[eax+(row*6+4)*4], xmm7
10027 
10028 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10029 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
10030 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10031 						MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
10032 
10033 						return;
10034 					}
10035 					case 2: {					// 6x2 * 6x6
10036 
10037 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10038 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
10039 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
10040 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10041 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
10042 
10043 						return;
10044 					}
10045 					case 3: {					// 6x3 * 6x6
10046 
10047 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10048 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
10049 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
10050 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
10051 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10052 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
10053 						MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
10054 
10055 						return;
10056 					}
10057 					case 4: {					// 6x4 * 6x6
10058 
10059 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10060 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
10061 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
10062 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
10063 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
10064 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10065 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
10066 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
10067 
10068 						return;
10069 					}
10070 					case 5: {					// 6x5 * 6x6
10071 
10072 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10073 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
10074 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
10075 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
10076 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
10077 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
10078 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10079 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
10080 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
10081 						MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
10082 
10083 						return;
10084 					}
10085 					case 6: {					// 6x6 * 6x6
10086 
10087 						MUL_6xN_6x6_FIRST4COLUMNS_INIT
10088 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
10089 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
10090 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
10091 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
10092 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
10093 						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
10094 						MUL_6xN_6x6_LAST2COLUMNS_INIT
10095 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
10096 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
10097 						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
10098 
10099 						return;
10100 					}
10101 				}
10102 			}
10103 			for ( i = 0; i < k; i++ ) {
10104 				m2Ptr = m2.ToFloatPtr();
10105 				for ( j = 0; j < l; j++ ) {
10106 					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
10107 									m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
10108 					m2Ptr++;
10109 				}
10110 				m1Ptr++;
10111 			}
10112 			break;
10113 		default:
10114 			for ( i = 0; i < k; i++ ) {
10115 				for ( j = 0; j < l; j++ ) {
10116 					m1Ptr = m1.ToFloatPtr() + i;
10117 					m2Ptr = m2.ToFloatPtr() + j;
10118 					sum = m1Ptr[0] * m2Ptr[0];
10119 					for ( n = 1; n < m1.GetNumRows(); n++ ) {
10120 						m1Ptr += k;
10121 						m2Ptr += l;
10122 						sum += m1Ptr[0] * m2Ptr[0];
10123 					}
10124 					*dstPtr++ = sum;
10125 				}
10126 			}
10127 		break;
10128 	}
10129 }
10130 
10131 /*
10132 ============
10133 idSIMD_SSE::MatX_LowerTriangularSolve
10134 
10135   solves x in Lx = b for the n * n sub-matrix of L
10136   if skip > 0 the first skip elements of x are assumed to be valid already
10137   L has to be a lower triangular matrix with (implicit) ones on the diagonal
10138   x == b is allowed
10139 ============
10140 */
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;						// row stride of L in floats (number of columns)
	const float *lptr;

	// the first 'skip' elements of x are already valid; nothing left to solve
	if ( skip >= n ) {
		return;
	}

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		// pack ( n, skip ) into one switch value; skip < n <= 7 so 3 bits hold each
		#define NSKIP( n, s )	((n<<3)|(s&7))
		switch( NSKIP( n, skip ) ) {
			// NOTE: fall-through between cases is intentional — entering at row 'skip'
			// computes that row and then falls through to all remaining rows of size n
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	// NOTE: intentional fall-through — start at row 'skip', finish rows 0..3
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}

	lptr = L[skip];

	// For each remaining row i: x[i] = b[i] - dot( L[i][0..i-1], x[0..i-1] ).
	// The dot product is vectorized with SSE; two code paths are used depending
	// on whether the row stride and the current pointers are 16-byte aligned.
	// this code assumes n > 4
	__asm {
		push		ebx
		mov			eax, skip				// eax = i
		shl			eax, 2					// eax = i*4
		mov			edx, n					// edx = n
		shl			edx, 2					// edx = n*4
		mov			esi, x					// esi = x
		mov			edi, lptr				// edi = lptr
		add			esi, eax
		add			edi, eax
		mov			ebx, b					// ebx = b

		// check for aligned memory
		mov			ecx, nc
		shl			ecx, 2
		or			ecx, esi
		or			ecx, edi
		and			ecx, 15
		jnz			loopurow				// any low bit set -> take unaligned path

		// aligned
	looprow:
		mov			ecx, eax				// ecx counts from -i*4 up to 0 (dot product length)
		neg			ecx
		movaps		xmm0, [esi+ecx]
		mulps		xmm0, [edi+ecx]
		add			ecx, 12*4
		jg			donedot8
	dot8:								// unrolled: accumulate 8 products per iteration
		movaps		xmm1, [esi+ecx-(8*4)]
		mulps		xmm1, [edi+ecx-(8*4)]
		addps		xmm0, xmm1
		movaps		xmm3, [esi+ecx-(4*4)]
		mulps		xmm3, [edi+ecx-(4*4)]
		addps		xmm0, xmm3
		add			ecx, 8*4
		jle			dot8
	donedot8:
		sub			ecx, 4*4
		jg			donedot4
	//dot4:
		movaps		xmm1, [esi+ecx-(4*4)]
		mulps		xmm1, [edi+ecx-(4*4)]
		addps		xmm0, xmm1
		add			ecx, 4*4
	donedot4:
		// horizontal add of the four partial sums in xmm0
		movhlps		xmm1, xmm0
		addps		xmm0, xmm1
		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss		xmm0, xmm1
		// handle the 0-3 leftover scalar products
		sub			ecx, 4*4
		jz			dot0
		add			ecx, 4
		jz			dot1
		add			ecx, 4
		jz			dot2
	//dot3:
		movss		xmm1, [esi-(3*4)]
		mulss		xmm1, [edi-(3*4)]
		addss		xmm0, xmm1
	dot2:
		movss		xmm3, [esi-(2*4)]
		mulss		xmm3, [edi-(2*4)]
		addss		xmm0, xmm3
	dot1:
		movss		xmm5, [esi-(1*4)]
		mulss		xmm5, [edi-(1*4)]
		addss		xmm0, xmm5
	dot0:
		movss		xmm1, [ebx+eax]			// x[i] = b[i] - dot
		subss		xmm1, xmm0
		movss		[esi], xmm1
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc					// advance lptr one row (+ one column)
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			looprow

		// unaligned (same structure as above but with movups + explicit loads)
	loopurow:
		mov			ecx, eax
		neg			ecx
		movups		xmm0, [esi+ecx]
		movups		xmm1, [edi+ecx]
		mulps		xmm0, xmm1
		add			ecx, 12*4
		jg			doneudot8
	udot8:
		movups		xmm1, [esi+ecx-(8*4)]
		movups		xmm2, [edi+ecx-(8*4)]
		mulps		xmm1, xmm2
		addps		xmm0, xmm1
		movups		xmm3, [esi+ecx-(4*4)]
		movups		xmm4, [edi+ecx-(4*4)]
		mulps		xmm3, xmm4
		addps		xmm0, xmm3
		add			ecx, 8*4
		jle			udot8
	doneudot8:
		sub			ecx, 4*4
		jg			doneudot4
	//udot4:
		movups		xmm1, [esi+ecx-(4*4)]
		movups		xmm2, [edi+ecx-(4*4)]
		mulps		xmm1, xmm2
		addps		xmm0, xmm1
		add			ecx, 4*4
	doneudot4:
		movhlps		xmm1, xmm0
		addps		xmm0, xmm1
		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss		xmm0, xmm1
		sub			ecx, 4*4
		jz			udot0
		add			ecx, 4
		jz			udot1
		add			ecx, 4
		jz			udot2
	//udot3:
		movss		xmm1, [esi-(3*4)]
		movss		xmm2, [edi-(3*4)]
		mulss		xmm1, xmm2
		addss		xmm0, xmm1
	udot2:
		movss		xmm3, [esi-(2*4)]
		movss		xmm4, [edi-(2*4)]
		mulss		xmm3, xmm4
		addss		xmm0, xmm3
	udot1:
		movss		xmm5, [esi-(1*4)]
		movss		xmm6, [edi-(1*4)]
		mulss		xmm5, xmm6
		addss		xmm0, xmm5
	udot0:
		movss		xmm1, [ebx+eax]			// x[i] = b[i] - dot
		subss		xmm1, xmm0
		movss		[esi], xmm1
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			loopurow
	done:
		pop			ebx
	}
}
10363 
10364 /*
10365 ============
10366 idSIMD_SSE::MatX_LowerTriangularSolveTranspose
10367 
10368   solves x in L'x = b for the n * n sub-matrix of L
10369   L has to be a lower triangular matrix with (implicit) ones on the diagonal
10370   x == b is allowed
10371 ============
10372 */
MatX_LowerTriangularSolveTranspose(const idMatX & L,float * x,const float * b,const int n)10373 void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
10374 	int nc;
10375 	const float *lptr;
10376 
10377 	lptr = L.ToFloatPtr();
10378 	nc = L.GetNumColumns();
10379 
10380 	// unrolled cases for n < 8
10381 	if ( n < 8 ) {
10382 		switch( n ) {
10383 			case 0:
10384 				return;
10385 			case 1:
10386 				x[0] = b[0];
10387 				return;
10388 			case 2:
10389 				x[1] = b[1];
10390 				x[0] = b[0] - lptr[1*nc+0] * x[1];
10391 				return;
10392 			case 3:
10393 				x[2] = b[2];
10394 				x[1] = b[1] - lptr[2*nc+1] * x[2];
10395 				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10396 				return;
10397 			case 4:
10398 				x[3] = b[3];
10399 				x[2] = b[2] - lptr[3*nc+2] * x[3];
10400 				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10401 				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10402 				return;
10403 			case 5:
10404 				x[4] = b[4];
10405 				x[3] = b[3] - lptr[4*nc+3] * x[4];
10406 				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10407 				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10408 				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10409 				return;
10410 			case 6:
10411 				x[5] = b[5];
10412 				x[4] = b[4] - lptr[5*nc+4] * x[5];
10413 				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
10414 				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10415 				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10416 				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10417 				return;
10418 			case 7:
10419 				x[6] = b[6];
10420 				x[5] = b[5] - lptr[6*nc+5] * x[6];
10421 				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
10422 				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
10423 				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
10424 				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
10425 				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
10426 				return;
10427 		}
10428 		return;
10429 	}
10430 
10431 #if 1
10432 
10433 	int i, j, m;
10434 	float *xptr;
10435 	double s0;
10436 
10437 	// if the number of columns is not a multiple of 2 we're screwed for alignment.
10438 	// however, if the number of columns is a multiple of 2 but the number of to be
10439 	// processed rows is not a multiple of 2 we can still run 8 byte aligned
10440 	m = n;
10441 	if ( m & 1 ) {
10442 
10443 		m--;
10444 		x[m] = b[m];
10445 
10446 		lptr = L.ToFloatPtr() + m * nc + m - 4;
10447 		xptr = x + m;
10448 		__asm {
10449 			push		ebx
10450 			mov			eax, m					// eax = i
10451 			mov			esi, xptr				// esi = xptr
10452 			mov			edi, lptr				// edi = lptr
10453 			mov			ebx, b					// ebx = b
10454 			mov			edx, nc					// edx = nc*sizeof(float)
10455 			shl			edx, 2
10456 		process4rows_1:
10457 			movlps		xmm0, [ebx+eax*4-16]	// load b[i-2], b[i-1]
10458 			movhps		xmm0, [ebx+eax*4-8]		// load b[i-4], b[i-3]
10459 			xor			ecx, ecx
10460 			sub			eax, m
10461 			neg			eax
10462 			jz			done4x4_1
10463 		process4x4_1:	// process 4x4 blocks
10464 			movlps		xmm2, [edi+0]
10465 			movhps		xmm2, [edi+8]
10466 			add			edi, edx
10467 			movss		xmm1, [esi+4*ecx+0]
10468 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10469 			movlps		xmm3, [edi+0]
10470 			movhps		xmm3, [edi+8]
10471 			add			edi, edx
10472 			mulps		xmm1, xmm2
10473 			subps		xmm0, xmm1
10474 			movss		xmm1, [esi+4*ecx+4]
10475 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10476 			movlps		xmm4, [edi+0]
10477 			movhps		xmm4, [edi+8]
10478 			add			edi, edx
10479 			mulps		xmm1, xmm3
10480 			subps		xmm0, xmm1
10481 			movss		xmm1, [esi+4*ecx+8]
10482 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10483 			movlps		xmm5, [edi+0]
10484 			movhps		xmm5, [edi+8]
10485 			add			edi, edx
10486 			mulps		xmm1, xmm4
10487 			subps		xmm0, xmm1
10488 			movss		xmm1, [esi+4*ecx+12]
10489 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10490 			add			ecx, 4
10491 			cmp			ecx, eax
10492 			mulps		xmm1, xmm5
10493 			subps		xmm0, xmm1
10494 			jl			process4x4_1
10495 		done4x4_1:		// process left over of the 4 rows
10496 			movlps		xmm2, [edi+0]
10497 			movhps		xmm2, [edi+8]
10498 			movss		xmm1, [esi+4*ecx]
10499 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10500 			mulps		xmm1, xmm2
10501 			subps		xmm0, xmm1
10502 			imul		ecx, edx
10503 			sub			edi, ecx
10504 			neg			eax
10505 
10506 			add			eax, m
10507 			sub			eax, 4
10508 			movaps		xmm1, xmm0
10509 			shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
10510 			movaps		xmm2, xmm0
10511 			shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
10512 			movaps		xmm3, xmm0
10513 			shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
10514 			sub			edi, edx
10515 			movss		[esi-4], xmm3			// xptr[-1] = s3
10516 			movss		xmm4, xmm3
10517 			movss		xmm5, xmm3
10518 			mulss		xmm3, [edi+8]			// lptr[-1*nc+2] * s3
10519 			mulss		xmm4, [edi+4]			// lptr[-1*nc+1] * s3
10520 			mulss		xmm5, [edi+0]			// lptr[-1*nc+0] * s3
10521 			subss		xmm2, xmm3
10522 			movss		[esi-8], xmm2			// xptr[-2] = s2
10523 			movss		xmm6, xmm2
10524 			sub			edi, edx
10525 			subss		xmm0, xmm5
10526 			subss		xmm1, xmm4
10527 			mulss		xmm2, [edi+4]			// lptr[-2*nc+1] * s2
10528 			mulss		xmm6, [edi+0]			// lptr[-2*nc+0] * s2
10529 			subss		xmm1, xmm2
10530 			movss		[esi-12], xmm1			// xptr[-3] = s1
10531 			subss		xmm0, xmm6
10532 			sub			edi, edx
10533 			cmp			eax, 4
10534 			mulss		xmm1, [edi+0]			// lptr[-3*nc+0] * s1
10535 			subss		xmm0, xmm1
10536 			movss		[esi-16], xmm0			// xptr[-4] = s0
10537 			jl			done4rows_1
10538 			sub			edi, edx
10539 			sub			edi, 16
10540 			sub			esi, 16
10541 			jmp			process4rows_1
10542 		done4rows_1:
10543 			pop			ebx
10544 		}
10545 
10546 	} else {
10547 
10548 		lptr = L.ToFloatPtr() + m * nc + m - 4;
10549 		xptr = x + m;
10550 		__asm {
10551 			push		ebx
10552 			mov			eax, m					// eax = i
10553 			mov			esi, xptr				// esi = xptr
10554 			mov			edi, lptr				// edi = lptr
10555 			mov			ebx, b					// ebx = b
10556 			mov			edx, nc					// edx = nc*sizeof(float)
10557 			shl			edx, 2
10558 		process4rows:
10559 			movlps		xmm0, [ebx+eax*4-16]	// load b[i-2], b[i-1]
10560 			movhps		xmm0, [ebx+eax*4-8]		// load b[i-4], b[i-3]
10561 			sub			eax, m
10562 			jz			done4x4
10563 			neg			eax
10564 			xor			ecx, ecx
10565 		process4x4:		// process 4x4 blocks
10566 			movlps		xmm2, [edi+0]
10567 			movhps		xmm2, [edi+8]
10568 			add			edi, edx
10569 			movss		xmm1, [esi+4*ecx+0]
10570 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10571 			movlps		xmm3, [edi+0]
10572 			movhps		xmm3, [edi+8]
10573 			add			edi, edx
10574 			mulps		xmm1, xmm2
10575 			subps		xmm0, xmm1
10576 			movss		xmm1, [esi+4*ecx+4]
10577 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10578 			movlps		xmm4, [edi+0]
10579 			movhps		xmm4, [edi+8]
10580 			add			edi, edx
10581 			mulps		xmm1, xmm3
10582 			subps		xmm0, xmm1
10583 			movss		xmm1, [esi+4*ecx+8]
10584 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10585 			movlps		xmm5, [edi+0]
10586 			movhps		xmm5, [edi+8]
10587 			add			edi, edx
10588 			mulps		xmm1, xmm4
10589 			subps		xmm0, xmm1
10590 			movss		xmm1, [esi+4*ecx+12]
10591 			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
10592 			add			ecx, 4
10593 			cmp			ecx, eax
10594 			mulps		xmm1, xmm5
10595 			subps		xmm0, xmm1
10596 			jl			process4x4
10597 			imul		ecx, edx
10598 			sub			edi, ecx
10599 			neg			eax
10600 		done4x4:		// process left over of the 4 rows
10601 			add			eax, m
10602 			sub			eax, 4
10603 			movaps		xmm1, xmm0
10604 			shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
10605 			movaps		xmm2, xmm0
10606 			shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
10607 			movaps		xmm3, xmm0
10608 			shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
10609 			sub			edi, edx
10610 			movss		[esi-4], xmm3			// xptr[-1] = s3
10611 			movss		xmm4, xmm3
10612 			movss		xmm5, xmm3
10613 			mulss		xmm3, [edi+8]			// lptr[-1*nc+2] * s3
10614 			mulss		xmm4, [edi+4]			// lptr[-1*nc+1] * s3
10615 			mulss		xmm5, [edi+0]			// lptr[-1*nc+0] * s3
10616 			subss		xmm2, xmm3
10617 			movss		[esi-8], xmm2			// xptr[-2] = s2
10618 			movss		xmm6, xmm2
10619 			sub			edi, edx
10620 			subss		xmm0, xmm5
10621 			subss		xmm1, xmm4
10622 			mulss		xmm2, [edi+4]			// lptr[-2*nc+1] * s2
10623 			mulss		xmm6, [edi+0]			// lptr[-2*nc+0] * s2
10624 			subss		xmm1, xmm2
10625 			movss		[esi-12], xmm1			// xptr[-3] = s1
10626 			subss		xmm0, xmm6
10627 			sub			edi, edx
10628 			cmp			eax, 4
10629 			mulss		xmm1, [edi+0]			// lptr[-3*nc+0] * s1
10630 			subss		xmm0, xmm1
10631 			movss		[esi-16], xmm0			// xptr[-4] = s0
10632 			jl			done4rows
10633 			sub			edi, edx
10634 			sub			edi, 16
10635 			sub			esi, 16
10636 			jmp			process4rows
10637 		done4rows:
10638 			pop			ebx
10639 		}
10640 	}
10641 
10642 	// process left over rows
10643 	for ( i = (m&3)-1; i >= 0; i-- ) {
10644 		s0 = b[i];
10645 		lptr = L[0] + i;
10646 		for ( j = i + 1; j < n; j++ ) {
10647 			s0 -= lptr[j*nc] * x[j];
10648 		}
10649 		x[i] = s0;
10650 	}
10651 
10652 #else
10653 
10654 	int i, j, m;
10655 	double s0, s1, s2, s3, t;
10656 	const float *lptr2;
10657 	float *xptr, *xptr2;
10658 
10659 	m = n;
10660 	if ( m & 1 ) {
10661 
10662 		m--;
10663 		x[m] = b[m];
10664 
10665 		lptr = L.ToFloatPtr() + m * nc + m - 4;
10666 		xptr = x + m;
10667 		// process 4 rows at a time
10668 		for ( i = m; i >= 4; i -= 4 ) {
10669 			s0 = b[i-4];
10670 			s1 = b[i-3];
10671 			s2 = b[i-2];
10672 			s3 = b[i-1];
10673 			// process 4x4 blocks
10674 			xptr2 = xptr;	// x + i;
10675 			lptr2 = lptr;	// ptr = L[i] + i - 4;
10676 			for ( j = 0; j < m-i; j += 4 ) {
10677 				t = xptr2[0];
10678 				s0 -= lptr2[0] * t;
10679 				s1 -= lptr2[1] * t;
10680 				s2 -= lptr2[2] * t;
10681 				s3 -= lptr2[3] * t;
10682 				lptr2 += nc;
10683 				xptr2++;
10684 				t = xptr2[0];
10685 				s0 -= lptr2[0] * t;
10686 				s1 -= lptr2[1] * t;
10687 				s2 -= lptr2[2] * t;
10688 				s3 -= lptr2[3] * t;
10689 				lptr2 += nc;
10690 				xptr2++;
10691 				t = xptr2[0];
10692 				s0 -= lptr2[0] * t;
10693 				s1 -= lptr2[1] * t;
10694 				s2 -= lptr2[2] * t;
10695 				s3 -= lptr2[3] * t;
10696 				lptr2 += nc;
10697 				xptr2++;
10698 				t = xptr2[0];
10699 				s0 -= lptr2[0] * t;
10700 				s1 -= lptr2[1] * t;
10701 				s2 -= lptr2[2] * t;
10702 				s3 -= lptr2[3] * t;
10703 				lptr2 += nc;
10704 				xptr2++;
10705 			}
10706 			t = xptr2[0];
10707 			s0 -= lptr2[0] * t;
10708 			s1 -= lptr2[1] * t;
10709 			s2 -= lptr2[2] * t;
10710 			s3 -= lptr2[3] * t;
10711 			// process left over of the 4 rows
10712 			lptr -= nc;
10713 			s0 -= lptr[0] * s3;
10714 			s1 -= lptr[1] * s3;
10715 			s2 -= lptr[2] * s3;
10716 			lptr -= nc;
10717 			s0 -= lptr[0] * s2;
10718 			s1 -= lptr[1] * s2;
10719 			lptr -= nc;
10720 			s0 -= lptr[0] * s1;
10721 			lptr -= nc;
10722 			// store result
10723 			xptr[-4] = s0;
10724 			xptr[-3] = s1;
10725 			xptr[-2] = s2;
10726 			xptr[-1] = s3;
10727 			// update pointers for next four rows
10728 			lptr -= 4;
10729 			xptr -= 4;
10730 		}
10731 
10732 	} else {
10733 
10734 		lptr = L.ToFloatPtr() + m * nc + m - 4;
10735 		xptr = x + m;
10736 		// process 4 rows at a time
10737 		for ( i = m; i >= 4; i -= 4 ) {
10738 			s0 = b[i-4];
10739 			s1 = b[i-3];
10740 			s2 = b[i-2];
10741 			s3 = b[i-1];
10742 			// process 4x4 blocks
10743 			xptr2 = xptr;	// x + i;
10744 			lptr2 = lptr;	// ptr = L[i] + i - 4;
10745 			for ( j = 0; j < m-i; j += 4 ) {
10746 				t = xptr2[0];
10747 				s0 -= lptr2[0] * t;
10748 				s1 -= lptr2[1] * t;
10749 				s2 -= lptr2[2] * t;
10750 				s3 -= lptr2[3] * t;
10751 				lptr2 += nc;
10752 				xptr2++;
10753 				t = xptr2[0];
10754 				s0 -= lptr2[0] * t;
10755 				s1 -= lptr2[1] * t;
10756 				s2 -= lptr2[2] * t;
10757 				s3 -= lptr2[3] * t;
10758 				lptr2 += nc;
10759 				xptr2++;
10760 				t = xptr2[0];
10761 				s0 -= lptr2[0] * t;
10762 				s1 -= lptr2[1] * t;
10763 				s2 -= lptr2[2] * t;
10764 				s3 -= lptr2[3] * t;
10765 				lptr2 += nc;
10766 				xptr2++;
10767 				t = xptr2[0];
10768 				s0 -= lptr2[0] * t;
10769 				s1 -= lptr2[1] * t;
10770 				s2 -= lptr2[2] * t;
10771 				s3 -= lptr2[3] * t;
10772 				lptr2 += nc;
10773 				xptr2++;
10774 			}
10775 			// process left over of the 4 rows
10776 			lptr -= nc;
10777 			s0 -= lptr[0] * s3;
10778 			s1 -= lptr[1] * s3;
10779 			s2 -= lptr[2] * s3;
10780 			lptr -= nc;
10781 			s0 -= lptr[0] * s2;
10782 			s1 -= lptr[1] * s2;
10783 			lptr -= nc;
10784 			s0 -= lptr[0] * s1;
10785 			lptr -= nc;
10786 			// store result
10787 			xptr[-4] = s0;
10788 			xptr[-3] = s1;
10789 			xptr[-2] = s2;
10790 			xptr[-1] = s3;
10791 			// update pointers for next four rows
10792 			lptr -= 4;
10793 			xptr -= 4;
10794 		}
10795 	}
10796 	// process left over rows
10797 	for ( i--; i >= 0; i-- ) {
10798 		s0 = b[i];
10799 		lptr = L[0] + i;
10800 		for ( j = i + 1; j < m; j++ ) {
10801 			s0 -= lptr[j*nc] * x[j];
10802 		}
10803 		x[i] = s0;
10804 	}
10805 
10806 #endif
10807 }
10808 
10809 /*
10810 ============
10811 idSIMD_SSE::MatX_LDLTFactor
10812 
  in-place factorization LDL' of the n * n sub-matrix of mat
  the reciprocals of the diagonal elements are stored in invDiag
  currently assumes the number of columns of mat is a multiple of 4
10816 ============
10817 */
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1

	int j, nc;
	float *v, *diag, *invDiagPtr, *mptr;
	double s0, s1, s2, sum, d;

	// scratch buffers on the stack, 16-byte aligned for the SSE loads/stores below:
	// v[] holds diag[k] * mat[i][k] for the current row, diag[] caches the D values
	v = (float *) _alloca16( n * sizeof( float ) );
	diag = (float *) _alloca16( n * sizeof( float ) );
	invDiagPtr = invDiag.ToFloatPtr();

	nc = mat.GetNumColumns();

	// the aligned movaps accesses in the assembly require 16-byte aligned rows
	assert( ( nc & 3 ) == 0 );

	if ( n <= 0 ) {
		return true;
	}

	// rows 0..3 are factored with unrolled scalar code; their dot products are
	// too short to benefit from SIMD, and they prime v[]/diag[] for the asm loop
	mptr = mat[0];

	sum = mptr[0];

	if ( sum == 0.0f ) {		// singular pivot
		return false;
	}

	diag[0] = sum;
	invDiagPtr[0] = d = 1.0f / sum;

	if ( n <= 1 ) {
		return true;
	}

	// scale column 0 below the diagonal by 1/D0
	mptr = mat[0];
	for ( j = 1; j < n; j++ ) {
		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}

	mptr = mat[1];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[1][1] = sum;
	diag[1] = sum;
	invDiagPtr[1] = d = 1.0f / sum;

	if ( n <= 2 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}

	mptr = mat[2];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[2][2] = sum;
	diag[2] = sum;
	invDiagPtr[2] = d = 1.0f / sum;

	if ( n <= 3 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}

	mptr = mat[3];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[3][3] = sum;
	diag[3] = sum;
	invDiagPtr[3] = d = 1.0f / sum;

	if ( n <= 4 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}

	// rows 4..n-1: SSE loop (32-bit MSVC inline assembly).
	// Register roles per iteration of loopRow:
	//   ebx = row index i, edi = &mat[i][i], esi = &v[i], eax = &invDiagPtr[i],
	//   edx = &diag[i] (later reused as the sub-row counter),
	//   ecx = negated byte offset so the dot product runs [-i*4 .. 0) up to the diagonal
	int ncf = nc * sizeof( float );
	mptr = mat[0];

	__asm {
		xorps		xmm2, xmm2
		xorps		xmm3, xmm3
		xorps		xmm4, xmm4

		push		ebx
		mov			ebx, 4

	loopRow:
			cmp			ebx, n
			jge			done

			mov			ecx, ebx				// esi = i
			shl			ecx, 2					// esi = i * 4
			mov			edx, diag				// edx = diag
			add			edx, ecx				// edx = &diag[i]
			mov			edi, ebx				// edi = i
			imul		edi, ncf				// edi = i * nc * sizeof( float )
			add			edi, mptr				// edi = mat[i]
			add			edi, ecx				// edi = &mat[i][i]
			mov			esi, v					// ecx = v
			add			esi, ecx				// ecx = &v[i]
			mov			eax, invDiagPtr			// eax = invDiagPtr
			add			eax, ecx				// eax = &invDiagPtr[i]
			neg			ecx

			// dot product over the first i columns of row i, computing
			// v[k] = diag[k] * mptr[k] and accumulating v[k] * mptr[k] in xmm0
			movaps		xmm0, [edx+ecx]
			mulps		xmm0, [edi+ecx]
			movaps		[esi+ecx], xmm0
			mulps		xmm0, [edi+ecx]
			add			ecx, 12*4
			jg			doneDot8
		dot8:		// 8 columns per iteration
			movaps		xmm1, [edx+ecx-(8*4)]
			mulps		xmm1, [edi+ecx-(8*4)]
			movaps		[esi+ecx-(8*4)], xmm1
			mulps		xmm1, [edi+ecx-(8*4)]
			addps		xmm0, xmm1
			movaps		xmm2, [edx+ecx-(4*4)]
			mulps		xmm2, [edi+ecx-(4*4)]
			movaps		[esi+ecx-(4*4)], xmm2
			mulps		xmm2, [edi+ecx-(4*4)]
			addps		xmm0, xmm2
			add			ecx, 8*4
			jle			dot8
		doneDot8:	// remainder: up to 4 more columns
			sub			ecx, 4*4
			jg			doneDot4
			movaps		xmm1, [edx+ecx-(4*4)]
			mulps		xmm1, [edi+ecx-(4*4)]
			movaps		[esi+ecx-(4*4)], xmm1
			mulps		xmm1, [edi+ecx-(4*4)]
			addps		xmm0, xmm1
			add			ecx, 4*4
		doneDot4:	// remainder: up to 2 more columns
			sub			ecx, 2*4
			jg			doneDot2
			movlps		xmm3, [edx+ecx-(2*4)]
			movlps		xmm4, [edi+ecx-(2*4)]
			mulps		xmm3, xmm4
			movlps		[esi+ecx-(2*4)], xmm3
			mulps		xmm3, xmm4
			addps		xmm0, xmm3
			add			ecx, 2*4
		doneDot2:	// remainder: possibly 1 last column
			sub			ecx, 1*4
			jg			doneDot1
			movss		xmm3, [edx+ecx-(1*4)]
			movss		xmm4, [edi+ecx-(1*4)]
			mulss		xmm3, xmm4
			movss		[esi+ecx-(1*4)], xmm3
			mulss		xmm3, xmm4
			addss		xmm0, xmm3
		doneDot1:
			// horizontal add of the four partial sums in xmm0
			movhlps		xmm2, xmm0
			addps		xmm0, xmm2
			movaps		xmm2, xmm0
			shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
			addss		xmm0, xmm2
			movss		xmm1, [edi]
			subss		xmm1, xmm0
			movss		[edi], xmm1				// mptr[i] = sum;
			movss		[edx], xmm1				// diag[i] = sum;

			// if ( sum == 0.0f ) return false;
			// NOTE(review): unlike the C paths above/below, this replaces a zero
			// pivot with SIMD_SP_tiny and keeps going instead of returning false
			movaps		xmm2, xmm1
			cmpeqss		xmm2, SIMD_SP_zero
			andps		xmm2, SIMD_SP_tiny
			orps		xmm1, xmm2

			// 1.0f / sum via rcpss refined with one Newton-Raphson iteration
			rcpss		xmm7, xmm1
			mulss		xmm1, xmm7
			mulss		xmm1, xmm7
			addss		xmm7, xmm7
			subss		xmm7, xmm1
			movss		[eax], xmm7				// invDiagPtr[i] = 1.0f / sum;

			mov			edx, n					// edx = n
			sub			edx, ebx				// edx = n - i
			dec			edx						// edx = n - i - 1
			jle			doneSubRow				// if ( i + 1 >= n ) return true;

			mov			eax, ebx				// eax = i
			shl			eax, 2					// eax = i * 4
			neg			eax

		loopSubRow:	// for each row j > i: mat[j][i] = ( mat[j][i] - dot( mat[j], v, i ) ) / diag[i]
				add			edi, ncf
				mov			ecx, eax
				movaps		xmm0, [esi+ecx]
				mulps		xmm0, [edi+ecx]
				add			ecx, 12*4
				jg			doneSubDot8
			subDot8:	// 8 columns per iteration
				movaps		xmm1, [esi+ecx-(8*4)]
				mulps		xmm1, [edi+ecx-(8*4)]
				addps		xmm0, xmm1
				movaps		xmm2, [esi+ecx-(4*4)]
				mulps		xmm2, [edi+ecx-(4*4)]
				addps		xmm0, xmm2
				add			ecx, 8*4
				jle			subDot8
			doneSubDot8:	// remainder handling, mirrors doneDot8..doneDot1 above
				sub			ecx, 4*4
				jg			doneSubDot4
				movaps		xmm1, [esi+ecx-(4*4)]
				mulps		xmm1, [edi+ecx-(4*4)]
				addps		xmm0, xmm1
				add			ecx, 4*4
			doneSubDot4:
				sub			ecx, 2*4
				jg			doneSubDot2
				movlps		xmm3, [esi+ecx-(2*4)]
				movlps		xmm4, [edi+ecx-(2*4)]
				mulps		xmm3, xmm4
				addps		xmm0, xmm3
				add			ecx, 2*4
			doneSubDot2:
				sub			ecx, 1*4
				jg			doneSubDot1
				movss		xmm3, [esi+ecx-(1*4)]
				movss		xmm4, [edi+ecx-(1*4)]
				mulss		xmm3, xmm4
				addss		xmm0, xmm3
			doneSubDot1:
				// horizontal add, then scale by the reciprocal pivot kept in xmm7
				movhlps		xmm2, xmm0
				addps		xmm0, xmm2
				movaps		xmm2, xmm0
				shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
				addss		xmm0, xmm2
				movss		xmm1, [edi]
				subss		xmm1, xmm0
				mulss		xmm1, xmm7
				movss		[edi], xmm1
				dec			edx
				jg			loopSubRow
		doneSubRow:
			inc		ebx
			jmp		loopRow
	done:
		pop		ebx
	}

	return true;

#else

	// portable scalar reference implementation (same algorithm, no SSE)
	int i, j, k, nc;
	float *v, *diag, *mptr;
	double s0, s1, s2, s3, sum, d;

	v = (float *) _alloca16( n * sizeof( float ) );
	diag = (float *) _alloca16( n * sizeof( float ) );

	nc = mat.GetNumColumns();

	if ( n <= 0 ) {
		return true;
	}

	mptr = mat[0];

	sum = mptr[0];

	if ( sum == 0.0f ) {
		return false;
	}

	diag[0] = sum;
	invDiag[0] = d = 1.0f / sum;

	if ( n <= 1 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 1; j < n; j++ ) {
		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}

	mptr = mat[1];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[1][1] = sum;
	diag[1] = sum;
	invDiag[1] = d = 1.0f / sum;

	if ( n <= 2 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}

	mptr = mat[2];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[2][2] = sum;
	diag[2] = sum;
	invDiag[2] = d = 1.0f / sum;

	if ( n <= 3 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}

	mptr = mat[3];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[3][3] = sum;
	diag[3] = sum;
	invDiag[3] = d = 1.0f / sum;

	if ( n <= 4 ) {
		return true;
	}

	mptr = mat[0];
	for ( j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}

	for ( i = 4; i < n; i++ ) {

		mptr = mat[i];

		// dot product over the first i columns of row i, 4-way unrolled
		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
		for ( k = 4; k < i-3; k += 4 ) {
			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
		}
		// remainder columns; which accumulator each case feeds is arbitrary
		// since all four partial sums are added together below
		switch( i - k ) {
			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
		}
		sum = s3;
		sum += s2;
		sum += s1;
		sum += s0;
		sum = mptr[i] - sum;

		if ( sum == 0.0f ) {
			return false;
		}

		mat[i][i] = sum;
		diag[i] = sum;
		invDiag[i] = d = 1.0f / sum;

		if ( i + 1 >= n ) {
			return true;
		}

		// update column i of all rows below: mat[j][i] = ( mat[j][i] - dot ) / diag[i]
		mptr = mat[i+1];
		for ( j = i+1; j < n; j++ ) {
			s0 = mptr[0] * v[0];
			s1 = mptr[1] * v[1];
			s2 = mptr[2] * v[2];
			s3 = mptr[3] * v[3];
			for ( k = 4; k < i-7; k += 8 ) {
				s0 += mptr[k+0] * v[k+0];
				s1 += mptr[k+1] * v[k+1];
				s2 += mptr[k+2] * v[k+2];
				s3 += mptr[k+3] * v[k+3];
				s0 += mptr[k+4] * v[k+4];
				s1 += mptr[k+5] * v[k+5];
				s2 += mptr[k+6] * v[k+6];
				s3 += mptr[k+7] * v[k+7];
			}
			// remainder columns (accumulator choice again arbitrary)
			switch( i - k ) {
				case 7: s0 += mptr[k+6] * v[k+6];
				case 6: s1 += mptr[k+5] * v[k+5];
				case 5: s2 += mptr[k+4] * v[k+4];
				case 4: s3 += mptr[k+3] * v[k+3];
				case 3: s0 += mptr[k+2] * v[k+2];
				case 2: s1 += mptr[k+1] * v[k+1];
				case 1: s2 += mptr[k+0] * v[k+0];
			}
			sum = s3;
			sum += s2;
			sum += s1;
			sum += s0;
			mptr[i] = ( mptr[i] - sum ) * d;
			mptr += nc;
		}
	}

	return true;

#endif
}
11273 
11274 /*
11275 ============
11276 idSIMD_SSE::BlendJoints
11277 ============
11278 */
11279 #define REFINE_BLENDJOINTS_RECIPROCAL
11280 
BlendJoints(idJointQuat * joints,const idJointQuat * blendJoints,const float lerp,const int * index,const int numJoints)11281 void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
11282 	int i;
11283 
11284 	if ( lerp <= 0.0f ) {
11285 		return;
11286 	} else if ( lerp >= 1.0f ) {
11287 		for ( i = 0; i < numJoints; i++ ) {
11288 			int j = index[i];
11289 			joints[j] = blendJoints[j];
11290 		}
11291 		return;
11292 	}
11293 
11294 	for ( i = 0; i <= numJoints - 4; i += 4 ) {
11295 		ALIGN16( float jointVert0[4] );
11296 		ALIGN16( float jointVert1[4] );
11297 		ALIGN16( float jointVert2[4] );
11298 		ALIGN16( float blendVert0[4] );
11299 		ALIGN16( float blendVert1[4] );
11300 		ALIGN16( float blendVert2[4] );
11301 		ALIGN16( float jointQuat0[4] );
11302 		ALIGN16( float jointQuat1[4] );
11303 		ALIGN16( float jointQuat2[4] );
11304 		ALIGN16( float jointQuat3[4] );
11305 		ALIGN16( float blendQuat0[4] );
11306 		ALIGN16( float blendQuat1[4] );
11307 		ALIGN16( float blendQuat2[4] );
11308 		ALIGN16( float blendQuat3[4] );
11309 
11310 		for ( int j = 0; j < 4; j++ ) {
11311 			int n = index[i+j];
11312 
11313 			jointVert0[j] = joints[n].t[0];
11314 			jointVert1[j] = joints[n].t[1];
11315 			jointVert2[j] = joints[n].t[2];
11316 
11317 			blendVert0[j] = blendJoints[n].t[0];
11318 			blendVert1[j] = blendJoints[n].t[1];
11319 			blendVert2[j] = blendJoints[n].t[2];
11320 
11321 			jointQuat0[j] = joints[n].q[0];
11322 			jointQuat1[j] = joints[n].q[1];
11323 			jointQuat2[j] = joints[n].q[2];
11324 			jointQuat3[j] = joints[n].q[3];
11325 
11326 			blendQuat0[j] = blendJoints[n].q[0];
11327 			blendQuat1[j] = blendJoints[n].q[1];
11328 			blendQuat2[j] = blendJoints[n].q[2];
11329 			blendQuat3[j] = blendJoints[n].q[3];
11330 		}
11331 
11332 #if 1
11333 		__asm {
11334 			// lerp translation
11335 			movss		xmm7, lerp
11336 			shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
11337 			movaps		xmm0, blendVert0
11338 			subps		xmm0, jointVert0
11339 			mulps		xmm0, xmm7
11340 			addps		xmm0, jointVert0
11341 			movaps		jointVert0, xmm0
11342 			movaps		xmm1, blendVert1
11343 			subps		xmm1, jointVert1
11344 			mulps		xmm1, xmm7
11345 			addps		xmm1, jointVert1
11346 			movaps		jointVert1, xmm1
11347 			movaps		xmm2, blendVert2
11348 			subps		xmm2, jointVert2
11349 			mulps		xmm2, xmm7
11350 			addps		xmm2, jointVert2
11351 			movaps		jointVert2, xmm2
11352 
11353 			// lerp quaternions
11354 			movaps		xmm0, jointQuat0
11355 			mulps		xmm0, blendQuat0
11356 			movaps		xmm1, jointQuat1
11357 			mulps		xmm1, blendQuat1
11358 			addps		xmm0, xmm1
11359 			movaps		xmm2, jointQuat2
11360 			mulps		xmm2, blendQuat2
11361 			addps		xmm0, xmm2
11362 			movaps		xmm3, jointQuat3
11363 			mulps		xmm3, blendQuat3
11364 			addps		xmm0, xmm3					// xmm0 = cosom
11365 
11366 			movaps		xmm1, xmm0
11367 			movaps		xmm2, xmm0
11368 			andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signBit
11369 			xorps		xmm0, xmm1
11370 			mulps		xmm2, xmm2
11371 
11372 			xorps		xmm4, xmm4
11373 			movaps		xmm3, SIMD_SP_one
11374 			subps		xmm3, xmm2					// xmm3 = scale0
11375 			cmpeqps		xmm4, xmm3
11376 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
11377 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
11378 			orps		xmm3, xmm4
11379 
11380 #ifdef REFINE_BLENDJOINTS_RECIPROCAL
11381 			movaps		xmm2, xmm3
11382 			rsqrtps		xmm4, xmm2
11383 			mulps		xmm2, xmm4
11384 			mulps		xmm2, xmm4
11385 			subps		xmm2, SIMD_SP_rsqrt_c0
11386 			mulps		xmm4, SIMD_SP_rsqrt_c1
11387 			mulps		xmm2, xmm4
11388 #else
11389 			rsqrtps		xmm2, xmm3					// xmm2 = sinom
11390 #endif
11391 			mulps		xmm3, xmm2					// xmm3 = sqrt( scale0 )
11392 
11393 			// omega0 = atan2( xmm3, xmm0 )
11394 			movaps		xmm4, xmm0
11395 			minps		xmm0, xmm3
11396 			maxps		xmm3, xmm4
11397 			cmpeqps		xmm4, xmm0
11398 
11399 #ifdef REFINE_BLENDJOINTS_RECIPROCAL
11400 			rcpps		xmm5, xmm3
11401 			mulps		xmm3, xmm5
11402 			mulps		xmm3, xmm5
11403 			addps		xmm5, xmm5
11404 			subps		xmm5, xmm3					// xmm5 = 1 / y or 1 / x
11405 			mulps		xmm0, xmm5					// xmm0 = x / y or y / x
11406 #else
11407 			rcpps		xmm3, xmm3					// xmm3 = 1 / y or 1 / x
11408 			mulps		xmm0, xmm3					// xmm0 = x / y or y / x
11409 #endif
11410 			movaps		xmm3, xmm4
11411 			andps		xmm3, SIMD_SP_signBitMask
11412 			xorps		xmm0, xmm3					// xmm0 = -x / y or y / x
11413 			andps		xmm4, SIMD_SP_halfPI		// xmm4 = HALF_PI or 0.0f
11414 			movaps		xmm3, xmm0
11415 			mulps		xmm3, xmm3					// xmm3 = s
11416 			movaps		xmm5, SIMD_SP_atan_c0
11417 			mulps		xmm5, xmm3
11418 			addps		xmm5, SIMD_SP_atan_c1
11419 			mulps		xmm5, xmm3
11420 			addps		xmm5, SIMD_SP_atan_c2
11421 			mulps		xmm5, xmm3
11422 			addps		xmm5, SIMD_SP_atan_c3
11423 			mulps		xmm5, xmm3
11424 			addps		xmm5, SIMD_SP_atan_c4
11425 			mulps		xmm5, xmm3
11426 			addps		xmm5, SIMD_SP_atan_c5
11427 			mulps		xmm5, xmm3
11428 			addps		xmm5, SIMD_SP_atan_c6
11429 			mulps		xmm5, xmm3
11430 			addps		xmm5, SIMD_SP_atan_c7
11431 			mulps		xmm5, xmm3
11432 			addps		xmm5, SIMD_SP_one
11433 			mulps		xmm5, xmm0
11434 			addps		xmm5, xmm4					// xmm5 = omega0
11435 
11436 			movaps		xmm6, xmm7					// xmm6 = lerp
11437 			mulps		xmm6, xmm5					// xmm6 = omega1
11438 			subps		xmm5, xmm6					// xmm5 = omega0
11439 
11440 			// scale0 = sin( xmm5 ) * xmm2
11441 			// scale1 = sin( xmm6 ) * xmm2
11442 			movaps		xmm3, xmm5
11443 			movaps		xmm7, xmm6
11444 			mulps		xmm3, xmm3
11445 			mulps		xmm7, xmm7
11446 			movaps		xmm4, SIMD_SP_sin_c0
11447 			movaps		xmm0, SIMD_SP_sin_c0
11448 			mulps		xmm4, xmm3
11449 			mulps		xmm0, xmm7
11450 			addps		xmm4, SIMD_SP_sin_c1
11451 			addps		xmm0, SIMD_SP_sin_c1
11452 			mulps		xmm4, xmm3
11453 			mulps		xmm0, xmm7
11454 			addps		xmm4, SIMD_SP_sin_c2
11455 			addps		xmm0, SIMD_SP_sin_c2
11456 			mulps		xmm4, xmm3
11457 			mulps		xmm0, xmm7
11458 			addps		xmm4, SIMD_SP_sin_c3
11459 			addps		xmm0, SIMD_SP_sin_c3
11460 			mulps		xmm4, xmm3
11461 			mulps		xmm0, xmm7
11462 			addps		xmm4, SIMD_SP_sin_c4
11463 			addps		xmm0, SIMD_SP_sin_c4
11464 			mulps		xmm4, xmm3
11465 			mulps		xmm0, xmm7
11466 			addps		xmm4, SIMD_SP_one
11467 			addps		xmm0, SIMD_SP_one
11468 			mulps		xmm5, xmm4
11469 			mulps		xmm6, xmm0
11470 			mulps		xmm5, xmm2					// xmm5 = scale0
11471 			mulps		xmm6, xmm2					// xmm6 = scale1
11472 
11473 			xorps		xmm6, xmm1
11474 
11475 			movaps		xmm0, jointQuat0
11476 			mulps		xmm0, xmm5
11477 			movaps		xmm1, blendQuat0
11478 			mulps		xmm1, xmm6
11479 			addps		xmm0, xmm1
11480 			movaps		jointQuat0, xmm0
11481 
11482 			movaps		xmm1, jointQuat1
11483 			mulps		xmm1, xmm5
11484 			movaps		xmm2, blendQuat1
11485 			mulps		xmm2, xmm6
11486 			addps		xmm1, xmm2
11487 			movaps		jointQuat1, xmm1
11488 
11489 			movaps		xmm2, jointQuat2
11490 			mulps		xmm2, xmm5
11491 			movaps		xmm3, blendQuat2
11492 			mulps		xmm3, xmm6
11493 			addps		xmm2, xmm3
11494 			movaps		jointQuat2, xmm2
11495 
11496 			movaps		xmm3, jointQuat3
11497 			mulps		xmm3, xmm5
11498 			movaps		xmm4, blendQuat3
11499 			mulps		xmm4, xmm6
11500 			addps		xmm3, xmm4
11501 			movaps		jointQuat3, xmm3
11502 		}
11503 
11504 #else
11505 
11506 		jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
11507 		jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
11508 		jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
11509 		jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
11510 
11511 		jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
11512 		jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
11513 		jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
11514 		jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
11515 
11516 		jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
11517 		jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
11518 		jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
11519 		jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
11520 
11521 		ALIGN16( float cosom[4] );
11522 		ALIGN16( float sinom[4] );
11523 		ALIGN16( float omega0[4] );
11524 		ALIGN16( float omega1[4] );
11525 		ALIGN16( float scale0[4] );
11526 		ALIGN16( float scale1[4] );
11527 		ALIGN16( unsigned int signBit[4] );
11528 
11529 		cosom[0] = jointQuat0[0] * blendQuat0[0];
11530 		cosom[1] = jointQuat0[1] * blendQuat0[1];
11531 		cosom[2] = jointQuat0[2] * blendQuat0[2];
11532 		cosom[3] = jointQuat0[3] * blendQuat0[3];
11533 
11534 		cosom[0] += jointQuat1[0] * blendQuat1[0];
11535 		cosom[1] += jointQuat1[1] * blendQuat1[1];
11536 		cosom[2] += jointQuat1[2] * blendQuat1[2];
11537 		cosom[3] += jointQuat1[3] * blendQuat1[3];
11538 
11539 		cosom[0] += jointQuat2[0] * blendQuat2[0];
11540 		cosom[1] += jointQuat2[1] * blendQuat2[1];
11541 		cosom[2] += jointQuat2[2] * blendQuat2[2];
11542 		cosom[3] += jointQuat2[3] * blendQuat2[3];
11543 
11544 		cosom[0] += jointQuat3[0] * blendQuat3[0];
11545 		cosom[1] += jointQuat3[1] * blendQuat3[1];
11546 		cosom[2] += jointQuat3[2] * blendQuat3[2];
11547 		cosom[3] += jointQuat3[3] * blendQuat3[3];
11548 
11549 		signBit[0] = (*(unsigned int *)&cosom[0]) & ( 1 << 31 );
11550 		signBit[1] = (*(unsigned int *)&cosom[1]) & ( 1 << 31 );
11551 		signBit[2] = (*(unsigned int *)&cosom[2]) & ( 1 << 31 );
11552 		signBit[3] = (*(unsigned int *)&cosom[3]) & ( 1 << 31 );
11553 
11554 		(*(unsigned int *)&cosom[0]) ^= signBit[0];
11555 		(*(unsigned int *)&cosom[1]) ^= signBit[1];
11556 		(*(unsigned int *)&cosom[2]) ^= signBit[2];
11557 		(*(unsigned int *)&cosom[3]) ^= signBit[3];
11558 
11559 		scale0[0] = 1.0f - cosom[0] * cosom[0];
11560 		scale0[1] = 1.0f - cosom[1] * cosom[1];
11561 		scale0[2] = 1.0f - cosom[2] * cosom[2];
11562 		scale0[3] = 1.0f - cosom[3] * cosom[3];
11563 
11564 		scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
11565 		scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
11566 		scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
11567 		scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
11568 
11569 		sinom[0] = idMath::RSqrt( scale0[0] );
11570 		sinom[1] = idMath::RSqrt( scale0[1] );
11571 		sinom[2] = idMath::RSqrt( scale0[2] );
11572 		sinom[3] = idMath::RSqrt( scale0[3] );
11573 
11574 		scale0[0] *= sinom[0];
11575 		scale0[1] *= sinom[1];
11576 		scale0[2] *= sinom[2];
11577 		scale0[3] *= sinom[3];
11578 
11579 		omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
11580 		omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
11581 		omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
11582 		omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
11583 
11584 		omega1[0] = lerp * omega0[0];
11585 		omega1[1] = lerp * omega0[1];
11586 		omega1[2] = lerp * omega0[2];
11587 		omega1[3] = lerp * omega0[3];
11588 
11589 		omega0[0] -= omega1[0];
11590 		omega0[1] -= omega1[1];
11591 		omega0[2] -= omega1[2];
11592 		omega0[3] -= omega1[3];
11593 
11594 		scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
11595 		scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
11596 		scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
11597 		scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
11598 
11599 		scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
11600 		scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
11601 		scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
11602 		scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
11603 
11604 		(*(unsigned int *)&scale1[0]) ^= signBit[0];
11605 		(*(unsigned int *)&scale1[1]) ^= signBit[1];
11606 		(*(unsigned int *)&scale1[2]) ^= signBit[2];
11607 		(*(unsigned int *)&scale1[3]) ^= signBit[3];
11608 
11609 		jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
11610 		jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
11611 		jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
11612 		jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
11613 
11614 		jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
11615 		jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
11616 		jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
11617 		jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
11618 
11619 		jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
11620 		jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
11621 		jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
11622 		jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
11623 
11624 		jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
11625 		jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
11626 		jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
11627 		jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
11628 
11629 #endif
11630 
11631 		for ( int j = 0; j < 4; j++ ) {
11632 			int n = index[i+j];
11633 
11634 			joints[n].t[0] = jointVert0[j];
11635 			joints[n].t[1] = jointVert1[j];
11636 			joints[n].t[2] = jointVert2[j];
11637 
11638 			joints[n].q[0] = jointQuat0[j];
11639 			joints[n].q[1] = jointQuat1[j];
11640 			joints[n].q[2] = jointQuat2[j];
11641 			joints[n].q[3] = jointQuat3[j];
11642 		}
11643 	}
11644 
11645 	for ( ; i < numJoints; i++ ) {
11646 		int n = index[i];
11647 
11648 		idVec3 &jointVert = joints[n].t;
11649 		const idVec3 &blendVert = blendJoints[n].t;
11650 
11651 		jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
11652 		jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
11653 		jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
11654 
11655 		idQuat &jointQuat = joints[n].q;
11656 		const idQuat &blendQuat = blendJoints[n].q;
11657 
11658 		float cosom;
11659 		float sinom;
11660 		float omega;
11661 		float scale0;
11662 		float scale1;
11663 		unsigned int signBit;
11664 
11665 		cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
11666 
11667 		signBit = (*(unsigned int *)&cosom) & ( 1 << 31 );
11668 
11669 		(*(unsigned int *)&cosom) ^= signBit;
11670 
11671 		scale0 = 1.0f - cosom * cosom;
11672 		scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
11673 		sinom = idMath::InvSqrt( scale0 );
11674 		omega = idMath::ATan16( scale0 * sinom, cosom );
11675 		scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
11676 		scale1 = idMath::Sin16( lerp * omega ) * sinom;
11677 
11678 		(*(unsigned int *)&scale1) ^= signBit;
11679 
11680 		jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
11681 		jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
11682 		jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
11683 		jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
11684 	}
11685 }
11686 
11687 /*
11688 ============
11689 idSIMD_SSE::ConvertJointQuatsToJointMats
11690 ============
11691 */
ConvertJointQuatsToJointMats(idJointMat * jointMats,const idJointQuat * jointQuats,const int numJoints)11692 void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
11693 
11694 	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
11695 	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
11696 	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
11697 
11698 	for ( int i = 0; i < numJoints; i++ ) {
11699 
11700 		const float *q = jointQuats[i].q.ToFloatPtr();
11701 		float *m = jointMats[i].ToFloatPtr();
11702 
11703 		m[0*4+3] = q[4];
11704 		m[1*4+3] = q[5];
11705 		m[2*4+3] = q[6];
11706 
11707 		float x2 = q[0] + q[0];
11708 		float y2 = q[1] + q[1];
11709 		float z2 = q[2] + q[2];
11710 
11711 		{
11712 			float xx = q[0] * x2;
11713 			float yy = q[1] * y2;
11714 			float zz = q[2] * z2;
11715 
11716 			m[0*4+0] = 1.0f - yy - zz;
11717 			m[1*4+1] = 1.0f - xx - zz;
11718 			m[2*4+2] = 1.0f - xx - yy;
11719 		}
11720 
11721 		{
11722 			float yz = q[1] * z2;
11723 			float wx = q[3] * x2;
11724 
11725 			m[2*4+1] = yz - wx;
11726 			m[1*4+2] = yz + wx;
11727 		}
11728 
11729 		{
11730 			float xy = q[0] * y2;
11731 			float wz = q[3] * z2;
11732 
11733 			m[1*4+0] = xy - wz;
11734 			m[0*4+1] = xy + wz;
11735 		}
11736 
11737 		{
11738 			float xz = q[0] * z2;
11739 			float wy = q[3] * y2;
11740 
11741 			m[0*4+2] = xz - wy;
11742 			m[2*4+0] = xz + wy;
11743 		}
11744 	}
11745 }
11746 
11747 /*
11748 ============
11749 idSIMD_SSE::ConvertJointMatsToJointQuats
11750 ============
11751 */
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
	// Converts an array of 3x4 joint matrices into joint quaternions
	// (rotation quaternion + translation).  The active path (#if 1) is MSVC
	// inline-assembly SSE that handles four matrices per iteration of the
	// unrolled loop plus a one-at-a-time cleanup loop; the scalar reference
	// implementations are kept below under the disabled #elif branches.

	assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
	// the translation must immediately follow the quaternion in idJointQuat
	assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

#if 1

	// per-joint byte indices (k0..k3 per lane) selecting which quaternion
	// component each computed value is scattered into
	ALIGN16( byte shuffle[16] );

	__asm {
		mov			eax, numJoints
		mov			esi, jointMats
		mov			edi, jointQuats
		and			eax, ~3							// multiple of 4 joints for the unrolled loop
		jz			done4
		imul		eax, JOINTMAT_SIZE
		add			esi, eax
		neg			eax

	loopMat4:
		// gather the diagonals of four consecutive matrices, one lane per
		// joint: xmm5 = m[0*4+0], xmm6 = m[1*4+1], xmm7 = m[2*4+2]
		movss		xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0								// merge into lane 0, then rotate
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

		movss		xmm5, xmm0
		movss		xmm6, xmm1
		movss		xmm7, xmm2

		// -------------------
		// branchless selection of the four cases of the matrix-to-quaternion
		// conversion (see the scalar #elif variants below for the plain form)

		movaps		xmm0, xmm5
		addps		xmm0, xmm6
		addps		xmm0, xmm7
		cmpnltps	xmm0, SIMD_SP_zero						// xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

		movaps		xmm1, xmm5
		movaps		xmm2, xmm5
		cmpnltps	xmm1, xmm6
		cmpnltps	xmm2, xmm7
		andps		xmm2, xmm1								// xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

		movaps		xmm4, xmm6
		cmpnltps	xmm4, xmm7								// xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

		// turn the three comparison masks into four mutually exclusive case masks
		movaps		xmm1, xmm0
		andnps		xmm1, xmm2
		orps		xmm2, xmm0
		movaps		xmm3, xmm2
		andnps		xmm2, xmm4
		orps		xmm3, xmm2
		xorps		xmm3, SIMD_SP_not

		// select the per-case component permutation (k0..k3) for each joint
		andps		xmm0, SIMD_DW_mat2quatShuffle0
		movaps		xmm4, xmm1
		andps		xmm4, SIMD_DW_mat2quatShuffle1
		orps		xmm0, xmm4
		movaps		xmm4, xmm2
		andps		xmm4, SIMD_DW_mat2quatShuffle2
		orps		xmm0, xmm4
		movaps		xmm4, xmm3
		andps		xmm4, SIMD_DW_mat2quatShuffle3
		orps		xmm4, xmm0

		movaps		shuffle, xmm4

		// build the per-case sign masks s0, s1, s2
		movaps		xmm0, xmm2
		orps		xmm0, xmm3								// xmm0 = xmm2 | xmm3	= s0
		orps		xmm2, xmm1								// xmm2 = xmm1 | xmm2	= s2
		orps		xmm1, xmm3								// xmm1 = xmm1 | xmm3	= s1

		andps		xmm0, SIMD_SP_signBitMask
		andps		xmm1, SIMD_SP_signBitMask
		andps		xmm2, SIMD_SP_signBitMask

		// t = s0 * m00 + s1 * m11 + s2 * m22 + 1.0f (sign applied via xor)
		xorps		xmm5, xmm0
		xorps		xmm6, xmm1
		xorps		xmm7, xmm2
		addps		xmm5, xmm6
		addps		xmm7, SIMD_SP_one
		addps		xmm5, xmm7								// xmm5 = t

		// s = InvSqrt( t ) * 0.5f via rsqrtps + one Newton-Raphson step
		movaps		xmm7, xmm5								// xmm7 = t
		rsqrtps		xmm6, xmm5
		mulps		xmm5, xmm6
		mulps		xmm5, xmm6
		subps		xmm5, SIMD_SP_rsqrt_c0
		mulps		xmm6, SIMD_SP_mat2quat_rsqrt_c1
		mulps		xmm6, xmm5								// xmm5 = s

		mulps		xmm7, xmm6								// xmm7 = s * t
		xorps		xmm6, SIMD_SP_signBitMask				// xmm6 = -s

		// -------------------
		// scatter lane 0 of each result into joint 0's quaternion using the
		// precomputed shuffle indices, then rotate lanes and repeat for
		// joints 1..3

		add			edi, 4*JOINTQUAT_SIZE

		movzx		ecx, byte ptr shuffle[0*4+0]			// ecx = k0
		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[0*4+1]			// edx = k1
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[0*4+2]			// ecx = k2
		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-4*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[0*4+3]			// edx = k3
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-4*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-4*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];

		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[1*4+0]			// ecx = k0
		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[1*4+1]			// edx = k1
		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[1*4+2]			// ecx = k2
		movss		xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-3*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[1*4+3]			// edx = k3
		movss		xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-3*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-3*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];

		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[2*4+0]			// ecx = k0
		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[2*4+1]			// edx = k1
		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[2*4+2]			// ecx = k2
		movss		xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-2*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[2*4+3]			// edx = k3
		movss		xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-2*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-2*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];

		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movzx		ecx, byte ptr shuffle[3*4+0]			// ecx = k0
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[3*4+1]			// edx = k1
		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[3*4+2]			// ecx = k2
		movss		xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[3*4+3]			// edx = k3
		movss		xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];

		add			eax, 4*JOINTMAT_SIZE
		jl			loopMat4

	done4:
		// cleanup loop: convert the remaining 0-3 joints one at a time using
		// the scalar (ss) forms of the same branchless algorithm
		mov			eax, numJoints
		and			eax, 3
		jz			done1
		imul		eax, JOINTMAT_SIZE
		add			esi, eax
		neg			eax

	loopMat1:
		movss		xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
		movss		xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
		movss		xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

		// -------------------

		movaps		xmm0, xmm5
		addss		xmm0, xmm6
		addss		xmm0, xmm7
		cmpnltss	xmm0, SIMD_SP_zero						// xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f

		movaps		xmm1, xmm5
		movaps		xmm2, xmm5
		cmpnltss	xmm1, xmm6
		cmpnltss	xmm2, xmm7
		andps		xmm2, xmm1								// xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]

		movaps		xmm4, xmm6
		cmpnltss	xmm4, xmm7								// xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]

		movaps		xmm1, xmm0
		andnps		xmm1, xmm2
		orps		xmm2, xmm0
		movaps		xmm3, xmm2
		andnps		xmm2, xmm4
		orps		xmm3, xmm2
		xorps		xmm3, SIMD_SP_not

		andps		xmm0, SIMD_DW_mat2quatShuffle0
		movaps		xmm4, xmm1
		andps		xmm4, SIMD_DW_mat2quatShuffle1
		orps		xmm0, xmm4
		movaps		xmm4, xmm2
		andps		xmm4, SIMD_DW_mat2quatShuffle2
		orps		xmm0, xmm4
		movaps		xmm4, xmm3
		andps		xmm4, SIMD_DW_mat2quatShuffle3
		orps		xmm4, xmm0

		movss		shuffle, xmm4

		movaps		xmm0, xmm2
		orps		xmm0, xmm3								// xmm0 = xmm2 | xmm3	= s0
		orps		xmm2, xmm1								// xmm2 = xmm1 | xmm2	= s2
		orps		xmm1, xmm3								// xmm1 = xmm1 | xmm3	= s1

		andps		xmm0, SIMD_SP_signBitMask
		andps		xmm1, SIMD_SP_signBitMask
		andps		xmm2, SIMD_SP_signBitMask

		xorps		xmm5, xmm0
		xorps		xmm6, xmm1
		xorps		xmm7, xmm2
		addss		xmm5, xmm6
		addss		xmm7, SIMD_SP_one
		addss		xmm5, xmm7								// xmm5 = t

		// s = InvSqrt( t ) * 0.5f via rsqrtss + one Newton-Raphson step
		movss		xmm7, xmm5								// xmm7 = t
		rsqrtss		xmm6, xmm5
		mulss		xmm5, xmm6
		mulss		xmm5, xmm6
		subss		xmm5, SIMD_SP_rsqrt_c0
		mulss		xmm6, SIMD_SP_mat2quat_rsqrt_c1
		mulss		xmm6, xmm5								// xmm5 = s

		mulss		xmm7, xmm6								// xmm7 = s * t
		xorps		xmm6, SIMD_SP_signBitMask				// xmm6 = -s

		// -------------------

		movzx		ecx, byte ptr shuffle[0]				// ecx = k0
		add			edi, JOINTQUAT_SIZE
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm7		// q[k0] = s * t;

		movzx		edx, byte ptr shuffle[1]				// edx = k1
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
		xorps		xmm4, xmm2
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

		movzx		ecx, byte ptr shuffle[2]				// ecx = k2
		movss		xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
		xorps		xmm3, xmm1
		subss		xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
		mulss		xmm3, xmm6
		movss		[edi+ecx*4-1*JOINTQUAT_SIZE], xmm3		// q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

		movzx		edx, byte ptr shuffle[3]				// edx = k3
		movss		xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
		xorps		xmm4, xmm0
		subss		xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
		mulss		xmm4, xmm6
		movss		[edi+edx*4-1*JOINTQUAT_SIZE], xmm4		// q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+16], ecx			// q[4] = m[0 * 4 + 3];
		mov			edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+20], edx			// q[5] = m[1 * 4 + 3];
		mov			ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
		mov			[edi-1*JOINTQUAT_SIZE+24], ecx			// q[6] = m[2 * 4 + 3];

		add			eax, JOINTMAT_SIZE
		jl			loopMat1

	done1:
	}

#elif 0

	// scalar reference implementation: table-driven form where the case only
	// selects the output permutation k0..k3 and signs s0..s2
	for ( int i = 0; i < numJoints; i++ ) {
		float s0, s1, s2;
		int k0, k1, k2, k3;

		float *q = jointQuats[i].q.ToFloatPtr();
		const float *m = jointMats[i].ToFloatPtr();

		if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

			k0 = 3;
			k1 = 2;
			k2 = 1;
			k3 = 0;
			s0 = 1.0f;
			s1 = 1.0f;
			s2 = 1.0f;

		} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

			k0 = 0;
			k1 = 1;
			k2 = 2;
			k3 = 3;
			s0 = 1.0f;
			s1 = -1.0f;
			s2 = -1.0f;

		} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

			k0 = 1;
			k1 = 0;
			k2 = 3;
			k3 = 2;
			s0 = -1.0f;
			s1 = 1.0f;
			s2 = -1.0f;

		} else {

			k0 = 2;
			k1 = 3;
			k2 = 0;
			k3 = 1;
			s0 = -1.0f;
			s1 = -1.0f;
			s2 = 1.0f;

		}

		float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
		float s = idMath::InvSqrt( t ) * 0.5f;

		q[k0] = s * t;
		q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
		q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
		q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
	}

#elif 1

	// scalar reference implementation: classic four-branch matrix-to-quaternion
	// conversion picking the largest diagonal term for numerical stability
	for ( int i = 0; i < numJoints; i++ ) {

		float *q = jointQuats[i].q.ToFloatPtr();
		const float *m = jointMats[i].ToFloatPtr();

		if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

			float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[3] = s * t;
			q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
			q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

		} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

			float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[0] = s * t;
			q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
			q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
			q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

		} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

			float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[1] = s * t;
			q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
			q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

		} else {

			float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
			float s = idMath::InvSqrt( t ) * 0.5f;

			q[2] = s * t;
			q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
			q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
			q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

		}

		q[4] = m[0 * 4 + 3];
		q[5] = m[1 * 4 + 3];
		q[6] = m[2 * 4 + 3];
	}

#endif
}
12263 
12264 /*
12265 ============
12266 idSIMD_SSE::TransformJoints
12267 ============
12268 */
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// Transforms joints [firstJoint, lastJoint] from parent-relative to
	// object space: jointMats[i] *= jointMats[parents[i]] (see the #else
	// reference path).  Parents must already be transformed, i.e.
	// parents[i] < i.  The active path is MSVC inline-assembly SSE that
	// multiplies one 3x4 matrix per loop iteration.
#if 1

	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm {

		// ecx = byte offset of the first joint matrix, edi/eax iterate the
		// parents array (eax counts up from negative to zero)
		mov			ecx, firstJoint
		mov			eax, lastJoint
		sub			eax, ecx
		jl			done
		imul		ecx, 4
		mov			edi, parents
		add			edi, ecx
		imul		ecx, 12
		mov			esi, jointMats
		imul		eax, 4
		add			edi, eax
		neg			eax

	loopJoint:

		// load the three rows of the child matrix
		movaps		xmm0, [esi+ecx+ 0]						// xmm0 = m0, m1, m2, t0
		mov			edx, [edi+eax]
		movaps		xmm1, [esi+ecx+16]						// xmm1 = m3, m4, m5, t1
		imul		edx, JOINTMAT_SIZE
		movaps		xmm2, [esi+ecx+32]						// xmm2 = m6, m7, m8, t2

		// row 0 of the parent times the child matrix
		movss		xmm4, [esi+edx+ 0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm4, xmm0

		movss		xmm5, [esi+edx+ 4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm5, xmm1
		addps		xmm4, xmm5
		movss		xmm6, [esi+edx+ 8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm2
		addps		xmm4, xmm6

		movss		xmm5, [esi+edx+16]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm5, xmm0

		// add the parent translation into the last lane only
		movss		xmm7, [esi+edx+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		addps		xmm4, xmm7

		movaps		[esi+ecx+ 0], xmm4

		// row 1 of the parent times the child matrix
		movss		xmm6, [esi+edx+20]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm1
		addps		xmm5, xmm6
		movss		xmm7, [esi+edx+24]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm7, xmm2
		addps		xmm5, xmm7

		movss		xmm6, [esi+edx+32]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm0

		movss		xmm3, [esi+edx+28]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
		addps		xmm5, xmm3

		movaps		[esi+ecx+16], xmm5

		// row 2 of the parent times the child matrix
		movss		xmm7, [esi+edx+36]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm7, xmm1
		addps		xmm6, xmm7
		movss		xmm3, [esi+edx+40]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm3, xmm2
		addps		xmm6, xmm3

		movss		xmm7, [esi+edx+44]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		addps		xmm6, xmm7

		movaps		[esi+ecx+32], xmm6

		add			ecx, JOINTMAT_SIZE
		add			eax, 4
		jle			loopJoint
	done:
	}

#else

	// scalar reference implementation
	int i;

	for( i = firstJoint; i <= lastJoint; i++ ) {
		assert( parents[i] < i );
		jointMats[i] *= jointMats[parents[i]];
	}

#endif
}
12371 
12372 /*
12373 ============
12374 idSIMD_SSE::UntransformJoints
12375 ============
12376 */
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// Inverse of TransformJoints: converts joints [firstJoint, lastJoint]
	// from object space back to parent-relative space,
	// jointMats[i] /= jointMats[parents[i]] (see the #else reference path).
	// Iterates from lastJoint DOWN to firstJoint so children are untransformed
	// before their parents are modified; requires parents[i] < i.
#if 1

	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm {

		// ecx = byte offset of the last joint matrix; eax counts down over
		// the parents array
		mov			edx, firstJoint
		mov			eax, lastJoint
		mov			ecx, eax
		sub			eax, edx
		jl			done
		mov			esi, jointMats
		imul		ecx, JOINTMAT_SIZE
		imul		edx, 4
		mov			edi, parents
		add			edi, edx
		imul		eax, 4

	loopJoint:

		// load the three rows of the child matrix
		movaps		xmm0, [esi+ecx+ 0]						// xmm0 = m0, m1, m2, t0
		mov			edx, [edi+eax]
		movaps		xmm1, [esi+ecx+16]						// xmm1 = m3, m4, m5, t1
		imul		edx, JOINTMAT_SIZE
		movaps		xmm2, [esi+ecx+32]						// xmm2 = m6, m7, m8, t2

		// subtract the parent translation from the translation lane
		movss		xmm6, [esi+edx+12]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		subps		xmm0, xmm6
		movss		xmm7, [esi+edx+28]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
		subps		xmm1, xmm7
		movss		xmm3, [esi+edx+44]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
		subps		xmm2, xmm3

		// multiply by the transpose of the parent rotation (its inverse,
		// since the rotation part is orthonormal)
		movss		xmm4, [esi+edx+ 0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm4, xmm0
		movss		xmm5, [esi+edx+16]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm5, xmm1
		addps		xmm4, xmm5
		movss		xmm6, [esi+edx+32]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm2
		addps		xmm4, xmm6

		movaps		[esi+ecx+ 0], xmm4

		movss		xmm5, [esi+edx+ 4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm5, xmm0
		movss		xmm6, [esi+edx+20]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm1
		addps		xmm5, xmm6
		movss		xmm7, [esi+edx+36]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm7, xmm2
		addps		xmm5, xmm7

		movaps		[esi+ecx+16], xmm5

		movss		xmm6, [esi+edx+ 8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm6, xmm0
		movss		xmm7, [esi+edx+24]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm7, xmm1
		addps		xmm6, xmm7
		movss		xmm3, [esi+edx+40]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm3, xmm2
		addps		xmm6, xmm3

		movaps		[esi+ecx+32], xmm6

		sub			ecx, JOINTMAT_SIZE
		sub			eax, 4
		jge			loopJoint
	done:
	}

#else

	// scalar reference implementation (reverse order on purpose)
	int i;

	for( i = lastJoint; i >= firstJoint; i-- ) {
		assert( parents[i] < i );
		jointMats[i] /= jointMats[parents[i]];
	}

#endif
}
12473 
12474 /*
12475 ============
12476 idSIMD_SSE::TransformVerts
12477 ============
12478 */
void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
	// Skins vertex positions: each vertex is the sum of one or more
	// weighted joint-matrix transforms (see the #else reference path).
	// index[] holds pairs of { joint-matrix byte offset, last-weight flag };
	// a nonzero second entry terminates the weight list for a vertex.
	// Only verts[i].xyz is written.
#if 1

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm
	{
		mov			eax, numVerts
		test		eax, eax
		jz			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, index
		mov			esi, weights
		mov			edi, joints

		add			ecx, eax
		neg			eax

	loopVert:
		// first weighted transform for this vertex
		mov			ebx, [edx]
		movaps		xmm2, [esi]
		add			edx, 8
		movaps		xmm0, xmm2
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]						// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]						// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]						// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0					// nonzero flag = last weight

		jne			doneWeight

	loopWeight:
		// accumulate additional weighted transforms
		mov			ebx, [edx]
		movaps		xmm5, [esi]
		add			edx, 8
		movaps		xmm3, xmm5
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]						// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]						// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]						// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0

		addps		xmm0, xmm3
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		je			loopWeight

	doneWeight:
		add			eax, DRAWVERT_SIZE

		// horizontal add of the three dot products into x, y, z
		movaps		xmm6, xmm0								// xmm6 =    m0,    m1,          m2,          t0
		unpcklps	xmm6, xmm1								// xmm6 =    m0,    m3,          m1,          m4
		unpckhps	xmm0, xmm1								// xmm1 =    m2,    m5,          t0,          t1
		addps		xmm6, xmm0								// xmm6 = m0+m2, m3+m5,       m1+t0,       m4+t1

		movaps		xmm7, xmm2								// xmm7 =    m6,    m7,          m8,          t2
		movlhps		xmm2, xmm6								// xmm2 =    m6,    m7,       m0+m2,       m3+m5
		movhlps		xmm6, xmm7								// xmm6 =    m8,    t2,       m1+t0,       m4+t1
		addps		xmm6, xmm2								// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm6			// store xyz.x, xyz.y

		movaps		xmm5, xmm6								// xmm5 = m6+m8, m7+t2
		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 )	// xmm5 = m7+t2, m6+m8
		addss		xmm5, xmm6								// xmm5 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm5			// store xyz.z

		jl			loopVert
	done:
	}

#else

	// scalar reference implementation
	int i, j;
	const byte *jointsPtr = (byte *)joints;

	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;

		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		while( index[j*2+1] == 0 ) {
			j++;
			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		}
		j++;

		verts[i].xyz = v;
	}

#endif
}
12583 
12584 /*
12585 ============
12586 idSIMD_SSE::TracePointCull
12587 ============
12588 */
TracePointCull(byte * cullBits,byte & totalOr,const float radius,const idPlane * planes,const idDrawVert * verts,const int numVerts)12589 void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
12590 #if 1
12591 
12592 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12593 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12594 
12595 	__asm {
12596 		push		ebx
12597 		mov			eax, numVerts
12598 		test		eax, eax
12599 		jz			done
12600 
12601 		mov			edi, planes
12602 		movlps		xmm1, [edi]								// xmm1 =  0,  1,  X,  X
12603 		movhps		xmm1, [edi+16]							// xmm1 =  0,  1,  4,  5
12604 		movlps		xmm3, [edi+8]							// xmm3 =  2,  3,  X,  X
12605 		movhps		xmm3, [edi+24]							// xmm3 =  2,  3,  6,  7
12606 		movlps		xmm4, [edi+32]							// xmm4 =  8,  9,  X,  X
12607 		movhps		xmm4, [edi+48]							// xmm4 =  8,  9, 12, 13
12608 		movlps		xmm5, [edi+40]							// xmm5 = 10, 11,  X,  X
12609 		movhps		xmm5, [edi+56]							// xmm5 = 10, 11, 14, 15
12610 		movaps		xmm0, xmm1								// xmm0 =  0,  1,  4,  5
12611 		shufps		xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm0 =  0,  4,  8, 12
12612 		shufps		xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm1 =  1,  5,  9, 13
12613 		movaps		xmm2, xmm3								// xmm2 =  2,  3,  6,  7
12614 		shufps		xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm2 =  2,  6, 10, 14
12615 		shufps		xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm3 =  3,  7, 11, 15
12616 		movss		xmm7, radius
12617 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12618 
12619 		xor			edx, edx
12620 		mov			esi, verts
12621 		mov			edi, cullBits
12622 		imul		eax, DRAWVERT_SIZE
12623 		add			esi, eax
12624 		neg			eax
12625 
12626 	loopVert:
12627 		movss		xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
12628 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12629 		movss		xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
12630 		mulps		xmm4, xmm0
12631 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12632 		movss		xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
12633 		mulps		xmm5, xmm1
12634 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12635 		addps		xmm4, xmm5
12636 		mulps		xmm6, xmm2
12637 		addps		xmm4, xmm3
12638 		addps		xmm4, xmm6
12639 		movaps		xmm5, xmm4
12640 		xorps		xmm5, SIMD_SP_signBitMask
12641 		cmpltps		xmm4, xmm7
12642 		movmskps	ecx, xmm4
12643 		cmpltps		xmm5, xmm7
12644 		movmskps	ebx, xmm5
12645 		shl			cx, 4
12646 		or			cl, bl
12647 		inc			edi
12648 		or			dl, cl
12649 		add			eax, DRAWVERT_SIZE
12650 		mov			byte ptr [edi-1], cl
12651 		jl			loopVert
12652 
12653 	done:
12654 		mov			esi, totalOr
12655 		mov			byte ptr [esi], dl
12656 		pop			ebx
12657 	}
12658 
12659 #else
12660 
12661 	int i;
12662 	byte tOr;
12663 
12664 	tOr = 0;
12665 
12666 	for ( i = 0; i < numVerts; i++ ) {
12667 		byte bits;
12668 		float d0, d1, d2, d3, t;
12669 		const idVec3 &v = verts[i].xyz;
12670 
12671 		d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12672 		d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12673 		d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12674 		d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12675 
12676 		t = d0 + radius;
12677 		bits  = FLOATSIGNBITSET( t ) << 0;
12678 		t = d1 + radius;
12679 		bits |= FLOATSIGNBITSET( t ) << 1;
12680 		t = d2 + radius;
12681 		bits |= FLOATSIGNBITSET( t ) << 2;
12682 		t = d3 + radius;
12683 		bits |= FLOATSIGNBITSET( t ) << 3;
12684 
12685 		t = d0 - radius;
12686 		bits |= FLOATSIGNBITSET( t ) << 4;
12687 		t = d1 - radius;
12688 		bits |= FLOATSIGNBITSET( t ) << 5;
12689 		t = d2 - radius;
12690 		bits |= FLOATSIGNBITSET( t ) << 6;
12691 		t = d3 - radius;
12692 		bits |= FLOATSIGNBITSET( t ) << 7;
12693 
12694 		bits ^= 0x0F;		// flip lower four bits
12695 
12696 		tOr |= bits;
12697 		cullBits[i] = bits;
12698 	}
12699 
12700 	totalOr = tOr;
12701 
12702 #endif
12703 }
12704 
12705 /*
12706 ============
12707 idSIMD_SSE::DecalPointCull
12708 ============
12709 */
DecalPointCull(byte * cullBits,const idPlane * planes,const idDrawVert * verts,const int numVerts)12710 void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
12711 #if 1
12712 
12713 	ALIGN16( float p0[4] );
12714 	ALIGN16( float p1[4] );
12715 	ALIGN16( float p2[4] );
12716 	ALIGN16( float p3[4] );
12717 	ALIGN16( float p4[4] );
12718 	ALIGN16( float p5[4] );
12719 	ALIGN16( float p6[4] );
12720 	ALIGN16( float p7[4] );
12721 
12722 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12723 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12724 
12725 	__asm {
12726 		mov			ecx, planes
12727 		movlps		xmm1, [ecx]								// xmm1 =  0,  1,  X,  X
12728 		movhps		xmm1, [ecx+16]							// xmm1 =  0,  1,  4,  5
12729 		movlps		xmm3, [ecx+8]							// xmm3 =  2,  3,  X,  X
12730 		movhps		xmm3, [ecx+24]							// xmm3 =  2,  3,  6,  7
12731 		movlps		xmm4, [ecx+32]							// xmm4 =  8,  9,  X,  X
12732 		movhps		xmm4, [ecx+48]							// xmm4 =  8,  9, 12, 13
12733 		movlps		xmm5, [ecx+40]							// xmm5 = 10, 11,  X,  X
12734 		movhps		xmm5, [ecx+56]							// xmm5 = 10, 11, 14, 15
12735 		movaps		xmm0, xmm1								// xmm0 =  0,  1,  4,  5
12736 		shufps		xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm0 =  0,  4,  8, 12
12737 		shufps		xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm1 =  1,  5,  9, 13
12738 		movaps		xmm2, xmm3								// xmm2 =  2,  3,  6,  7
12739 		shufps		xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm2 =  2,  6, 10, 14
12740 		shufps		xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm3 =  3,  7, 11, 15
12741 
12742 		movaps		p0, xmm0
12743 		movaps		p1, xmm1
12744 		movaps		p2, xmm2
12745 		movaps		p3, xmm3
12746 
12747 		movlps		xmm4, [ecx+64]							// xmm4 = p40, p41,   X,   X
12748 		movhps		xmm4, [ecx+80]							// xmm4 = p40, p41, p50, p51
12749 		movaps		xmm5, xmm4								// xmm5 = p40, p41, p50, p51
12750 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm4 = p40, p50, p40, p50
12751 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm5 = p41, p51, p41, p51
12752 		movlps		xmm6, [ecx+72]							// xmm6 = p42, p43,   X,   X
12753 		movhps		xmm6, [ecx+88]							// xmm6 = p42, p43, p52, p53
12754 		movaps		xmm7, xmm6								// xmm7 = p42, p43, p52, p53
12755 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )	// xmm6 = p42, p52, p42, p52
12756 		shufps		xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 )	// xmm7 = p43, p53, p43, p53
12757 
12758 		movaps		p4, xmm4
12759 		movaps		p5, xmm5
12760 		movaps		p6, xmm6
12761 		movaps		p7, xmm7
12762 
12763 		mov			esi, verts
12764 		mov			edi, cullBits
12765 		mov			eax, numVerts
12766 		and			eax, ~1
12767 		jz			done2
12768 		imul		eax, DRAWVERT_SIZE
12769 		add			esi, eax
12770 		neg			eax
12771 
12772 	loopVert2:
12773 		movaps		xmm6, p0
12774 		movss		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12775 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12776 		mulps		xmm6, xmm0
12777 		movaps		xmm7, p1
12778 		movss		xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12779 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12780 		mulps		xmm7, xmm1
12781 		addps		xmm6, xmm7
12782 		movaps		xmm7, p2
12783 		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12784 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12785 		mulps		xmm7, xmm2
12786 		addps		xmm6, xmm7
12787 		addps		xmm6, p3
12788 
12789 		cmpnltps	xmm6, SIMD_SP_zero
12790 		movmskps	ecx, xmm6
12791 
12792 		movaps		xmm6, p0
12793 		movss		xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12794 		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12795 		mulps		xmm6, xmm3
12796 		movaps		xmm7, p1
12797 		movss		xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12798 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12799 		mulps		xmm7, xmm4
12800 		addps		xmm6, xmm7
12801 		movaps		xmm7, p2
12802 		movss		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
12803 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12804 		mulps		xmm7, xmm5
12805 		addps		xmm6, xmm7
12806 		addps		xmm6, p3
12807 
12808 		cmpnltps	xmm6, SIMD_SP_zero
12809 		movmskps	edx, xmm6
12810 		mov			ch, dl
12811 
12812 		shufps		xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
12813 		mulps		xmm0, p4
12814 		shufps		xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
12815 		mulps		xmm1, p5
12816 		addps		xmm0, xmm1
12817 		shufps		xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12818 		mulps		xmm2, p6
12819 		addps		xmm0, xmm2
12820 		addps		xmm0, p7
12821 
12822 		cmpnltps	xmm0, SIMD_SP_zero
12823 		movmskps	edx, xmm0
12824 
12825 		add			edi, 2
12826 
12827 		mov			dh, dl
12828 		shl			dl, 4
12829 		shl			dh, 2
12830 		and			edx, (3<<4)|(3<<12)
12831 		or			ecx, edx
12832 
12833 		add			eax, 2*DRAWVERT_SIZE
12834 		mov			word ptr [edi-2], cx
12835 		jl			loopVert2
12836 
12837 	done2:
12838 
12839 		mov			eax, numVerts
12840 		and			eax, 1
12841 		jz			done
12842 
12843 		movaps		xmm6, p0
12844 		movss		xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
12845 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12846 		mulps		xmm6, xmm0
12847 		movaps		xmm7, p1
12848 		movss		xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
12849 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12850 		mulps		xmm7, xmm1
12851 		addps		xmm6, xmm7
12852 		movaps		xmm7, p2
12853 		movss		xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
12854 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12855 		mulps		xmm7, xmm2
12856 		addps		xmm6, xmm7
12857 		addps		xmm6, p3
12858 
12859 		cmpnltps	xmm6, SIMD_SP_zero
12860 		movmskps	ecx, xmm6
12861 
12862 		mulps		xmm0, p4
12863 		mulps		xmm1, p5
12864 		addps		xmm0, xmm1
12865 		mulps		xmm2, p6
12866 		addps		xmm0, xmm2
12867 		addps		xmm0, p7
12868 
12869 		cmpnltps	xmm0, SIMD_SP_zero
12870 		movmskps	edx, xmm0
12871 
12872 		and			edx, 3
12873 		shl			edx, 4
12874 		or			ecx, edx
12875 
12876 		mov			byte ptr [edi], cl
12877 
12878 	done:
12879 	}
12880 
12881 
12882 #else
12883 
12884 	int i;
12885 
12886 	for ( i = 0; i < numVerts; i += 2 ) {
12887 		unsigned short bits0, bits1;
12888 		float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
12889 		const idVec3 &v0 = verts[i+0].xyz;
12890 		const idVec3 &v1 = verts[i+1].xyz;
12891 
12892 		d0  = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
12893 		d1  = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
12894 		d2  = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
12895 		d3  = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
12896 
12897 		d4  = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
12898 		d5  = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
12899 		d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
12900 		d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
12901 
12902 		d6  = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
12903 		d7  = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
12904 		d8  = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
12905 		d9  = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
12906 
12907 		bits0  = FLOATSIGNBITSET( d0  ) << (0+0);
12908 		bits0 |= FLOATSIGNBITSET( d1  ) << (0+1);
12909 		bits0 |= FLOATSIGNBITSET( d2  ) << (0+2);
12910 		bits0 |= FLOATSIGNBITSET( d3  ) << (0+3);
12911 		bits0 |= FLOATSIGNBITSET( d4  ) << (0+4);
12912 		bits0 |= FLOATSIGNBITSET( d5  ) << (0+5);
12913 
12914 		bits1  = FLOATSIGNBITSET( d6  ) << (8+0);
12915 		bits1 |= FLOATSIGNBITSET( d7  ) << (8+1);
12916 		bits1 |= FLOATSIGNBITSET( d8  ) << (8+2);
12917 		bits1 |= FLOATSIGNBITSET( d9  ) << (8+3);
12918 		bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
12919 		bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
12920 
12921 		*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
12922 	}
12923 
12924 	if ( numVerts & 1 ) {
12925 		byte bits;
12926 		float d0, d1, d2, d3, d4, d5;
12927 		const idVec3 &v = verts[numVerts - 1].xyz;
12928 
12929 		d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
12930 		d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
12931 		d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
12932 		d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
12933 
12934 		d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
12935 		d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
12936 
12937 		bits  = FLOATSIGNBITSET( d0 ) << 0;
12938 		bits |= FLOATSIGNBITSET( d1 ) << 1;
12939 		bits |= FLOATSIGNBITSET( d2 ) << 2;
12940 		bits |= FLOATSIGNBITSET( d3 ) << 3;
12941 
12942 		bits |= FLOATSIGNBITSET( d4 ) << 4;
12943 		bits |= FLOATSIGNBITSET( d5 ) << 5;
12944 
12945 		cullBits[numVerts - 1] = bits ^ 0x3F;		// flip lower 6 bits
12946 	}
12947 
12948 #endif
12949 }
12950 
12951 /*
12952 ============
12953 idSIMD_SSE::OverlayPointCull
12954 ============
12955 */
OverlayPointCull(byte * cullBits,idVec2 * texCoords,const idPlane * planes,const idDrawVert * verts,const int numVerts)12956 void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
12957 #if 1
12958 
12959 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
12960 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
12961 
12962 	__asm {
12963 		mov			eax, numVerts
12964 		mov			edx, verts
12965 		mov			esi, texCoords
12966 		mov			edi, cullBits
12967 
12968 		mov			ecx, planes
12969 		movss		xmm4, [ecx+ 0]
12970 		movss		xmm5, [ecx+16]
12971 		shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
12972 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
12973 		movss		xmm5, [ecx+ 4]
12974 		movss		xmm6, [ecx+20]
12975 		shufps		xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
12976 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
12977 		movss		xmm6, [ecx+ 8]
12978 		movss		xmm7, [ecx+24]
12979 		shufps		xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
12980 		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
12981 		movss		xmm7, [ecx+12]
12982 		movss		xmm0, [ecx+28]
12983 		shufps		xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
12984 		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
12985 
12986 		and			eax, ~1
12987 		jz			done2
12988 		add			edi, eax
12989 		neg			eax
12990 
12991 	loopVert2:
12992 		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12993 		movss		xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
12994 		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
12995 		mulps		xmm0, xmm4
12996 		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12997 		movss		xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
12998 		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
12999 		mulps		xmm1, xmm5
13000 		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13001 		movss		xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13002 		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
13003 		mulps		xmm2, xmm6
13004 		addps		xmm0, xmm1
13005 		addps		xmm0, xmm2
13006 		addps		xmm0, xmm7
13007 		movaps		[esi], xmm0
13008 		movaps		xmm1, xmm0
13009 		movaps		xmm2, SIMD_SP_one
13010 		subps		xmm2, xmm0
13011 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13012 		shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
13013 		add			edx, 2*DRAWVERT_SIZE
13014 		movmskps	ecx, xmm0
13015 		mov			byte ptr [edi+eax+0], cl
13016 		add			esi, 4*4
13017 		movmskps	ecx, xmm1
13018 		mov			byte ptr [edi+eax+1], cl
13019 		add			eax, 2
13020 		jl			loopVert2
13021 
13022 	done2:
13023 		mov			eax, numVerts
13024 		and			eax, 1
13025 		jz			done
13026 
13027 		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
13028 		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
13029 		mulps		xmm0, xmm4
13030 		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
13031 		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
13032 		mulps		xmm1, xmm5
13033 		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
13034 		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
13035 		mulps		xmm2, xmm6
13036 		addps		xmm0, xmm1
13037 		addps		xmm0, xmm2
13038 		addps		xmm0, xmm7
13039 		movlps		[esi], xmm0
13040 		movaps		xmm1, xmm0
13041 		movaps		xmm2, SIMD_SP_one
13042 		subps		xmm2, xmm0
13043 		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
13044 		movmskps	ecx, xmm0
13045 		mov			byte ptr [edi], cl
13046 
13047 	done:
13048 	}
13049 
13050 #else
13051 
13052 	const idPlane &p0 = planes[0];
13053 	const idPlane &p1 = planes[1];
13054 
13055 	for ( int i = 0; i < numVerts - 1; i += 2 ) {
13056 		unsigned short bits;
13057 		float d0, d1, d2, d3;
13058 
13059 		const idVec3 &v0 = verts[i+0].xyz;
13060 		const idVec3 &v1 = verts[i+1].xyz;
13061 
13062 		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13063 		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13064 		d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
13065 		d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
13066 
13067 		texCoords[i+0][0] = d0;
13068 		texCoords[i+0][1] = d1;
13069 		texCoords[i+1][0] = d2;
13070 		texCoords[i+1][1] = d3;
13071 
13072 		bits  = FLOATSIGNBITSET( d0 ) << 0;
13073 		bits |= FLOATSIGNBITSET( d1 ) << 1;
13074 		bits |= FLOATSIGNBITSET( d2 ) << 8;
13075 		bits |= FLOATSIGNBITSET( d3 ) << 9;
13076 
13077 		d0 = 1.0f - d0;
13078 		d1 = 1.0f - d1;
13079 		d2 = 1.0f - d2;
13080 		d3 = 1.0f - d3;
13081 
13082 		bits |= FLOATSIGNBITSET( d0 ) << 2;
13083 		bits |= FLOATSIGNBITSET( d1 ) << 3;
13084 		bits |= FLOATSIGNBITSET( d2 ) << 10;
13085 		bits |= FLOATSIGNBITSET( d3 ) << 11;
13086 
13087 		*(unsigned short *)(cullBits + i) = bits;
13088 	}
13089 
13090 	if ( numVerts & 1 ) {
13091 		byte bits;
13092 		float d0, d1;
13093 
13094 		const idPlane &p0 = planes[0];
13095 		const idPlane &p1 = planes[1];
13096 		const idVec3 &v0 = verts[numVerts - 1].xyz;
13097 
13098 		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
13099 		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
13100 
13101 		texCoords[i][0] = d0;
13102 		texCoords[i][1] = d1;
13103 
13104 		bits  = FLOATSIGNBITSET( d0 ) << 0;
13105 		bits |= FLOATSIGNBITSET( d1 ) << 1;
13106 
13107 		d0 = 1.0f - d0;
13108 		d1 = 1.0f - d1;
13109 
13110 		bits |= FLOATSIGNBITSET( d0 ) << 2;
13111 		bits |= FLOATSIGNBITSET( d1 ) << 3;
13112 
13113 		cullBits[numVerts - 1] = bits;
13114 	}
13115 
13116 #endif
13117 }
13118 
13119 /*
13120 ============
13121 idSIMD_SSE::DeriveTriPlanes
13122 ============
13123 */
DeriveTriPlanes(idPlane * planes,const idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)13124 void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
13125 #if 1
13126 
13127 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
13128 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
13129 
13130 	__asm {
13131 		mov			eax, numIndexes
13132 		shl			eax, 2
13133 		mov			esi, verts
13134 		mov			edi, indexes
13135 		mov			edx, planes
13136 
13137 		add			edi, eax
13138 		neg			eax
13139 
13140 		add			eax, 4*12
13141 		jge			done4
13142 
13143 	loopPlane4:
13144 		mov			ebx, [edi+eax-4*12+4]
13145 		imul		ebx, DRAWVERT_SIZE
13146 		mov			ecx, [edi+eax-4*12+0]
13147 		imul		ecx, DRAWVERT_SIZE
13148 
13149 		movss		xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13150 		subss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13151 
13152 		movss		xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13153 		subss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13154 
13155 		movss		xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13156 		subss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13157 
13158 		mov			ebx, [edi+eax-4*12+8]
13159 		imul		ebx, DRAWVERT_SIZE
13160 
13161 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13162 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13163 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13164 
13165 		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13166 		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13167 
13168 		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13169 		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13170 
13171 		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13172 		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13173 
13174 		mov			ebx, [edi+eax-3*12+4]
13175 		imul		ebx, DRAWVERT_SIZE
13176 		mov			ecx, [edi+eax-3*12+0]
13177 		imul		ecx, DRAWVERT_SIZE
13178 
13179 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13180 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13181 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13182 
13183 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13184 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13185 		movss		xmm0, xmm6
13186 
13187 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13188 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13189 		movss		xmm1, xmm7
13190 
13191 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13192 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13193 		movss		xmm2, xmm6
13194 
13195 		mov			ebx, [edi+eax-3*12+8]
13196 		imul		ebx, DRAWVERT_SIZE
13197 
13198 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13199 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13200 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13201 
13202 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13203 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13204 		movss		xmm3, xmm7
13205 
13206 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13207 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13208 		movss		xmm4, xmm6
13209 
13210 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13211 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13212 		movss		xmm5, xmm7
13213 
13214 		mov			ebx, [edi+eax-2*12+4]
13215 		imul		ebx, DRAWVERT_SIZE
13216 		mov			ecx, [edi+eax-2*12+0]
13217 		imul		ecx, DRAWVERT_SIZE
13218 
13219 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13220 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13221 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13222 
13223 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13224 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13225 		movss		xmm0, xmm6
13226 
13227 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13228 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13229 		movss		xmm1, xmm7
13230 
13231 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13232 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13233 		movss		xmm2, xmm6
13234 
13235 		mov			ebx, [edi+eax-2*12+8]
13236 		imul		ebx, DRAWVERT_SIZE
13237 
13238 		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
13239 		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
13240 		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
13241 
13242 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13243 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13244 		movss		xmm3, xmm7
13245 
13246 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13247 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13248 		movss		xmm4, xmm6
13249 
13250 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13251 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13252 		movss		xmm5, xmm7
13253 
13254 		mov			ebx, [edi+eax-1*12+4]
13255 		imul		ebx, DRAWVERT_SIZE
13256 		mov			ecx, [edi+eax-1*12+0]
13257 		imul		ecx, DRAWVERT_SIZE
13258 
13259 		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
13260 		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
13261 		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
13262 
13263 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13264 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13265 		movss		xmm0, xmm6
13266 
13267 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13268 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13269 		movss		xmm1, xmm7
13270 
13271 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13272 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13273 		movss		xmm2, xmm6
13274 
13275 		mov			ebx, [edi+eax-1*12+8]
13276 		imul		ebx, DRAWVERT_SIZE
13277 
13278 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13279 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13280 		movss		xmm3, xmm7
13281 
13282 		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13283 		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13284 		movss		xmm4, xmm6
13285 
13286 		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13287 		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13288 		movss		xmm5, xmm7
13289 
13290 		movaps		xmm6, xmm4
13291 		mulps		xmm6, xmm2
13292 		movaps		xmm7, xmm5
13293 		mulps		xmm7, xmm1
13294 		subps		xmm6, xmm7
13295 
13296 		mulps		xmm5, xmm0
13297 		mulps		xmm2, xmm3
13298 		subps		xmm5, xmm2
13299 
13300 		mulps		xmm3, xmm1
13301 		mulps		xmm4, xmm0
13302 		subps		xmm3, xmm4
13303 
13304 		movaps		xmm0, xmm6
13305 		mulps		xmm6, xmm6
13306 		movaps		xmm1, xmm5
13307 		mulps		xmm5, xmm5
13308 		movaps		xmm2, xmm3
13309 		mulps		xmm3, xmm3
13310 
13311 		addps		xmm3, xmm5
13312 		addps		xmm3, xmm6
13313 		rsqrtps		xmm3, xmm3
13314 
13315 		add			edx, 4*16
13316 		mov			ecx, [edi+eax-1*12+0]
13317 		imul		ecx, DRAWVERT_SIZE
13318 
13319 		mulps		xmm0, xmm3
13320 		mulps		xmm1, xmm3
13321 		mulps		xmm2, xmm3
13322 
13323 		movss		[edx-1*16+0], xmm0
13324 		movss		[edx-1*16+4], xmm1
13325 		movss		[edx-1*16+8], xmm2
13326 
13327 		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13328 		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13329 		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13330 
13331 		xorps		xmm0, SIMD_SP_singleSignBitMask
13332 		subss		xmm0, xmm1
13333 		subss		xmm0, xmm2
13334 		movss		[edx-1*16+12], xmm0
13335 
13336 		mov			ecx, [edi+eax-2*12+0]
13337 		imul		ecx, DRAWVERT_SIZE
13338 
13339 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13340 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13341 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13342 
13343 		movss		[edx-2*16+0], xmm0
13344 		movss		[edx-2*16+4], xmm1
13345 		movss		[edx-2*16+8], xmm2
13346 
13347 		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13348 		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13349 		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13350 
13351 		xorps		xmm0, SIMD_SP_singleSignBitMask
13352 		subss		xmm0, xmm1
13353 		subss		xmm0, xmm2
13354 		movss		[edx-2*16+12], xmm0
13355 
13356 		mov			ecx, [edi+eax-3*12+0]
13357 		imul		ecx, DRAWVERT_SIZE
13358 
13359 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13360 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13361 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13362 
13363 		movss		[edx-3*16+0], xmm0
13364 		movss		[edx-3*16+4], xmm1
13365 		movss		[edx-3*16+8], xmm2
13366 
13367 		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13368 		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13369 		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13370 
13371 		xorps		xmm0, SIMD_SP_singleSignBitMask
13372 		subss		xmm0, xmm1
13373 		subss		xmm0, xmm2
13374 		movss		[edx-3*16+12], xmm0
13375 
13376 		mov			ecx, [edi+eax-4*12+0]
13377 		imul		ecx, DRAWVERT_SIZE
13378 
13379 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
13380 		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
13381 		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
13382 
13383 		movss		[edx-4*16+0], xmm0
13384 		movss		[edx-4*16+4], xmm1
13385 		movss		[edx-4*16+8], xmm2
13386 
13387 		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13388 		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13389 		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13390 
13391 		xorps		xmm0, SIMD_SP_singleSignBitMask
13392 		subss		xmm0, xmm1
13393 		subss		xmm0, xmm2
13394 		movss		[edx-4*16+12], xmm0
13395 
13396 		add			eax, 4*12
13397 		jle			loopPlane4
13398 
13399 	done4:
13400 
13401 		sub			eax, 4*12
13402 		jge			done
13403 
13404 	loopPlane1:
13405 		mov			ebx, [edi+eax+4]
13406 		imul		ebx, DRAWVERT_SIZE
13407 		mov			ecx, [edi+eax+0]
13408 		imul		ecx, DRAWVERT_SIZE
13409 
13410 		movss		xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13411 		subss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13412 
13413 		movss		xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13414 		subss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13415 
13416 		movss		xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13417 		subss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13418 
13419 		mov			ebx, [edi+eax+8]
13420 		imul		ebx, DRAWVERT_SIZE
13421 
13422 		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
13423 		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13424 
13425 		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
13426 		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13427 
13428 		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
13429 		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13430 
13431 		movss		xmm6, xmm4
13432 		mulss		xmm6, xmm2
13433 		movss		xmm7, xmm5
13434 		mulss		xmm7, xmm1
13435 		subss		xmm6, xmm7
13436 
13437 		mulss		xmm5, xmm0
13438 		mulss		xmm2, xmm3
13439 		subss		xmm5, xmm2
13440 
13441 		mulss		xmm3, xmm1
13442 		mulss		xmm4, xmm0
13443 		subss		xmm3, xmm4
13444 
13445 		movss		xmm0, xmm6
13446 		mulss		xmm6, xmm6
13447 		movss		xmm1, xmm5
13448 		mulss		xmm5, xmm5
13449 		movss		xmm2, xmm3
13450 		mulss		xmm3, xmm3
13451 
13452 		addss		xmm3, xmm5
13453 		addss		xmm3, xmm6
13454 		rsqrtss		xmm3, xmm3
13455 
13456 		add			edx, 1*16
13457 
13458 		mulss		xmm0, xmm3
13459 		mulss		xmm1, xmm3
13460 		mulss		xmm2, xmm3
13461 
13462 		movss		[edx-1*16+0], xmm0
13463 		movss		[edx-1*16+4], xmm1
13464 		movss		[edx-1*16+8], xmm2
13465 
13466 		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
13467 		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
13468 		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
13469 
13470 		xorps		xmm0, SIMD_SP_singleSignBitMask
13471 		subss		xmm0, xmm1
13472 		subss		xmm0, xmm2
13473 		movss		[edx-1*16+12], xmm0
13474 
13475 		add			eax, 1*12
13476 		jl			loopPlane1
13477 
13478 	done:
13479 	}
13480 
13481 #else
13482 
13483 	int i, j;
13484 
13485 	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
13486 		ALIGN16( float d0[4] );
13487 		ALIGN16( float d1[4] );
13488 		ALIGN16( float d2[4] );
13489 		ALIGN16( float d3[4] );
13490 		ALIGN16( float d4[4] );
13491 		ALIGN16( float d5[4] );
13492 		ALIGN16( float n0[4] );
13493 		ALIGN16( float n1[4] );
13494 		ALIGN16( float n2[4] );
13495 
13496 		for ( j = 0; j < 4; j++ ) {
13497 			const idDrawVert *a, *b, *c;
13498 
13499 			a = verts + indexes[i + j * 3 + 0];
13500 			b = verts + indexes[i + j * 3 + 1];
13501 			c = verts + indexes[i + j * 3 + 2];
13502 
13503 			d0[j] = b->xyz[0] - a->xyz[0];
13504 			d1[j] = b->xyz[1] - a->xyz[1];
13505 			d2[j] = b->xyz[2] - a->xyz[2];
13506 
13507 			d3[j] = c->xyz[0] - a->xyz[0];
13508 			d4[j] = c->xyz[1] - a->xyz[1];
13509 			d5[j] = c->xyz[2] - a->xyz[2];
13510 		}
13511 
13512 		ALIGN16( float tmp[4] );
13513 
13514 		n0[0] = d4[0] * d2[0];
13515 		n0[1] = d4[1] * d2[1];
13516 		n0[2] = d4[2] * d2[2];
13517 		n0[3] = d4[3] * d2[3];
13518 
13519 		n0[0] -= d5[0] * d1[0];
13520 		n0[1] -= d5[1] * d1[1];
13521 		n0[2] -= d5[2] * d1[2];
13522 		n0[3] -= d5[3] * d1[3];
13523 
13524 		n1[0] = d5[0] * d0[0];
13525 		n1[1] = d5[1] * d0[1];
13526 		n1[2] = d5[2] * d0[2];
13527 		n1[3] = d5[3] * d0[3];
13528 
13529 		n1[0] -= d3[0] * d2[0];
13530 		n1[1] -= d3[1] * d2[1];
13531 		n1[2] -= d3[2] * d2[2];
13532 		n1[3] -= d3[3] * d2[3];
13533 
13534 		n2[0] = d3[0] * d1[0];
13535 		n2[1] = d3[1] * d1[1];
13536 		n2[2] = d3[2] * d1[2];
13537 		n2[3] = d3[3] * d1[3];
13538 
13539 		n2[0] -= d4[0] * d0[0];
13540 		n2[1] -= d4[1] * d0[1];
13541 		n2[2] -= d4[2] * d0[2];
13542 		n2[3] -= d4[3] * d0[3];
13543 
13544 		tmp[0] = n0[0] * n0[0];
13545 		tmp[1] = n0[1] * n0[1];
13546 		tmp[2] = n0[2] * n0[2];
13547 		tmp[3] = n0[3] * n0[3];
13548 
13549 		tmp[0] += n1[0] * n1[0];
13550 		tmp[1] += n1[1] * n1[1];
13551 		tmp[2] += n1[2] * n1[2];
13552 		tmp[3] += n1[3] * n1[3];
13553 
13554 		tmp[0] += n2[0] * n2[0];
13555 		tmp[1] += n2[1] * n2[1];
13556 		tmp[2] += n2[2] * n2[2];
13557 		tmp[3] += n2[3] * n2[3];
13558 
13559 		tmp[0] = idMath::RSqrt( tmp[0] );
13560 		tmp[1] = idMath::RSqrt( tmp[1] );
13561 		tmp[2] = idMath::RSqrt( tmp[2] );
13562 		tmp[3] = idMath::RSqrt( tmp[3] );
13563 
13564 		n0[0] *= tmp[0];
13565 		n0[1] *= tmp[1];
13566 		n0[2] *= tmp[2];
13567 		n0[3] *= tmp[3];
13568 
13569 		n1[0] *= tmp[0];
13570 		n1[1] *= tmp[1];
13571 		n1[2] *= tmp[2];
13572 		n1[3] *= tmp[3];
13573 
13574 		n2[0] *= tmp[0];
13575 		n2[1] *= tmp[1];
13576 		n2[2] *= tmp[2];
13577 		n2[3] *= tmp[3];
13578 
13579 
13580 		for ( j = 0; j < 4; j++ ) {
13581 			const idDrawVert *a;
13582 
13583 			a = verts + indexes[i + j * 3];
13584 
13585 			planes->Normal()[0] = n0[j];
13586 			planes->Normal()[1] = n1[j];
13587 			planes->Normal()[2] = n2[j];
13588 			planes->FitThroughPoint( a->xyz );
13589 			planes++;
13590 		}
13591 	}
13592 
13593 	for ( ; i < numIndexes; i += 3 ) {
13594 		const idDrawVert *a, *b, *c;
13595 		float d0, d1, d2, d3, d4, d5;
13596 		float n0, n1, n2;
13597 
13598 		a = verts + indexes[i + 0];
13599 		b = verts + indexes[i + 1];
13600 		c = verts + indexes[i + 2];
13601 
13602 		d0 = b->xyz[0] - a->xyz[0];
13603 		d1 = b->xyz[1] - a->xyz[1];
13604 		d2 = b->xyz[2] - a->xyz[2];
13605 
13606 		d3 = c->xyz[0] - a->xyz[0];
13607 		d4 = c->xyz[1] - a->xyz[1];
13608 		d5 = c->xyz[2] - a->xyz[2];
13609 
13610 		float tmp;
13611 
13612 		n0 = d4 * d2 - d5 * d1;
13613 		n1 = d5 * d0 - d3 * d2;
13614 		n2 = d3 * d1 - d4 * d0;
13615 
13616 		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
13617 
13618 		n0 *= tmp;
13619 		n1 *= tmp;
13620 		n2 *= tmp;
13621 
13622 		planes->Normal()[0] = n0;
13623 		planes->Normal()[1] = n1;
13624 		planes->Normal()[2] = n2;
13625 		planes->FitThroughPoint( a->xyz );
13626 		planes++;
13627 	}
13628 
13629 #endif
13630 }
13631 
13632 /*
13633 ============
13634 idSIMD_SSE::DeriveTangents
13635 ============
13636 */
13637 //#define REFINE_TANGENT_SQUAREROOT
13638 #define FIX_DEGENERATE_TANGENT
13639 
DeriveTangents(idPlane * planes,idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)13640 void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
13641 	int i;
13642 
13643 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
13644 	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
13645 	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
13646 	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
13647 
13648 	assert( planes != NULL );
13649 	assert( verts != NULL );
13650 	assert( numVerts >= 0 );
13651 
13652 #ifdef REFINE_TANGENT_SQUAREROOT
13653 	__asm {
13654 		movaps		xmm6, SIMD_SP_rsqrt_c0
13655 		movaps		xmm7, SIMD_SP_rsqrt_c1
13656 	}
13657 #endif
13658 
13659 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
13660 	memset( used, 0, numVerts * sizeof( used[0] ) );
13661 
13662 	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
13663 		idDrawVert *a, *b, *c;
13664 		ALIGN16( unsigned int signBit[4] );
13665 		ALIGN16( float d0[4] );
13666 		ALIGN16( float d1[4] );
13667 		ALIGN16( float d2[4] );
13668 		ALIGN16( float d3[4] );
13669 		ALIGN16( float d4[4] );
13670 		ALIGN16( float d5[4] );
13671 		ALIGN16( float d6[4] );
13672 		ALIGN16( float d7[4] );
13673 		ALIGN16( float d8[4] );
13674 		ALIGN16( float d9[4] );
13675 		ALIGN16( float n0[4] );
13676 		ALIGN16( float n1[4] );
13677 		ALIGN16( float n2[4] );
13678 		ALIGN16( float t0[4] );
13679 		ALIGN16( float t1[4] );
13680 		ALIGN16( float t2[4] );
13681 		ALIGN16( float t3[4] );
13682 		ALIGN16( float t4[4] );
13683 		ALIGN16( float t5[4] );
13684 
13685 		for ( int j = 0; j < 4; j++ ) {
13686 
13687 			a = verts + indexes[i + j * 3 + 0];
13688 			b = verts + indexes[i + j * 3 + 1];
13689 			c = verts + indexes[i + j * 3 + 2];
13690 
13691 			d0[j] = b->xyz[0] - a->xyz[0];
13692 			d1[j] = b->xyz[1] - a->xyz[1];
13693 			d2[j] = b->xyz[2] - a->xyz[2];
13694 			d3[j] = b->st[0] - a->st[0];
13695 			d4[j] = b->st[1] - a->st[1];
13696 
13697 			d5[j] = c->xyz[0] - a->xyz[0];
13698 			d6[j] = c->xyz[1] - a->xyz[1];
13699 			d7[j] = c->xyz[2] - a->xyz[2];
13700 			d8[j] = c->st[0] - a->st[0];
13701 			d9[j] = c->st[1] - a->st[1];
13702 		}
13703 
13704 #if 1
13705 
13706 		__asm {
13707 			// normal
13708 			movaps		xmm0, d6
13709 			mulps		xmm0, d2
13710 			movaps		xmm1, d7
13711 			mulps		xmm1, d1
13712 			subps		xmm0, xmm1
13713 
13714 			movaps		xmm1, d7
13715 			mulps		xmm1, d0
13716 			movaps		xmm2, d5
13717 			mulps		xmm2, d2
13718 			subps		xmm1, xmm2
13719 
13720 			movaps		xmm2, d5
13721 			mulps		xmm2, d1
13722 			movaps		xmm3, d6
13723 			mulps		xmm3, d0
13724 			subps		xmm2, xmm3
13725 
13726 			movaps		xmm3, xmm0
13727 			movaps		xmm4, xmm1
13728 			movaps		xmm5, xmm2
13729 
13730 			mulps		xmm3, xmm3
13731 			mulps		xmm4, xmm4
13732 			mulps		xmm5, xmm5
13733 
13734 			addps		xmm3, xmm4
13735 			addps		xmm3, xmm5
13736 
13737 #ifdef FIX_DEGENERATE_TANGENT
13738 			xorps		xmm4, xmm4
13739 			cmpeqps		xmm4, xmm3
13740 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
13741 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
13742 			orps		xmm3, xmm4
13743 #endif
13744 
13745 #ifdef REFINE_TANGENT_SQUAREROOT
13746 			rsqrtps		xmm4, xmm3
13747 			mulps		xmm3, xmm4
13748 			mulps		xmm3, xmm4
13749 			subps		xmm3, xmm6
13750 			mulps		xmm4, xmm7
13751 			mulps		xmm3, xmm4
13752 #else
13753 			rsqrtps		xmm3, xmm3
13754 #endif
13755 			mulps		xmm0, xmm3
13756 			movaps		n0, xmm0
13757 			mulps		xmm1, xmm3
13758 			movaps		n1, xmm1
13759 			mulps		xmm2, xmm3
13760 			movaps		n2, xmm2
13761 
13762 			// area sign bit
13763 			movaps		xmm0, d3
13764 			mulps		xmm0, d9
13765 			movaps		xmm1, d4
13766 			mulps		xmm1, d8
13767 			subps		xmm0, xmm1
13768 			andps		xmm0, SIMD_SP_signBitMask
13769 			movaps		signBit, xmm0
13770 
13771 			// first tangent
13772 			movaps		xmm0, d0
13773 			mulps		xmm0, d9
13774 			movaps		xmm1, d4
13775 			mulps		xmm1, d5
13776 			subps		xmm0, xmm1
13777 
13778 			movaps		xmm1, d1
13779 			mulps		xmm1, d9
13780 			movaps		xmm2, d4
13781 			mulps		xmm2, d6
13782 			subps		xmm1, xmm2
13783 
13784 			movaps		xmm2, d2
13785 			mulps		xmm2, d9
13786 			movaps		xmm3, d4
13787 			mulps		xmm3, d7
13788 			subps		xmm2, xmm3
13789 
13790 			movaps		xmm3, xmm0
13791 			movaps		xmm4, xmm1
13792 			movaps		xmm5, xmm2
13793 
13794 			mulps		xmm3, xmm3
13795 			mulps		xmm4, xmm4
13796 			mulps		xmm5, xmm5
13797 
13798 			addps		xmm3, xmm4
13799 			addps		xmm3, xmm5
13800 
13801 #ifdef FIX_DEGENERATE_TANGENT
13802 			xorps		xmm4, xmm4
13803 			cmpeqps		xmm4, xmm3
13804 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
13805 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
13806 			orps		xmm3, xmm4
13807 #endif
13808 
13809 #ifdef REFINE_TANGENT_SQUAREROOT
13810 			rsqrtps		xmm4, xmm3
13811 			mulps		xmm3, xmm4
13812 			mulps		xmm3, xmm4
13813 			subps		xmm3, xmm6
13814 			mulps		xmm4, xmm7
13815 			mulps		xmm3, xmm4
13816 #else
13817 			rsqrtps		xmm3, xmm3
13818 #endif
13819 			xorps		xmm3, signBit
13820 
13821 			mulps		xmm0, xmm3
13822 			movaps		t0, xmm0
13823 			mulps		xmm1, xmm3
13824 			movaps		t1, xmm1
13825 			mulps		xmm2, xmm3
13826 			movaps		t2, xmm2
13827 
13828 			// second tangent
13829 			movaps		xmm0, d3
13830 			mulps		xmm0, d5
13831 			movaps		xmm1, d0
13832 			mulps		xmm1, d8
13833 			subps		xmm0, xmm1
13834 
13835 			movaps		xmm1, d3
13836 			mulps		xmm1, d6
13837 			movaps		xmm2, d1
13838 			mulps		xmm2, d8
13839 			subps		xmm1, xmm2
13840 
13841 			movaps		xmm2, d3
13842 			mulps		xmm2, d7
13843 			movaps		xmm3, d2
13844 			mulps		xmm3, d8
13845 			subps		xmm2, xmm3
13846 
13847 			movaps		xmm3, xmm0
13848 			movaps		xmm4, xmm1
13849 			movaps		xmm5, xmm2
13850 
13851 			mulps		xmm3, xmm3
13852 			mulps		xmm4, xmm4
13853 			mulps		xmm5, xmm5
13854 
13855 			addps		xmm3, xmm4
13856 			addps		xmm3, xmm5
13857 
13858 #ifdef FIX_DEGENERATE_TANGENT
13859 			xorps		xmm4, xmm4
13860 			cmpeqps		xmm4, xmm3
13861 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
13862 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
13863 			orps		xmm3, xmm4
13864 #endif
13865 
13866 #ifdef REFINE_TANGENT_SQUAREROOT
13867 			rsqrtps		xmm4, xmm3
13868 			mulps		xmm3, xmm4
13869 			mulps		xmm3, xmm4
13870 			subps		xmm3, xmm6
13871 			mulps		xmm4, xmm7
13872 			mulps		xmm3, xmm4
13873 #else
13874 			rsqrtps		xmm3, xmm3
13875 #endif
13876 			xorps		xmm3, signBit
13877 
13878 			mulps		xmm0, xmm3
13879 			movaps		t3, xmm0
13880 			mulps		xmm1, xmm3
13881 			movaps		t4, xmm1
13882 			mulps		xmm2, xmm3
13883 			movaps		t5, xmm2
13884 		}
13885 
13886 #else
13887 
13888 		ALIGN16( float tmp[4] );
13889 
13890 		// normal
13891 		n0[0] = d6[0] * d2[0];
13892 		n0[1] = d6[1] * d2[1];
13893 		n0[2] = d6[2] * d2[2];
13894 		n0[3] = d6[3] * d2[3];
13895 
13896 		n0[0] -= d7[0] * d1[0];
13897 		n0[1] -= d7[1] * d1[1];
13898 		n0[2] -= d7[2] * d1[2];
13899 		n0[3] -= d7[3] * d1[3];
13900 
13901 		n1[0] = d7[0] * d0[0];
13902 		n1[1] = d7[1] * d0[1];
13903 		n1[2] = d7[2] * d0[2];
13904 		n1[3] = d7[3] * d0[3];
13905 
13906 		n1[0] -= d5[0] * d2[0];
13907 		n1[1] -= d5[1] * d2[1];
13908 		n1[2] -= d5[2] * d2[2];
13909 		n1[3] -= d5[3] * d2[3];
13910 
13911 		n2[0] = d5[0] * d1[0];
13912 		n2[1] = d5[1] * d1[1];
13913 		n2[2] = d5[2] * d1[2];
13914 		n2[3] = d5[3] * d1[3];
13915 
13916 		n2[0] -= d6[0] * d0[0];
13917 		n2[1] -= d6[1] * d0[1];
13918 		n2[2] -= d6[2] * d0[2];
13919 		n2[3] -= d6[3] * d0[3];
13920 
13921 		tmp[0] = n0[0] * n0[0];
13922 		tmp[1] = n0[1] * n0[1];
13923 		tmp[2] = n0[2] * n0[2];
13924 		tmp[3] = n0[3] * n0[3];
13925 
13926 		tmp[0] += n1[0] * n1[0];
13927 		tmp[1] += n1[1] * n1[1];
13928 		tmp[2] += n1[2] * n1[2];
13929 		tmp[3] += n1[3] * n1[3];
13930 
13931 		tmp[0] += n2[0] * n2[0];
13932 		tmp[1] += n2[1] * n2[1];
13933 		tmp[2] += n2[2] * n2[2];
13934 		tmp[3] += n2[3] * n2[3];
13935 
13936 		tmp[0] = idMath::RSqrt( tmp[0] );
13937 		tmp[1] = idMath::RSqrt( tmp[1] );
13938 		tmp[2] = idMath::RSqrt( tmp[2] );
13939 		tmp[3] = idMath::RSqrt( tmp[3] );
13940 
13941 		n0[0] *= tmp[0];
13942 		n0[1] *= tmp[1];
13943 		n0[2] *= tmp[2];
13944 		n0[3] *= tmp[3];
13945 
13946 		n1[0] *= tmp[0];
13947 		n1[1] *= tmp[1];
13948 		n1[2] *= tmp[2];
13949 		n1[3] *= tmp[3];
13950 
13951 		n2[0] *= tmp[0];
13952 		n2[1] *= tmp[1];
13953 		n2[2] *= tmp[2];
13954 		n2[3] *= tmp[3];
13955 
13956 		// area sign bit
13957 		tmp[0] = d3[0] * d9[0];
13958 		tmp[1] = d3[1] * d9[1];
13959 		tmp[2] = d3[2] * d9[2];
13960 		tmp[3] = d3[3] * d9[3];
13961 
13962 		tmp[0] -= d4[0] * d8[0];
13963 		tmp[1] -= d4[1] * d8[1];
13964 		tmp[2] -= d4[2] * d8[2];
13965 		tmp[3] -= d4[3] * d8[3];
13966 
13967 		signBit[0] = ( *(unsigned int *)&tmp[0] ) & ( 1 << 31 );
13968 		signBit[1] = ( *(unsigned int *)&tmp[1] ) & ( 1 << 31 );
13969 		signBit[2] = ( *(unsigned int *)&tmp[2] ) & ( 1 << 31 );
13970 		signBit[3] = ( *(unsigned int *)&tmp[3] ) & ( 1 << 31 );
13971 
13972 		// first tangent
13973 		t0[0] = d0[0] * d9[0];
13974 		t0[1] = d0[1] * d9[1];
13975 		t0[2] = d0[2] * d9[2];
13976 		t0[3] = d0[3] * d9[3];
13977 
13978 		t0[0] -= d4[0] * d5[0];
13979 		t0[1] -= d4[1] * d5[1];
13980 		t0[2] -= d4[2] * d5[2];
13981 		t0[3] -= d4[3] * d5[3];
13982 
13983 		t1[0] = d1[0] * d9[0];
13984 		t1[1] = d1[1] * d9[1];
13985 		t1[2] = d1[2] * d9[2];
13986 		t1[3] = d1[3] * d9[3];
13987 
13988 		t1[0] -= d4[0] * d6[0];
13989 		t1[1] -= d4[1] * d6[1];
13990 		t1[2] -= d4[2] * d6[2];
13991 		t1[3] -= d4[3] * d6[3];
13992 
13993 		t2[0] = d2[0] * d9[0];
13994 		t2[1] = d2[1] * d9[1];
13995 		t2[2] = d2[2] * d9[2];
13996 		t2[3] = d2[3] * d9[3];
13997 
13998 		t2[0] -= d4[0] * d7[0];
13999 		t2[1] -= d4[1] * d7[1];
14000 		t2[2] -= d4[2] * d7[2];
14001 		t2[3] -= d4[3] * d7[3];
14002 
14003 		tmp[0] = t0[0] * t0[0];
14004 		tmp[1] = t0[1] * t0[1];
14005 		tmp[2] = t0[2] * t0[2];
14006 		tmp[3] = t0[3] * t0[3];
14007 
14008 		tmp[0] += t1[0] * t1[0];
14009 		tmp[1] += t1[1] * t1[1];
14010 		tmp[2] += t1[2] * t1[2];
14011 		tmp[3] += t1[3] * t1[3];
14012 
14013 		tmp[0] += t2[0] * t2[0];
14014 		tmp[1] += t2[1] * t2[1];
14015 		tmp[2] += t2[2] * t2[2];
14016 		tmp[3] += t2[3] * t2[3];
14017 
14018 		tmp[0] = idMath::RSqrt( tmp[0] );
14019 		tmp[1] = idMath::RSqrt( tmp[1] );
14020 		tmp[2] = idMath::RSqrt( tmp[2] );
14021 		tmp[3] = idMath::RSqrt( tmp[3] );
14022 
14023 		*(unsigned int *)&tmp[0] ^= signBit[0];
14024 		*(unsigned int *)&tmp[1] ^= signBit[1];
14025 		*(unsigned int *)&tmp[2] ^= signBit[2];
14026 		*(unsigned int *)&tmp[3] ^= signBit[3];
14027 
14028 		t0[0] *= tmp[0];
14029 		t0[1] *= tmp[1];
14030 		t0[2] *= tmp[2];
14031 		t0[3] *= tmp[3];
14032 
14033 		t1[0] *= tmp[0];
14034 		t1[1] *= tmp[1];
14035 		t1[2] *= tmp[2];
14036 		t1[3] *= tmp[3];
14037 
14038 		t2[0] *= tmp[0];
14039 		t2[1] *= tmp[1];
14040 		t2[2] *= tmp[2];
14041 		t2[3] *= tmp[3];
14042 
14043 		// second tangent
14044 		t3[0] = d3[0] * d5[0];
14045 		t3[1] = d3[1] * d5[1];
14046 		t3[2] = d3[2] * d5[2];
14047 		t3[3] = d3[3] * d5[3];
14048 
14049 		t3[0] -= d0[0] * d8[0];
14050 		t3[1] -= d0[1] * d8[1];
14051 		t3[2] -= d0[2] * d8[2];
14052 		t3[3] -= d0[3] * d8[3];
14053 
14054 		t4[0] = d3[0] * d6[0];
14055 		t4[1] = d3[1] * d6[1];
14056 		t4[2] = d3[2] * d6[2];
14057 		t4[3] = d3[3] * d6[3];
14058 
14059 		t4[0] -= d1[0] * d8[0];
14060 		t4[1] -= d1[1] * d8[1];
14061 		t4[2] -= d1[2] * d8[2];
14062 		t4[3] -= d1[3] * d8[3];
14063 
14064 		t5[0] = d3[0] * d7[0];
14065 		t5[1] = d3[1] * d7[1];
14066 		t5[2] = d3[2] * d7[2];
14067 		t5[3] = d3[3] * d7[3];
14068 
14069 		t5[0] -= d2[0] * d8[0];
14070 		t5[1] -= d2[1] * d8[1];
14071 		t5[2] -= d2[2] * d8[2];
14072 		t5[3] -= d2[3] * d8[3];
14073 
14074 		tmp[0] = t3[0] * t3[0];
14075 		tmp[1] = t3[1] * t3[1];
14076 		tmp[2] = t3[2] * t3[2];
14077 		tmp[3] = t3[3] * t3[3];
14078 
14079 		tmp[0] += t4[0] * t4[0];
14080 		tmp[1] += t4[1] * t4[1];
14081 		tmp[2] += t4[2] * t4[2];
14082 		tmp[3] += t4[3] * t4[3];
14083 
14084 		tmp[0] += t5[0] * t5[0];
14085 		tmp[1] += t5[1] * t5[1];
14086 		tmp[2] += t5[2] * t5[2];
14087 		tmp[3] += t5[3] * t5[3];
14088 
14089 		tmp[0] = idMath::RSqrt( tmp[0] );
14090 		tmp[1] = idMath::RSqrt( tmp[1] );
14091 		tmp[2] = idMath::RSqrt( tmp[2] );
14092 		tmp[3] = idMath::RSqrt( tmp[3] );
14093 
14094 		*(unsigned int *)&tmp[0] ^= signBit[0];
14095 		*(unsigned int *)&tmp[1] ^= signBit[1];
14096 		*(unsigned int *)&tmp[2] ^= signBit[2];
14097 		*(unsigned int *)&tmp[3] ^= signBit[3];
14098 
14099 		t3[0] *= tmp[0];
14100 		t3[1] *= tmp[1];
14101 		t3[2] *= tmp[2];
14102 		t3[3] *= tmp[3];
14103 
14104 		t4[0] *= tmp[0];
14105 		t4[1] *= tmp[1];
14106 		t4[2] *= tmp[2];
14107 		t4[3] *= tmp[3];
14108 
14109 		t5[0] *= tmp[0];
14110 		t5[1] *= tmp[1];
14111 		t5[2] *= tmp[2];
14112 		t5[3] *= tmp[3];
14113 
14114 #endif
14115 
14116 		for ( int j = 0; j < 4; j++ ) {
14117 
14118 			const int v0 = indexes[i + j * 3 + 0];
14119 			const int v1 = indexes[i + j * 3 + 1];
14120 			const int v2 = indexes[i + j * 3 + 2];
14121 
14122 			a = verts + v0;
14123 			b = verts + v1;
14124 			c = verts + v2;
14125 
14126 			planes->Normal()[0] = n0[j];
14127 			planes->Normal()[1] = n1[j];
14128 			planes->Normal()[2] = n2[j];
14129 			planes->FitThroughPoint( a->xyz );
14130 			planes++;
14131 
14132 			if ( used[v0] ) {
14133 				a->normal[0] += n0[j];
14134 				a->normal[1] += n1[j];
14135 				a->normal[2] += n2[j];
14136 
14137 				a->tangents[0][0] += t0[j];
14138 				a->tangents[0][1] += t1[j];
14139 				a->tangents[0][2] += t2[j];
14140 
14141 				a->tangents[1][0] += t3[j];
14142 				a->tangents[1][1] += t4[j];
14143 				a->tangents[1][2] += t5[j];
14144 			} else {
14145 				a->normal[0] = n0[j];
14146 				a->normal[1] = n1[j];
14147 				a->normal[2] = n2[j];
14148 
14149 				a->tangents[0][0] = t0[j];
14150 				a->tangents[0][1] = t1[j];
14151 				a->tangents[0][2] = t2[j];
14152 
14153 				a->tangents[1][0] = t3[j];
14154 				a->tangents[1][1] = t4[j];
14155 				a->tangents[1][2] = t5[j];
14156 
14157 				used[v0] = true;
14158 			}
14159 
14160 			if ( used[v1] ) {
14161 				b->normal[0] += n0[j];
14162 				b->normal[1] += n1[j];
14163 				b->normal[2] += n2[j];
14164 
14165 				b->tangents[0][0] += t0[j];
14166 				b->tangents[0][1] += t1[j];
14167 				b->tangents[0][2] += t2[j];
14168 
14169 				b->tangents[1][0] += t3[j];
14170 				b->tangents[1][1] += t4[j];
14171 				b->tangents[1][2] += t5[j];
14172 			} else {
14173 				b->normal[0] = n0[j];
14174 				b->normal[1] = n1[j];
14175 				b->normal[2] = n2[j];
14176 
14177 				b->tangents[0][0] = t0[j];
14178 				b->tangents[0][1] = t1[j];
14179 				b->tangents[0][2] = t2[j];
14180 
14181 				b->tangents[1][0] = t3[j];
14182 				b->tangents[1][1] = t4[j];
14183 				b->tangents[1][2] = t5[j];
14184 
14185 				used[v1] = true;
14186 			}
14187 
14188 			if ( used[v2] ) {
14189 				c->normal[0] += n0[j];
14190 				c->normal[1] += n1[j];
14191 				c->normal[2] += n2[j];
14192 
14193 				c->tangents[0][0] += t0[j];
14194 				c->tangents[0][1] += t1[j];
14195 				c->tangents[0][2] += t2[j];
14196 
14197 				c->tangents[1][0] += t3[j];
14198 				c->tangents[1][1] += t4[j];
14199 				c->tangents[1][2] += t5[j];
14200 			} else {
14201 				c->normal[0] = n0[j];
14202 				c->normal[1] = n1[j];
14203 				c->normal[2] = n2[j];
14204 
14205 				c->tangents[0][0] = t0[j];
14206 				c->tangents[0][1] = t1[j];
14207 				c->tangents[0][2] = t2[j];
14208 
14209 				c->tangents[1][0] = t3[j];
14210 				c->tangents[1][1] = t4[j];
14211 				c->tangents[1][2] = t5[j];
14212 
14213 				used[v2] = true;
14214 			}
14215 		}
14216 	}
14217 
14218 	for ( ; i < numIndexes; i += 3 ) {
14219 		idDrawVert *a, *b, *c;
14220 		ALIGN16( unsigned int signBit[4] );
14221 		float d0, d1, d2, d3, d4;
14222 		float d5, d6, d7, d8, d9;
14223 		float n0, n1, n2;
14224 		float t0, t1, t2;
14225 		float t3, t4, t5;
14226 
14227 		const int v0 = indexes[i + 0];
14228 		const int v1 = indexes[i + 1];
14229 		const int v2 = indexes[i + 2];
14230 
14231 		a = verts + v0;
14232 		b = verts + v1;
14233 		c = verts + v2;
14234 
14235 		d0 = b->xyz[0] - a->xyz[0];
14236 		d1 = b->xyz[1] - a->xyz[1];
14237 		d2 = b->xyz[2] - a->xyz[2];
14238 		d3 = b->st[0] - a->st[0];
14239 		d4 = b->st[1] - a->st[1];
14240 
14241 		d5 = c->xyz[0] - a->xyz[0];
14242 		d6 = c->xyz[1] - a->xyz[1];
14243 		d7 = c->xyz[2] - a->xyz[2];
14244 		d8 = c->st[0] - a->st[0];
14245 		d9 = c->st[1] - a->st[1];
14246 
14247 #if 1
14248 
14249 		__asm {
14250 			// normal
14251 			movss		xmm0, d6
14252 			mulss		xmm0, d2
14253 			movss		xmm1, d7
14254 			mulss		xmm1, d1
14255 			subss		xmm0, xmm1
14256 
14257 			movss		xmm1, d7
14258 			mulss		xmm1, d0
14259 			movss		xmm2, d5
14260 			mulss		xmm2, d2
14261 			subss		xmm1, xmm2
14262 
14263 			movss		xmm2, d5
14264 			mulss		xmm2, d1
14265 			movss		xmm3, d6
14266 			mulss		xmm3, d0
14267 			subss		xmm2, xmm3
14268 
14269 			movss		xmm3, xmm0
14270 			movss		xmm4, xmm1
14271 			movss		xmm5, xmm2
14272 
14273 			mulss		xmm3, xmm3
14274 			mulss		xmm4, xmm4
14275 			mulss		xmm5, xmm5
14276 
14277 			addss		xmm3, xmm4
14278 			addss		xmm3, xmm5
14279 
14280 #ifdef FIX_DEGENERATE_TANGENT
14281 			xorps		xmm4, xmm4
14282 			cmpeqps		xmm4, xmm3
14283 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
14284 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
14285 			orps		xmm3, xmm4
14286 #endif
14287 
14288 #ifdef REFINE_TANGENT_SQUAREROOT
14289 			rsqrtss		xmm4, xmm3
14290 			mulss		xmm3, xmm4
14291 			mulss		xmm3, xmm4
14292 			subss		xmm3, xmm6
14293 			mulss		xmm4, xmm7
14294 			mulss		xmm3, xmm4
14295 #else
14296 			rsqrtss		xmm3, xmm3
14297 #endif
14298 			mulss		xmm0, xmm3
14299 			movss		n0, xmm0
14300 			mulss		xmm1, xmm3
14301 			movss		n1, xmm1
14302 			mulss		xmm2, xmm3
14303 			movss		n2, xmm2
14304 
14305 			// area sign bit
14306 			movss		xmm0, d3
14307 			mulss		xmm0, d9
14308 			movss		xmm1, d4
14309 			mulss		xmm1, d8
14310 			subss		xmm0, xmm1
14311 			andps		xmm0, SIMD_SP_signBitMask
14312 			movaps		signBit, xmm0
14313 
14314 			// first tangent
14315 			movss		xmm0, d0
14316 			mulss		xmm0, d9
14317 			movss		xmm1, d4
14318 			mulss		xmm1, d5
14319 			subss		xmm0, xmm1
14320 
14321 			movss		xmm1, d1
14322 			mulss		xmm1, d9
14323 			movss		xmm2, d4
14324 			mulss		xmm2, d6
14325 			subss		xmm1, xmm2
14326 
14327 			movss		xmm2, d2
14328 			mulss		xmm2, d9
14329 			movss		xmm3, d4
14330 			mulss		xmm3, d7
14331 			subss		xmm2, xmm3
14332 
14333 			movss		xmm3, xmm0
14334 			movss		xmm4, xmm1
14335 			movss		xmm5, xmm2
14336 
14337 			mulss		xmm3, xmm3
14338 			mulss		xmm4, xmm4
14339 			mulss		xmm5, xmm5
14340 
14341 			addss		xmm3, xmm4
14342 			addss		xmm3, xmm5
14343 
14344 #ifdef FIX_DEGENERATE_TANGENT
14345 			xorps		xmm4, xmm4
14346 			cmpeqps		xmm4, xmm3
14347 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
14348 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
14349 			orps		xmm3, xmm4
14350 #endif
14351 
14352 #ifdef REFINE_TANGENT_SQUAREROOT
14353 			rsqrtss		xmm4, xmm3
14354 			mulss		xmm3, xmm4
14355 			mulss		xmm3, xmm4
14356 			subss		xmm3, xmm6
14357 			mulss		xmm4, xmm7
14358 			mulss		xmm3, xmm4
14359 #else
14360 			rsqrtss		xmm3, xmm3
14361 #endif
14362 			xorps		xmm3, signBit
14363 
14364 			mulss		xmm0, xmm3
14365 			movss		t0, xmm0
14366 			mulss		xmm1, xmm3
14367 			movss		t1, xmm1
14368 			mulss		xmm2, xmm3
14369 			movss		t2, xmm2
14370 
14371 			// second tangent
14372 			movss		xmm0, d3
14373 			mulss		xmm0, d5
14374 			movss		xmm1, d0
14375 			mulss		xmm1, d8
14376 			subss		xmm0, xmm1
14377 
14378 			movss		xmm1, d3
14379 			mulss		xmm1, d6
14380 			movss		xmm2, d1
14381 			mulss		xmm2, d8
14382 			subss		xmm1, xmm2
14383 
14384 			movss		xmm2, d3
14385 			mulss		xmm2, d7
14386 			movss		xmm3, d2
14387 			mulss		xmm3, d8
14388 			subss		xmm2, xmm3
14389 
14390 			movss		xmm3, xmm0
14391 			movss		xmm4, xmm1
14392 			movss		xmm5, xmm2
14393 
14394 			mulss		xmm3, xmm3
14395 			mulss		xmm4, xmm4
14396 			mulss		xmm5, xmm5
14397 
14398 			addss		xmm3, xmm4
14399 			addss		xmm3, xmm5
14400 
14401 #ifdef FIX_DEGENERATE_TANGENT
14402 			xorps		xmm4, xmm4
14403 			cmpeqps		xmm4, xmm3
14404 			andps		xmm4, SIMD_SP_tiny			// if values are zero replace them with a tiny number
14405 			andps		xmm3, SIMD_SP_absMask		// make sure the values are positive
14406 			orps		xmm3, xmm4
14407 #endif
14408 
14409 #ifdef REFINE_TANGENT_SQUAREROOT
14410 			rsqrtss		xmm4, xmm3
14411 			mulss		xmm3, xmm4
14412 			mulss		xmm3, xmm4
14413 			subss		xmm3, xmm6
14414 			mulss		xmm4, xmm7
14415 			mulss		xmm3, xmm4
14416 #else
14417 			rsqrtss		xmm3, xmm3
14418 #endif
14419 			xorps		xmm3, signBit
14420 
14421 			mulss		xmm0, xmm3
14422 			movss		t3, xmm0
14423 			mulss		xmm1, xmm3
14424 			movss		t4, xmm1
14425 			mulss		xmm2, xmm3
14426 			movss		t5, xmm2
14427 		}
14428 
14429 #else
14430 
14431 		float tmp;
14432 
14433 		// normal
14434 		n0 = d6 * d2 - d7 * d1;
14435 		n1 = d7 * d0 - d5 * d2;
14436 		n2 = d5 * d1 - d6 * d0;
14437 
14438 		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
14439 
14440 		n0 *= tmp;
14441 		n1 *= tmp;
14442 		n2 *= tmp;
14443 
14444 		// area sign bit
14445 		tmp = d3 * d9 - d4 * d8;
14446 		signBit[0] = ( *(unsigned int *)&tmp ) & ( 1 << 31 );
14447 
14448 		// first tangent
14449 		t0 = d0 * d9 - d4 * d5;
14450 		t1 = d1 * d9 - d4 * d6;
14451 		t2 = d2 * d9 - d4 * d7;
14452 
14453 		tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
14454 		*(unsigned int *)&tmp ^= signBit[0];
14455 
14456 		t0 *= tmp;
14457 		t1 *= tmp;
14458 		t2 *= tmp;
14459 
14460 		// second tangent
14461 		t3 = d3 * d5 - d0 * d8;
14462 		t4 = d3 * d6 - d1 * d8;
14463 		t5 = d3 * d7 - d2 * d8;
14464 
14465 		tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
14466 		*(unsigned int *)&tmp ^= signBit[0];
14467 
14468 		t3 *= tmp;
14469 		t4 *= tmp;
14470 		t5 *= tmp;
14471 
14472 #endif
14473 
14474 		planes->Normal()[0] = n0;
14475 		planes->Normal()[1] = n1;
14476 		planes->Normal()[2] = n2;
14477 		planes->FitThroughPoint( a->xyz );
14478 		planes++;
14479 
14480 		if ( used[v0] ) {
14481 			a->normal[0] += n0;
14482 			a->normal[1] += n1;
14483 			a->normal[2] += n2;
14484 
14485 			a->tangents[0][0] += t0;
14486 			a->tangents[0][1] += t1;
14487 			a->tangents[0][2] += t2;
14488 
14489 			a->tangents[1][0] += t3;
14490 			a->tangents[1][1] += t4;
14491 			a->tangents[1][2] += t5;
14492 		} else {
14493 			a->normal[0] = n0;
14494 			a->normal[1] = n1;
14495 			a->normal[2] = n2;
14496 
14497 			a->tangents[0][0] = t0;
14498 			a->tangents[0][1] = t1;
14499 			a->tangents[0][2] = t2;
14500 
14501 			a->tangents[1][0] = t3;
14502 			a->tangents[1][1] = t4;
14503 			a->tangents[1][2] = t5;
14504 
14505 			used[v0] = true;
14506 		}
14507 
14508 		if ( used[v1] ) {
14509 			b->normal[0] += n0;
14510 			b->normal[1] += n1;
14511 			b->normal[2] += n2;
14512 
14513 			b->tangents[0][0] += t0;
14514 			b->tangents[0][1] += t1;
14515 			b->tangents[0][2] += t2;
14516 
14517 			b->tangents[1][0] += t3;
14518 			b->tangents[1][1] += t4;
14519 			b->tangents[1][2] += t5;
14520 		} else {
14521 			b->normal[0] = n0;
14522 			b->normal[1] = n1;
14523 			b->normal[2] = n2;
14524 
14525 			b->tangents[0][0] = t0;
14526 			b->tangents[0][1] = t1;
14527 			b->tangents[0][2] = t2;
14528 
14529 			b->tangents[1][0] = t3;
14530 			b->tangents[1][1] = t4;
14531 			b->tangents[1][2] = t5;
14532 
14533 			used[v1] = true;
14534 		}
14535 
14536 		if ( used[v2] ) {
14537 			c->normal[0] += n0;
14538 			c->normal[1] += n1;
14539 			c->normal[2] += n2;
14540 
14541 			c->tangents[0][0] += t0;
14542 			c->tangents[0][1] += t1;
14543 			c->tangents[0][2] += t2;
14544 
14545 			c->tangents[1][0] += t3;
14546 			c->tangents[1][1] += t4;
14547 			c->tangents[1][2] += t5;
14548 		} else {
14549 			c->normal[0] = n0;
14550 			c->normal[1] = n1;
14551 			c->normal[2] = n2;
14552 
14553 			c->tangents[0][0] = t0;
14554 			c->tangents[0][1] = t1;
14555 			c->tangents[0][2] = t2;
14556 
14557 			c->tangents[1][0] = t3;
14558 			c->tangents[1][1] = t4;
14559 			c->tangents[1][2] = t5;
14560 
14561 			used[v2] = true;
14562 		}
14563 	}
14564 }
14565 
14566 /*
14567 ============
14568 idSIMD_SSE::DeriveUnsmoothedTangents
14569 ============
14570 */
14571 #define DERIVE_UNSMOOTHED_BITANGENT
14572 
DeriveUnsmoothedTangents(idDrawVert * verts,const dominantTri_s * dominantTris,const int numVerts)14573 void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
14574 	int i, j;
14575 
14576 	for ( i = 0; i <= numVerts - 4; i += 4 ) {
14577 		ALIGN16( float s0[4] );
14578 		ALIGN16( float s1[4] );
14579 		ALIGN16( float s2[4] );
14580 		ALIGN16( float d0[4] );
14581 		ALIGN16( float d1[4] );
14582 		ALIGN16( float d2[4] );
14583 		ALIGN16( float d3[4] );
14584 		ALIGN16( float d4[4] );
14585 		ALIGN16( float d5[4] );
14586 		ALIGN16( float d6[4] );
14587 		ALIGN16( float d7[4] );
14588 		ALIGN16( float d8[4] );
14589 		ALIGN16( float d9[4] );
14590 		ALIGN16( float n0[4] );
14591 		ALIGN16( float n1[4] );
14592 		ALIGN16( float n2[4] );
14593 		ALIGN16( float t0[4] );
14594 		ALIGN16( float t1[4] );
14595 		ALIGN16( float t2[4] );
14596 		ALIGN16( float t3[4] );
14597 		ALIGN16( float t4[4] );
14598 		ALIGN16( float t5[4] );
14599 
14600 		for ( j = 0; j < 4; j++ ) {
14601 			const idDrawVert *a, *b, *c;
14602 
14603 			const dominantTri_s &dt = dominantTris[i+j];
14604 
14605 			s0[j] = dt.normalizationScale[0];
14606 			s1[j] = dt.normalizationScale[1];
14607 			s2[j] = dt.normalizationScale[2];
14608 
14609 			a = verts + i + j;
14610 			b = verts + dt.v2;
14611 			c = verts + dt.v3;
14612 
14613 			d0[j] = b->xyz[0] - a->xyz[0];
14614 			d1[j] = b->xyz[1] - a->xyz[1];
14615 			d2[j] = b->xyz[2] - a->xyz[2];
14616 			d3[j] = b->st[0] - a->st[0];
14617 			d4[j] = b->st[1] - a->st[1];
14618 
14619 			d5[j] = c->xyz[0] - a->xyz[0];
14620 			d6[j] = c->xyz[1] - a->xyz[1];
14621 			d7[j] = c->xyz[2] - a->xyz[2];
14622 			d8[j] = c->st[0] - a->st[0];
14623 			d9[j] = c->st[1] - a->st[1];
14624 		}
14625 
14626 #if 1
14627 
14628 		__asm {
14629 
14630 			movaps		xmm0, d6
14631 			mulps		xmm0, d2
14632 			movaps		xmm1, d7
14633 			mulps		xmm1, d1
14634 
14635 			movaps		xmm2, d7
14636 			mulps		xmm2, d0
14637 			movaps		xmm3, d5
14638 			mulps		xmm3, d2
14639 
14640 			movaps		xmm4, d5
14641 			mulps		xmm4, d1
14642 			movaps		xmm5, d6
14643 			mulps		xmm5, d0
14644 
14645 			subps		xmm0, xmm1
14646 			subps		xmm2, xmm3
14647 			movaps		xmm7, s2
14648 			subps		xmm4, xmm5
14649 
14650 			mulps		xmm0, xmm7
14651 			movaps		n0, xmm0
14652 			mulps		xmm2, xmm7
14653 			movaps		n1, xmm2
14654 			mulps		xmm4, xmm7
14655 			movaps		n2, xmm4
14656 
14657 			movaps		xmm0, d0
14658 			mulps		xmm0, d9
14659 			movaps		xmm1, d4
14660 			mulps		xmm1, d5
14661 
14662 			movaps		xmm2, d1
14663 			mulps		xmm2, d9
14664 			movaps		xmm3, d4
14665 			mulps		xmm3, d6
14666 
14667 			movaps		xmm4, d2
14668 			mulps		xmm4, d9
14669 			movaps		xmm5, d4
14670 			mulps		xmm5, d7
14671 
14672 			subps		xmm0, xmm1
14673 			subps		xmm2, xmm3
14674 			movaps		xmm7, s0
14675 			subps		xmm4, xmm5
14676 
14677 			mulps		xmm0, xmm7
14678 			movaps		t0, xmm0
14679 			mulps		xmm2, xmm7
14680 			movaps		t1, xmm2
14681 			mulps		xmm4, xmm7
14682 			movaps		t2, xmm4
14683 
14684 #ifndef DERIVE_UNSMOOTHED_BITANGENT
14685 			movaps		xmm0, d3
14686 			mulps		xmm0, d5
14687 			movaps		xmm1, d0
14688 			mulps		xmm1, d8
14689 
14690 			movaps		xmm2, d3
14691 			mulps		xmm2, d6
14692 			movaps		xmm3, d1
14693 			mulps		xmm3, d8
14694 
14695 			movaps		xmm4, d3
14696 			mulps		xmm4, d7
14697 			movaps		xmm5, d2
14698 			mulps		xmm5, d8
14699 #else
14700 			movaps		xmm0, n2
14701 			mulps		xmm0, t1
14702 			movaps		xmm1, n1
14703 			mulps		xmm1, t2
14704 
14705 			movaps		xmm2, n0
14706 			mulps		xmm2, t2
14707 			movaps		xmm3, n2
14708 			mulps		xmm3, t0
14709 
14710 			movaps		xmm4, n1
14711 			mulps		xmm4, t0
14712 			movaps		xmm5, n0
14713 			mulps		xmm5, t1
14714 #endif
14715 			subps		xmm0, xmm1
14716 			subps		xmm2, xmm3
14717 			movaps		xmm7, s1
14718 			subps		xmm4, xmm5
14719 
14720 			mulps		xmm0, xmm7
14721 			movaps		t3, xmm0
14722 			mulps		xmm2, xmm7
14723 			movaps		t4, xmm2
14724 			mulps		xmm4, xmm7
14725 			movaps		t5, xmm4
14726 		}
14727 
14728 #else
14729 
14730 		n0[0] = d6[0] * d2[0];
14731 		n0[1] = d6[1] * d2[1];
14732 		n0[2] = d6[2] * d2[2];
14733 		n0[3] = d6[3] * d2[3];
14734 
14735 		n1[0] = d7[0] * d0[0];
14736 		n1[1] = d7[1] * d0[1];
14737 		n1[2] = d7[2] * d0[2];
14738 		n1[3] = d7[3] * d0[3];
14739 
14740 		n2[0] = d5[0] * d1[0];
14741 		n2[1] = d5[1] * d1[1];
14742 		n2[2] = d5[2] * d1[2];
14743 		n2[3] = d5[3] * d1[3];
14744 
14745 		n0[0] -= d7[0] * d1[0];
14746 		n0[1] -= d7[1] * d1[1];
14747 		n0[2] -= d7[2] * d1[2];
14748 		n0[3] -= d7[3] * d1[3];
14749 
14750 		n1[0] -= d5[0] * d2[0];
14751 		n1[1] -= d5[1] * d2[1];
14752 		n1[2] -= d5[2] * d2[2];
14753 		n1[3] -= d5[3] * d2[3];
14754 
14755 		n2[0] -= d6[0] * d0[0];
14756 		n2[1] -= d6[1] * d0[1];
14757 		n2[2] -= d6[2] * d0[2];
14758 		n2[3] -= d6[3] * d0[3];
14759 
14760 		n0[0] *= s2[0];
14761 		n0[1] *= s2[1];
14762 		n0[2] *= s2[2];
14763 		n0[3] *= s2[3];
14764 
14765 		n1[0] *= s2[0];
14766 		n1[1] *= s2[1];
14767 		n1[2] *= s2[2];
14768 		n1[3] *= s2[3];
14769 
14770 		n2[0] *= s2[0];
14771 		n2[1] *= s2[1];
14772 		n2[2] *= s2[2];
14773 		n2[3] *= s2[3];
14774 
14775 		t0[0] = d0[0] * d9[0];
14776 		t0[1] = d0[1] * d9[1];
14777 		t0[2] = d0[2] * d9[2];
14778 		t0[3] = d0[3] * d9[3];
14779 
14780 		t1[0] = d1[0] * d9[0];
14781 		t1[1] = d1[1] * d9[1];
14782 		t1[2] = d1[2] * d9[2];
14783 		t1[3] = d1[3] * d9[3];
14784 
14785 		t2[0] = d2[0] * d9[0];
14786 		t2[1] = d2[1] * d9[1];
14787 		t2[2] = d2[2] * d9[2];
14788 		t2[3] = d2[3] * d9[3];
14789 
14790 		t0[0] -= d4[0] * d5[0];
14791 		t0[1] -= d4[1] * d5[1];
14792 		t0[2] -= d4[2] * d5[2];
14793 		t0[3] -= d4[3] * d5[3];
14794 
14795 		t1[0] -= d4[0] * d6[0];
14796 		t1[1] -= d4[1] * d6[1];
14797 		t1[2] -= d4[2] * d6[2];
14798 		t1[3] -= d4[3] * d6[3];
14799 
14800 		t2[0] -= d4[0] * d7[0];
14801 		t2[1] -= d4[1] * d7[1];
14802 		t2[2] -= d4[2] * d7[2];
14803 		t2[3] -= d4[3] * d7[3];
14804 
14805 		t0[0] *= s0[0];
14806 		t0[1] *= s0[1];
14807 		t0[2] *= s0[2];
14808 		t0[3] *= s0[3];
14809 
14810 		t1[0] *= s0[0];
14811 		t1[1] *= s0[1];
14812 		t1[2] *= s0[2];
14813 		t1[3] *= s0[3];
14814 
14815 		t2[0] *= s0[0];
14816 		t2[1] *= s0[1];
14817 		t2[2] *= s0[2];
14818 		t2[3] *= s0[3];
14819 
14820 #ifndef DERIVE_UNSMOOTHED_BITANGENT
14821 		t3[0] = d3[0] * d5[0];
14822 		t3[1] = d3[1] * d5[1];
14823 		t3[2] = d3[2] * d5[2];
14824 		t3[3] = d3[3] * d5[3];
14825 
14826 		t4[0] = d3[0] * d6[0];
14827 		t4[1] = d3[1] * d6[1];
14828 		t4[2] = d3[2] * d6[2];
14829 		t4[3] = d3[3] * d6[3];
14830 
14831 		t5[0] = d3[0] * d7[0];
14832 		t5[1] = d3[1] * d7[1];
14833 		t5[2] = d3[2] * d7[2];
14834 		t5[3] = d3[3] * d7[3];
14835 
14836 		t3[0] -= d0[0] * d8[0];
14837 		t3[1] -= d0[1] * d8[1];
14838 		t3[2] -= d0[2] * d8[2];
14839 		t3[3] -= d0[3] * d8[3];
14840 
14841 		t4[0] -= d1[0] * d8[0];
14842 		t4[1] -= d1[1] * d8[1];
14843 		t4[2] -= d1[2] * d8[2];
14844 		t4[3] -= d1[3] * d8[3];
14845 
14846 		t5[0] -= d2[0] * d8[0];
14847 		t5[1] -= d2[1] * d8[1];
14848 		t5[2] -= d2[2] * d8[2];
14849 		t5[3] -= d2[3] * d8[3];
14850 #else
14851 		t3[0] = n2[0] * t1[0];
14852 		t3[1] = n2[1] * t1[1];
14853 		t3[2] = n2[2] * t1[2];
14854 		t3[3] = n2[3] * t1[3];
14855 
14856 		t4[0] = n0[0] * t2[0];
14857 		t4[1] = n0[1] * t2[1];
14858 		t4[2] = n0[2] * t2[2];
14859 		t4[3] = n0[3] * t2[3];
14860 
14861 		t5[0] = n1[0] * t0[0];
14862 		t5[1] = n1[1] * t0[1];
14863 		t5[2] = n1[2] * t0[2];
14864 		t5[3] = n1[3] * t0[3];
14865 
14866 		t3[0] -= n1[0] * t2[0];
14867 		t3[1] -= n1[1] * t2[1];
14868 		t3[2] -= n1[2] * t2[2];
14869 		t3[3] -= n1[3] * t2[3];
14870 
14871 		t4[0] -= n2[0] * t0[0];
14872 		t4[1] -= n2[1] * t0[1];
14873 		t4[2] -= n2[2] * t0[2];
14874 		t4[3] -= n2[3] * t0[3];
14875 
14876 		t5[0] -= n0[0] * t1[0];
14877 		t5[1] -= n0[1] * t1[1];
14878 		t5[2] -= n0[2] * t1[2];
14879 		t5[3] -= n0[3] * t1[3];
14880 #endif
14881 		t3[0] *= s1[0];
14882 		t3[1] *= s1[1];
14883 		t3[2] *= s1[2];
14884 		t3[3] *= s1[3];
14885 
14886 		t4[0] *= s1[0];
14887 		t4[1] *= s1[1];
14888 		t4[2] *= s1[2];
14889 		t4[3] *= s1[3];
14890 
14891 		t5[0] *= s1[0];
14892 		t5[1] *= s1[1];
14893 		t5[2] *= s1[2];
14894 		t5[3] *= s1[3];
14895 
14896 #endif
14897 
14898 		for ( j = 0; j < 4; j++ ) {
14899 			idDrawVert *a;
14900 
14901 			a = verts + i + j;
14902 
14903 			a->normal[0] = n0[j];
14904 			a->normal[1] = n1[j];
14905 			a->normal[2] = n2[j];
14906 
14907 			a->tangents[0][0] = t0[j];
14908 			a->tangents[0][1] = t1[j];
14909 			a->tangents[0][2] = t2[j];
14910 
14911 			a->tangents[1][0] = t3[j];
14912 			a->tangents[1][1] = t4[j];
14913 			a->tangents[1][2] = t5[j];
14914 		}
14915 	}
14916 
14917 	for ( ; i < numVerts; i++ ) {
14918 		idDrawVert *a, *b, *c;
14919 		float d0, d1, d2, d3, d4;
14920 		float d5, d6, d7, d8, d9;
14921 		float s0, s1, s2;
14922 		float n0, n1, n2;
14923 		float t0, t1, t2;
14924 		float t3, t4, t5;
14925 
14926 		const dominantTri_s &dt = dominantTris[i];
14927 
14928 		s0 = dt.normalizationScale[0];
14929 		s1 = dt.normalizationScale[1];
14930 		s2 = dt.normalizationScale[2];
14931 
14932 		a = verts + i;
14933 		b = verts + dt.v2;
14934 		c = verts + dt.v3;
14935 
14936 		d0 = b->xyz[0] - a->xyz[0];
14937 		d1 = b->xyz[1] - a->xyz[1];
14938 		d2 = b->xyz[2] - a->xyz[2];
14939 		d3 = b->st[0] - a->st[0];
14940 		d4 = b->st[1] - a->st[1];
14941 
14942 		d5 = c->xyz[0] - a->xyz[0];
14943 		d6 = c->xyz[1] - a->xyz[1];
14944 		d7 = c->xyz[2] - a->xyz[2];
14945 		d8 = c->st[0] - a->st[0];
14946 		d9 = c->st[1] - a->st[1];
14947 
14948 #if 1
14949 
14950 		__asm {
14951 
14952 			movss		xmm0, d6
14953 			mulss		xmm0, d2
14954 			movss		xmm1, d7
14955 			mulss		xmm1, d1
14956 
14957 			movss		xmm2, d7
14958 			mulss		xmm2, d0
14959 			movss		xmm3, d5
14960 			mulss		xmm3, d2
14961 
14962 			movss		xmm4, d5
14963 			mulss		xmm4, d1
14964 			movss		xmm5, d6
14965 			mulss		xmm5, d0
14966 
14967 			subss		xmm0, xmm1
14968 			subss		xmm2, xmm3
14969 			movss		xmm7, s2
14970 			subss		xmm4, xmm5
14971 
14972 			mulss		xmm0, xmm7
14973 			movss		n0, xmm0
14974 			mulss		xmm2, xmm7
14975 			movss		n1, xmm2
14976 			mulss		xmm4, xmm7
14977 			movss		n2, xmm4
14978 
14979 			movss		xmm0, d0
14980 			mulss		xmm0, d9
14981 			movss		xmm1, d4
14982 			mulss		xmm1, d5
14983 
14984 			movss		xmm2, d1
14985 			mulss		xmm2, d9
14986 			movss		xmm3, d4
14987 			mulss		xmm3, d6
14988 
14989 			movss		xmm4, d2
14990 			mulss		xmm4, d9
14991 			movss		xmm5, d4
14992 			mulss		xmm5, d7
14993 
14994 			subss		xmm0, xmm1
14995 			subss		xmm2, xmm3
14996 			movss		xmm7, s0
14997 			subss		xmm4, xmm5
14998 
14999 			mulss		xmm0, xmm7
15000 			movss		t0, xmm0
15001 			mulss		xmm2, xmm7
15002 			movss		t1, xmm2
15003 			mulss		xmm4, xmm7
15004 			movss		t2, xmm4
15005 
15006 #ifndef DERIVE_UNSMOOTHED_BITANGENT
15007 			movss		xmm0, d3
15008 			mulss		xmm0, d5
15009 			movss		xmm1, d0
15010 			mulss		xmm1, d8
15011 
15012 			movss		xmm2, d3
15013 			mulss		xmm2, d6
15014 			movss		xmm3, d1
15015 			mulss		xmm3, d8
15016 
15017 			movss		xmm4, d3
15018 			mulss		xmm4, d7
15019 			movss		xmm5, d2
15020 			mulss		xmm5, d8
15021 #else
15022 			movss		xmm0, n2
15023 			mulss		xmm0, t1
15024 			movss		xmm1, n1
15025 			mulss		xmm1, t2
15026 
15027 			movss		xmm2, n0
15028 			mulss		xmm2, t2
15029 			movss		xmm3, n2
15030 			mulss		xmm3, t0
15031 
15032 			movss		xmm4, n1
15033 			mulss		xmm4, t0
15034 			movss		xmm5, n0
15035 			mulss		xmm5, t1
15036 #endif
15037 			subss		xmm0, xmm1
15038 			subss		xmm2, xmm3
15039 			movss		xmm7, s1
15040 			subss		xmm4, xmm5
15041 
15042 			mulss		xmm0, xmm7
15043 			movss		t3, xmm0
15044 			mulss		xmm2, xmm7
15045 			movss		t4, xmm2
15046 			mulss		xmm4, xmm7
15047 			movss		t5, xmm4
15048 		}
15049 
15050 #else
15051 
15052 		n0 = s2 * ( d6 * d2 - d7 * d1 );
15053 		n1 = s2 * ( d7 * d0 - d5 * d2 );
15054 		n2 = s2 * ( d5 * d1 - d6 * d0 );
15055 
15056 		t0 = s0 * ( d0 * d9 - d4 * d5 );
15057 		t1 = s0 * ( d1 * d9 - d4 * d6 );
15058 		t2 = s0 * ( d2 * d9 - d4 * d7 );
15059 
15060 #ifndef DERIVE_UNSMOOTHED_BITANGENT
15061 		t3 = s1 * ( d3 * d5 - d0 * d8 );
15062 		t4 = s1 * ( d3 * d6 - d1 * d8 );
15063 		t5 = s1 * ( d3 * d7 - d2 * d8 );
15064 #else
15065 		t3 = s1 * ( n2 * t1 - n1 * t2 );
15066 		t4 = s1 * ( n0 * t2 - n2 * t0 );
15067 		t5 = s1 * ( n1 * t0 - n0 * t1 );
15068 #endif
15069 
15070 #endif
15071 
15072 		a->normal[0] = n0;
15073 		a->normal[1] = n1;
15074 		a->normal[2] = n2;
15075 
15076 		a->tangents[0][0] = t0;
15077 		a->tangents[0][1] = t1;
15078 		a->tangents[0][2] = t2;
15079 
15080 		a->tangents[1][0] = t3;
15081 		a->tangents[1][1] = t4;
15082 		a->tangents[1][2] = t5;
15083 	}
15084 }
15085 
15086 /*
15087 ============
15088 idSIMD_SSE::NormalizeTangents
15089 ============
15090 */
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
	// Renormalizes each vertex normal, then projects both tangents onto the
	// plane orthogonal to that normal (Gram-Schmidt: t -= (t.n)*n) and
	// renormalizes them, all in place.  Vertices are processed four at a time
	// in transposed (SoA) register form; a scalar loop handles the last 0-3.
	ALIGN16( float normal[12] );	// transposed normals of the current 4 verts: x[0-3], y[0-3], z[0-3]

	// the assembly below hard-codes the idDrawVert memory layout
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );

	assert( verts != NULL );
	assert( numVerts >= 0 );

	__asm {
		mov			eax, numVerts
		test		eax, eax
		jz			done
#ifdef REFINE_TANGENT_SQUAREROOT
		// constants for the refinement of rsqrt (see SIMD_SP_rsqrt_c0/c1),
		// kept resident in xmm6/xmm7 for the whole routine
		movaps		xmm6, SIMD_SP_rsqrt_c0
		movaps		xmm7, SIMD_SP_rsqrt_c1
#endif
		// esi = one past the end of the vertex array, eax = negative byte
		// offset; vertices are addressed as [esi+eax+...] so adding to eax
		// walks forward through the array
		mov			esi, verts
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax
		add			eax, DRAWVERT_SIZE*4
		jle			loopVert4			// at least 4 vertices: take the SIMD loop
		sub			eax, DRAWVERT_SIZE*4
		jl			loopVert1			// fewer than 4 in total: scalar loop only

	loopVert4:

		sub			eax, DRAWVERT_SIZE*4

		// normalize 4 idDrawVert::normal

		// gather the 4 normals and transpose them into SoA form:
		// xmm0 = all x, xmm1 = all y, xmm2 = all z (lane numbers in comments)
		movss		xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0]	//  0,  X,  X,  X
		movhps		xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0]	//  0,  X,  3,  4
		movss		xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8]	//  5,  X,  X,  X
		movhps		xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4]	//	5,  X,  1,  2
		movss		xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0]	//  6,  X,  X,  X
		movhps		xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0]	//  6,  X,  9, 10
		movss		xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8]	// 11,  X,  X,  X
		movhps		xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4]	// 11,  X,  7,  8

		movaps		xmm1, xmm0
		movaps		xmm5, xmm2
		shufps		xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )		//  0,  3,  6,  9
		shufps		xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )		//  2,  5,  8, 11
		shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )		//  4,  4,  1,  1
		shufps		xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )		// 10, 10,  7,  7
		shufps		xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )		//  1,  4,  7, 10

		// xmm3 = squared length of the 4 normals
		movaps		xmm3, xmm0
		movaps		xmm4, xmm1
		movaps		xmm5, xmm2

		mulps		xmm3, xmm3
		mulps		xmm4, xmm4
		mulps		xmm5, xmm5
		addps		xmm3, xmm4
		addps		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		// rsqrtps alone is a low-precision estimate; apply one
		// Newton-Raphson style refinement step using xmm6/xmm7
		rsqrtps		xmm4, xmm3
		mulps		xmm3, xmm4
		mulps		xmm3, xmm4
		subps		xmm3, xmm6
		mulps		xmm4, xmm7
		mulps		xmm3, xmm4
#else
		rsqrtps		xmm3, xmm3
#endif

		// multiply by 1/length to normalize
		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		// save the 4 idDrawVert::normal to project the tangents

		movaps		[normal+ 0], xmm0
		movaps		[normal+16], xmm1
		movaps		[normal+32], xmm2

		// scatter the normalized components back to the 4 vertices; each
		// shufps rotates the next vertex's component into the low lane
		// for the following movss store
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2

		// project and normalize 4 idDrawVert::tangent[0]

		// same gather/transpose as above, now for the first tangent
		movss		xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0]	//  0,  X,  X,  X
		movhps		xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0]	//  0,  X,  3,  4
		movss		xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8]	//  5,  X,  X,  X
		movhps		xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4]	//	5,  X,  1,  2
		movss		xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0]	//  6,  X,  X,  X
		movhps		xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0]	//  6,  X,  9, 10
		movss		xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8]	// 11,  X,  X,  X
		movhps		xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4]	// 11,  X,  7,  8

		movaps		xmm1, xmm0
		movaps		xmm5, xmm2
		shufps		xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )		//  0,  3,  6,  9
		shufps		xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )		//  2,  5,  8, 11
		shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )		//  4,  4,  1,  1
		shufps		xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )		// 10, 10,  7,  7
		shufps		xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )		//  1,  4,  7, 10

		// xmm3 = dot( tangent, normal ) for the 4 vertices
		movaps		xmm3, xmm0
		movaps		xmm4, xmm1
		movaps		xmm5, xmm2

		mulps		xmm3, [normal+ 0]
		mulps		xmm4, [normal+16]
		mulps		xmm5, [normal+32]
		addps		xmm3, xmm4
		addps		xmm3, xmm5

		// tangent -= dot * normal (make the tangent orthogonal to the normal)
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3
		mulps		xmm3, [normal+ 0]
		mulps		xmm4, [normal+16]
		mulps		xmm5, [normal+32]
		subps		xmm0, xmm3
		subps		xmm1, xmm4
		subps		xmm2, xmm5

		// xmm3 = squared length of the projected tangents
		movaps		xmm3, xmm0
		movaps		xmm4, xmm1
		movaps		xmm5, xmm2

		mulps		xmm3, xmm3
		mulps		xmm4, xmm4
		mulps		xmm5, xmm5
		addps		xmm3, xmm4
		addps		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		// refined reciprocal square root (see the normal pass above)
		rsqrtps		xmm4, xmm3
		mulps		xmm3, xmm4
		mulps		xmm3, xmm4
		subps		xmm3, xmm6
		mulps		xmm4, xmm7
		mulps		xmm3, xmm4
#else
		rsqrtps		xmm3, xmm3
#endif

		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		// scatter tangent[0] back, rotating lanes between stores as before
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		// project and normalize 4 idDrawVert::tangent[1]

		// identical pipeline, now for the second tangent
		movss		xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0]	//  0,  X,  X,  X
		movhps		xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0]	//  0,  X,  3,  4
		movss		xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8]	//  5,  X,  X,  X
		movhps		xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4]	//	5,  X,  1,  2
		movss		xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0]	//  6,  X,  X,  X
		movhps		xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0]	//  6,  X,  9, 10
		movss		xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8]	// 11,  X,  X,  X
		movhps		xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4]	// 11,  X,  7,  8

		movaps		xmm1, xmm0
		movaps		xmm5, xmm2
		shufps		xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )		//  0,  3,  6,  9
		shufps		xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )		//  2,  5,  8, 11
		shufps		xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 )		//  4,  4,  1,  1
		shufps		xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 )		// 10, 10,  7,  7
		shufps		xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 )		//  1,  4,  7, 10

		// xmm3 = dot( tangent, normal )
		movaps		xmm3, xmm0
		movaps		xmm4, xmm1
		movaps		xmm5, xmm2

		mulps		xmm3, [normal+ 0]
		mulps		xmm4, [normal+16]
		mulps		xmm5, [normal+32]
		addps		xmm3, xmm4
		addps		xmm3, xmm5

		// tangent -= dot * normal
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3
		mulps		xmm3, [normal+ 0]
		mulps		xmm4, [normal+16]
		mulps		xmm5, [normal+32]
		subps		xmm0, xmm3
		subps		xmm1, xmm4
		subps		xmm2, xmm5

		// xmm3 = squared length
		movaps		xmm3, xmm0
		movaps		xmm4, xmm1
		movaps		xmm5, xmm2

		mulps		xmm3, xmm3
		mulps		xmm4, xmm4
		mulps		xmm5, xmm5
		addps		xmm3, xmm4
		addps		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		// refined reciprocal square root (see the normal pass above)
		rsqrtps		xmm4, xmm3
		mulps		xmm3, xmm4
		mulps		xmm3, xmm4
		subps		xmm3, xmm6
		mulps		xmm4, xmm7
		mulps		xmm3, xmm4
#else
		rsqrtps		xmm3, xmm3
#endif

		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		// advance 4 vertices (eax was pre-decremented by 4 at loop entry)
		add			eax, DRAWVERT_SIZE*8

		jle			loopVert4

		// fewer than 4 vertices remain; fall into the scalar loop unless done
		sub			eax, DRAWVERT_SIZE*4
		jge			done

	loopVert1:

		// normalize one idDrawVert::normal

		movss		xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		movss		xmm3, xmm0
		movss		xmm4, xmm1
		movss		xmm5, xmm2

		// xmm3 = squared length of the normal
		mulss		xmm3, xmm3
		mulss		xmm4, xmm4
		mulss		xmm5, xmm5
		addss		xmm3, xmm4
		addss		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		// refined reciprocal square root (see the SIMD loop above)
		rsqrtss		xmm4, xmm3
		mulss		xmm3, xmm4
		mulss		xmm3, xmm4
		subss		xmm3, xmm6
		mulss		xmm4, xmm7
		mulss		xmm3, xmm4
#else
		rsqrtss		xmm3, xmm3
#endif

		mulss		xmm0, xmm3
		mulss		xmm1, xmm3
		mulss		xmm2, xmm3

		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2

		// project and normalize one idDrawVert::tangent[0]

		movss		xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
		movss		xmm3, xmm0
		movss		xmm4, xmm1
		movss		xmm5, xmm2

		// xmm3 = dot( tangent, normal )
		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss		xmm3, xmm4
		addss		xmm3, xmm5

		// tangent -= dot * normal
		movss		xmm4, xmm3
		movss		xmm5, xmm3
		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss		xmm0, xmm3
		subss		xmm1, xmm4
		subss		xmm2, xmm5

		// xmm3 = squared length of the projected tangent
		movss		xmm3, xmm0
		movss		xmm4, xmm1
		movss		xmm5, xmm2

		mulss		xmm3, xmm3
		mulss		xmm4, xmm4
		mulss		xmm5, xmm5
		addss		xmm3, xmm4
		addss		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss		xmm4, xmm3
		mulss		xmm3, xmm4
		mulss		xmm3, xmm4
		subss		xmm3, xmm6
		mulss		xmm4, xmm7
		mulss		xmm3, xmm4
#else
		rsqrtss		xmm3, xmm3
#endif

		mulss		xmm0, xmm3
		mulss		xmm1, xmm3
		mulss		xmm2, xmm3

		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2

		// project and normalize one idDrawVert::tangent[1]

		movss		xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
		movss		xmm3, xmm0
		movss		xmm4, xmm1
		movss		xmm5, xmm2

		// xmm3 = dot( tangent, normal )
		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		addss		xmm3, xmm4
		addss		xmm3, xmm5

		// tangent -= dot * normal
		movss		xmm4, xmm3
		movss		xmm5, xmm3
		mulss		xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
		mulss		xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
		mulss		xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
		subss		xmm0, xmm3
		subss		xmm1, xmm4
		subss		xmm2, xmm5

		// xmm3 = squared length
		movss		xmm3, xmm0
		movss		xmm4, xmm1
		movss		xmm5, xmm2

		mulss		xmm3, xmm3
		mulss		xmm4, xmm4
		mulss		xmm5, xmm5
		addss		xmm3, xmm4
		addss		xmm3, xmm5

#ifdef REFINE_TANGENT_SQUAREROOT
		rsqrtss		xmm4, xmm3
		mulss		xmm3, xmm4
		mulss		xmm3, xmm4
		subss		xmm3, xmm6
		mulss		xmm4, xmm7
		mulss		xmm3, xmm4
#else
		rsqrtss		xmm3, xmm3
#endif

		mulss		xmm0, xmm3
		mulss		xmm1, xmm3
		mulss		xmm2, xmm3

		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
		movss		[esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2

		// next vertex; the negative offset reaches zero at the array end
		add			eax, DRAWVERT_SIZE

		jl			loopVert1
	done:
	}
}
15539 
15540 /*
15541 ============
15542 idSIMD_SSE::CreateTextureSpaceLightVectors
15543 ============
15544 */
void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
15546 
15547 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
15548 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
15549 	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
15550 	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15551 	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15552 
15553 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15554 	memset( used, 0, numVerts * sizeof( used[0] ) );
15555 
15556 	for ( int i = numIndexes - 1; i >= 0; i-- ) {
15557 		used[indexes[i]] = true;
15558 	}
15559 
15560 #if 0
15561 
15562 	__asm {
15563 
15564 		mov			eax, numVerts
15565 
15566 		mov			esi, used
15567 		add			esi, eax
15568 
15569 		mov			edi, verts
15570 		sub			edi, DRAWVERT_SIZE
15571 
15572 		neg			eax
15573 		dec			eax
15574 
15575 		mov			ecx, lightOrigin
15576 		movss		xmm7, [ecx+0]
15577 		movhps		xmm7, [ecx+4]
15578 
15579 		mov			ecx, lightVectors
15580 		sub			ecx, 3*4
15581 
15582 	loopVert:
15583 		inc			eax
15584 		jge			done
15585 
15586 		add			edi, DRAWVERT_SIZE
15587 		add			ecx, 3*4
15588 
15589 		cmp			byte ptr [esi+eax], 0
15590 		je			loopVert
15591 
15592 		movaps		xmm0, xmm7
15593 		movss		xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
15594 		movhps		xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15595 		subps		xmm0, xmm1
15596 
15597 		// 0,  X,  1,  2
15598 		// 3,  X,  4,  5
15599 		// 6,  X,  7,  8
15600 
15601 		movss		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15602 		movhps		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15603 		mulps		xmm2, xmm0
15604 
15605 		movss		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15606 		movhps		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15607 		mulps		xmm3, xmm0
15608 
15609 		movaps		xmm5, xmm2								// xmm5 = 0,  X,  1,  2
15610 		unpcklps	xmm5, xmm3								// xmm5 = 0,  3,  X,  X
15611 		unpckhps	xmm2, xmm3								// xmm2 = 1,  4,  2,  5
15612 
15613 		movss		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
15614 		movhps		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15615 		mulps		xmm4, xmm0
15616 
15617 		movlhps		xmm5, xmm4								// xmm5 = 0,  3,  6,  X
15618 		movhlps		xmm4, xmm2								// xmm4 = 2,  5,  7,  8
15619 		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm2 = 2,  5,  8,  7
15620 
15621 		addps		xmm5, xmm4
15622 		addps		xmm5, xmm2
15623 		movlps		[ecx+0], xmm5
15624 		shufps		xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
15625 		movss		[ecx+8], xmm5
15626 
15627 		jmp			loopVert
15628 
15629 	done:
15630 	}
15631 
15632 #elif 1
15633 
15634 	for ( int i = 0; i < numVerts; i++ ) {
15635 		if ( !used[i] ) {
15636 			continue;
15637 		}
15638 
15639 		const idDrawVert *v = &verts[i];
15640 		idVec3 lightDir;
15641 
15642 		lightDir[0] = lightOrigin[0] - v->xyz[0];
15643 		lightDir[1] = lightOrigin[1] - v->xyz[1];
15644 		lightDir[2] = lightOrigin[2] - v->xyz[2];
15645 
15646 		lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
15647 		lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
15648 		lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
15649 	}
15650 
15651 #elif 1
15652 
15653 	ALIGN16( int usedVertNums[4] );
15654 	ALIGN16( float lightDir0[4] );
15655 	ALIGN16( float lightDir1[4] );
15656 	ALIGN16( float lightDir2[4] );
15657 	ALIGN16( float normal0[4] );
15658 	ALIGN16( float normal1[4] );
15659 	ALIGN16( float normal2[4] );
15660 	ALIGN16( float tangent0[4] );
15661 	ALIGN16( float tangent1[4] );
15662 	ALIGN16( float tangent2[4] );
15663 	ALIGN16( float tangent3[4] );
15664 	ALIGN16( float tangent4[4] );
15665 	ALIGN16( float tangent5[4] );
15666 	idVec3 localLightOrigin = lightOrigin;
15667 
15668 	__asm {
15669 
15670 		xor			ecx, ecx
15671 		mov			eax, numVerts
15672 
15673 		mov			esi, used
15674 		add			esi, eax
15675 
15676 		mov			edi, verts
15677 		sub			edi, DRAWVERT_SIZE
15678 
15679 		neg			eax
15680 		dec			eax
15681 
15682 	loopVert4:
15683 		inc			eax
15684 		jge			done4
15685 
15686 		add			edi, DRAWVERT_SIZE
15687 
15688 		cmp			byte ptr [esi+eax], 0
15689 		je			loopVert4
15690 
15691 		mov			usedVertNums[ecx*4], eax
15692 
15693 		inc			ecx
15694 		cmp			ecx, 4
15695 
15696 		movss		xmm0, localLightOrigin[0]
15697 		movss		xmm1, localLightOrigin[4]
15698 		movss		xmm2, localLightOrigin[8]
15699 
15700 		subss		xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
15701 		subss		xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
15702 		subss		xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
15703 
15704 		movss		lightDir0[ecx*4-4], xmm0
15705 		movss		lightDir1[ecx*4-4], xmm1
15706 		movss		lightDir2[ecx*4-4], xmm2
15707 
15708 		movss		xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
15709 		movss		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
15710 		movss		xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
15711 
15712 		movss		normal0[ecx*4-4], xmm3
15713 		movss		normal1[ecx*4-4], xmm4
15714 		movss		normal2[ecx*4-4], xmm5
15715 
15716 		movss		xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
15717 		movss		xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
15718 		movss		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
15719 
15720 		movss		tangent0[ecx*4-4], xmm0
15721 		movss		tangent1[ecx*4-4], xmm1
15722 		movss		tangent2[ecx*4-4], xmm2
15723 
15724 		movss		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
15725 		movss		xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
15726 		movss		xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
15727 
15728 		movss		tangent3[ecx*4-4], xmm3
15729 		movss		tangent4[ecx*4-4], xmm4
15730 		movss		tangent5[ecx*4-4], xmm5
15731 
15732 		jl			loopVert4
15733 
15734 		movaps		xmm0, lightDir0
15735 		movaps		xmm1, lightDir1
15736 		movaps		xmm2, lightDir2
15737 
15738 		movaps		xmm3, tangent0
15739 		mulps		xmm3, xmm0
15740 		movaps		xmm4, tangent1
15741 		mulps		xmm4, xmm1
15742 		movaps		xmm5, tangent2
15743 		mulps		xmm5, xmm2
15744 
15745 		addps		xmm3, xmm4
15746 		addps		xmm5, xmm3
15747 
15748 		movaps		xmm3, tangent3
15749 		mulps		xmm3, xmm0
15750 		movaps		xmm4, tangent4
15751 		mulps		xmm4, xmm1
15752 		movaps		xmm6, tangent5
15753 		mulps		xmm6, xmm2
15754 
15755 		addps		xmm3, xmm4
15756 		addps		xmm6, xmm3
15757 
15758 		mulps		xmm0, normal0
15759 		mulps		xmm1, normal1
15760 		mulps		xmm2, normal2
15761 
15762 		addps		xmm0, xmm1
15763 		addps		xmm0, xmm2
15764 
15765 		mov			ecx, numVerts
15766 		imul		ecx, 12
15767 		mov			edx, usedVertNums[0]
15768 		add			ecx, lightVectors
15769 		imul		edx, 12
15770 
15771 		movss		[ecx+edx+0], xmm5
15772 		movss		[ecx+edx+4], xmm6
15773 		movss		[ecx+edx+8], xmm0
15774 
15775 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15776 		mov			edx, usedVertNums[4]
15777 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15778 		imul		edx, 12
15779 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15780 
15781 		movss		[ecx+edx+0], xmm5
15782 		movss		[ecx+edx+4], xmm6
15783 		movss		[ecx+edx+8], xmm0
15784 
15785 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15786 		mov			edx, usedVertNums[8]
15787 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15788 		imul		edx, 12
15789 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15790 
15791 		movss		[ecx+edx+0], xmm5
15792 		movss		[ecx+edx+4], xmm6
15793 		movss		[ecx+edx+8], xmm0
15794 
15795 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
15796 		mov			edx, usedVertNums[12]
15797 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
15798 		imul		edx, 12
15799 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
15800 
15801 		movss		[ecx+edx+0], xmm5
15802 		movss		[ecx+edx+4], xmm6
15803 		movss		[ecx+edx+8], xmm0
15804 
15805 		xor			ecx, ecx
15806 		jmp			loopVert4
15807 
15808 	done4:
15809 		test		ecx, ecx
15810 		jz			done
15811 		xor			eax, eax
15812 		mov			edi, numVerts
15813 		imul		edi, 12
15814 		add			edi, lightVectors
15815 
15816 	loopVert1:
15817 		movss		xmm0, lightDir0[eax*4]
15818 		movss		xmm1, lightDir1[eax*4]
15819 		movss		xmm2, lightDir2[eax*4]
15820 
15821 		mov			edx, usedVertNums[eax*4]
15822 		imul		edx, 12
15823 
15824 		movss		xmm3, tangent0[eax*4]
15825 		mulss		xmm3, xmm0
15826 		movss		xmm4, tangent1[eax*4]
15827 		mulss		xmm4, xmm1
15828 		movss		xmm5, tangent2[eax*4]
15829 		mulss		xmm5, xmm2
15830 
15831 		addss		xmm3, xmm4
15832 		addss		xmm5, xmm3
15833 		movss		[edi+edx+0], xmm5
15834 
15835 		movss		xmm3, tangent3[eax*4]
15836 		mulss		xmm3, xmm0
15837 		movss		xmm4, tangent4[eax*4]
15838 		mulss		xmm4, xmm1
15839 		movss		xmm6, tangent5[eax*4]
15840 		mulss		xmm6, xmm2
15841 
15842 		addss		xmm3, xmm4
15843 		addss		xmm6, xmm3
15844 		movss		[edi+edx+4], xmm6
15845 
15846 		mulss		xmm0, normal0[eax*4]
15847 		mulss		xmm1, normal1[eax*4]
15848 		mulss		xmm2, normal2[eax*4]
15849 
15850 		addss		xmm0, xmm1
15851 		addss		xmm0, xmm2
15852 		movss		[edi+edx+8], xmm0
15853 
15854 		inc			eax
15855 		dec			ecx
15856 		jg			loopVert1
15857 
15858 	done:
15859 	}
15860 
15861 #else
15862 
15863 	ALIGN16( float lightVectors0[4] );
15864 	ALIGN16( float lightVectors1[4] );
15865 	ALIGN16( float lightVectors2[4] );
15866 	int numUsedVerts = 0;
15867 
15868 	for ( int i = 0; i < numVerts; i++ ) {
15869 		if ( !used[i] ) {
15870 			continue;
15871 		}
15872 
15873 		const idDrawVert *v = &verts[i];
15874 
15875 		lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
15876 		lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
15877 		lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
15878 
15879 		normal0[numUsedVerts] = v->normal[0];
15880 		normal1[numUsedVerts] = v->normal[1];
15881 		normal2[numUsedVerts] = v->normal[2];
15882 
15883 		tangent0[numUsedVerts] = v->tangents[0][0];
15884 		tangent1[numUsedVerts] = v->tangents[0][1];
15885 		tangent2[numUsedVerts] = v->tangents[0][2];
15886 
15887 		tangent3[numUsedVerts] = v->tangents[1][0];
15888 		tangent4[numUsedVerts] = v->tangents[1][1];
15889 		tangent5[numUsedVerts] = v->tangents[1][2];
15890 
15891 		usedVertNums[numUsedVerts++] = i;
15892 		if ( numUsedVerts < 4 ) {
15893 			continue;
15894 		}
15895 
15896 		lightVectors0[0] = lightDir0[0] * tangent0[0];
15897 		lightVectors0[1] = lightDir0[1] * tangent0[1];
15898 		lightVectors0[2] = lightDir0[2] * tangent0[2];
15899 		lightVectors0[3] = lightDir0[3] * tangent0[3];
15900 
15901 		lightVectors0[0] += lightDir1[0] * tangent1[0];
15902 		lightVectors0[1] += lightDir1[1] * tangent1[1];
15903 		lightVectors0[2] += lightDir1[2] * tangent1[2];
15904 		lightVectors0[3] += lightDir1[3] * tangent1[3];
15905 
15906 		lightVectors0[0] += lightDir2[0] * tangent2[0];
15907 		lightVectors0[1] += lightDir2[1] * tangent2[1];
15908 		lightVectors0[2] += lightDir2[2] * tangent2[2];
15909 		lightVectors0[3] += lightDir2[3] * tangent2[3];
15910 
15911 		lightVectors1[0] = lightDir0[0] * tangent3[0];
15912 		lightVectors1[1] = lightDir0[1] * tangent3[1];
15913 		lightVectors1[2] = lightDir0[2] * tangent3[2];
15914 		lightVectors1[3] = lightDir0[3] * tangent3[3];
15915 
15916 		lightVectors1[0] += lightDir1[0] * tangent4[0];
15917 		lightVectors1[1] += lightDir1[1] * tangent4[1];
15918 		lightVectors1[2] += lightDir1[2] * tangent4[2];
15919 		lightVectors1[3] += lightDir1[3] * tangent4[3];
15920 
15921 		lightVectors1[0] += lightDir2[0] * tangent5[0];
15922 		lightVectors1[1] += lightDir2[1] * tangent5[1];
15923 		lightVectors1[2] += lightDir2[2] * tangent5[2];
15924 		lightVectors1[3] += lightDir2[3] * tangent5[3];
15925 
15926 		lightVectors2[0] = lightDir0[0] * normal0[0];
15927 		lightVectors2[1] = lightDir0[1] * normal0[1];
15928 		lightVectors2[2] = lightDir0[2] * normal0[2];
15929 		lightVectors2[3] = lightDir0[3] * normal0[3];
15930 
15931 		lightVectors2[0] += lightDir1[0] * normal1[0];
15932 		lightVectors2[1] += lightDir1[1] * normal1[1];
15933 		lightVectors2[2] += lightDir1[2] * normal1[2];
15934 		lightVectors2[3] += lightDir1[3] * normal1[3];
15935 
15936 		lightVectors2[0] += lightDir2[0] * normal2[0];
15937 		lightVectors2[1] += lightDir2[1] * normal2[1];
15938 		lightVectors2[2] += lightDir2[2] * normal2[2];
15939 		lightVectors2[3] += lightDir2[3] * normal2[3];
15940 
15941 
15942 		for ( int j = 0; j < 4; j++ ) {
15943 			int n = usedVertNums[j];
15944 
15945 			lightVectors[n][0] = lightVectors0[j];
15946 			lightVectors[n][1] = lightVectors1[j];
15947 			lightVectors[n][2] = lightVectors2[j];
15948 		}
15949 
15950 		numUsedVerts = 0;
15951 	}
15952 
15953 	for ( int i = 0; i < numUsedVerts; i++ ) {
15954 
15955 		lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
15956 		lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
15957 		lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
15958 
15959 		int n = usedVertNums[i];
15960 		lightVectors[n][0] = lightVectors0[i];
15961 		lightVectors[n][1] = lightVectors1[i];
15962 		lightVectors[n][2] = lightVectors2[i];
15963 	}
15964 
15965 #endif
15966 }
15967 
15968 /*
15969 ============
15970 idSIMD_SSE::CreateSpecularTextureCoords
15971 ============
15972 */
CreateSpecularTextureCoords(idVec4 * texCoords,const idVec3 & lightOrigin,const idVec3 & viewOrigin,const idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)15973 void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
15974 
15975 	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
15976 	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
15977 	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
15978 	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
15979 	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
15980 
15981 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
15982 	memset( used, 0, numVerts * sizeof( used[0] ) );
15983 
15984 	for ( int i = numIndexes - 1; i >= 0; i-- ) {
15985 		used[indexes[i]] = true;
15986 	}
15987 
15988 #if 0
15989 
15990 	__asm {
15991 
15992 		mov			eax, numVerts
15993 
15994 		mov			esi, used
15995 		add			esi, eax
15996 
15997 		mov			edi, verts
15998 		sub			edi, DRAWVERT_SIZE
15999 
16000 		neg			eax
16001 		dec			eax
16002 
16003 		mov			ecx, viewOrigin
16004 		movss		xmm6, [ecx+0]
16005 		movhps		xmm6, [ecx+4]
16006 
16007 		mov			ecx, lightOrigin
16008 		movss		xmm7, [ecx+0]
16009 		movhps		xmm7, [ecx+4]
16010 
16011 		mov			ecx, texCoords
16012 		sub			ecx, 4*4
16013 
16014 	loopVert:
16015 		inc			eax
16016 		jge			done
16017 
16018 		add			edi, DRAWVERT_SIZE
16019 		add			ecx, 4*4
16020 
16021 		cmp			byte ptr [esi+eax], 0
16022 		je			loopVert
16023 
16024 		movaps		xmm0, xmm7
16025 		movaps		xmm1, xmm6
16026 		movss		xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
16027 		movhps		xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
16028 		subps		xmm0, xmm2
16029 		subps		xmm1, xmm2
16030 
16031 		movaps		xmm3, xmm0
16032 		movaps		xmm4, xmm1
16033 		mulps		xmm3, xmm3
16034 		mulps		xmm4, xmm4
16035 
16036 		// 0,  X,  1,  2
16037 		// 3,  X,  4,  5
16038 
16039 		movaps		xmm5, xmm3								// xmm5 = 0,  X,  1,  2
16040 		unpcklps	xmm5, xmm4								// xmm5 = 0,  3,  X,  X
16041 		unpckhps	xmm3, xmm4								// xmm3 = 1,  4,  2,  5
16042 		movhlps		xmm4, xmm3								// xmm4 = 2,  5,  4,  5
16043 
16044 		addps		xmm5, xmm3
16045 		addps		xmm5, xmm4
16046 		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
16047 		rsqrtps		xmm5, xmm5
16048 
16049 		movaps		xmm4, xmm5
16050 		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
16051 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
16052 
16053 		mulps		xmm0, xmm4
16054 		mulps		xmm1, xmm5
16055 		addps		xmm0, xmm1
16056 
16057 		movss		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16058 		movhps		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16059 		mulps		xmm2, xmm0
16060 
16061 		movss		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16062 		movhps		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16063 		mulps		xmm3, xmm0
16064 
16065 		movss		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
16066 		movhps		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16067 		mulps		xmm4, xmm0
16068 
16069 		movaps		xmm5, xmm2								// xmm5 = 0,  X,  1,  2
16070 		unpcklps	xmm5, xmm3								// xmm5 = 0,  3,  X,  X
16071 		unpckhps	xmm2, xmm3								// xmm2 = 1,  4,  2,  5
16072 
16073 		movlhps		xmm5, xmm4								// xmm5 = 0,  3,  6,  X
16074 		movhlps		xmm4, xmm2								// xmm4 = 2,  5,  7,  8
16075 		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm2 = 2,  5,  8,  7
16076 
16077 		movaps		xmm3, SIMD_SP_one
16078 
16079 		addps		xmm5, xmm4
16080 		addps		xmm5, xmm2
16081 		movaps		[ecx+0], xmm5
16082 		movss		[ecx+12], xmm3
16083 
16084 		jmp			loopVert
16085 
16086 	done:
16087 	}
16088 
16089 #elif 0
16090 
16091 	for ( int i = 0; i < numVerts; i++ ) {
16092 		if ( !used[i] ) {
16093 			continue;
16094 		}
16095 
16096 		const idDrawVert *v = &verts[i];
16097 
16098 		idVec3 lightDir = lightOrigin - v->xyz;
16099 		idVec3 viewDir = viewOrigin - v->xyz;
16100 
16101 		float ilength;
16102 
16103 		ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
16104 		lightDir[0] *= ilength;
16105 		lightDir[1] *= ilength;
16106 		lightDir[2] *= ilength;
16107 
16108 		ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
16109 		viewDir[0] *= ilength;
16110 		viewDir[1] *= ilength;
16111 		viewDir[2] *= ilength;
16112 
16113 		lightDir += viewDir;
16114 
16115 		texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
16116 		texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
16117 		texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
16118 		texCoords[i][3] = 1.0f;
16119 	}
16120 
16121 
16122 #elif 1
16123 
16124 	ALIGN16( int usedVertNums[4] );
16125 	ALIGN16( float lightDir0[4] );
16126 	ALIGN16( float lightDir1[4] );
16127 	ALIGN16( float lightDir2[4] );
16128 	ALIGN16( float viewDir0[4] );
16129 	ALIGN16( float viewDir1[4] );
16130 	ALIGN16( float viewDir2[4] );
16131 	ALIGN16( float normal0[4] );
16132 	ALIGN16( float normal1[4] );
16133 	ALIGN16( float normal2[4] );
16134 	ALIGN16( float tangent0[4] );
16135 	ALIGN16( float tangent1[4] );
16136 	ALIGN16( float tangent2[4] );
16137 	ALIGN16( float tangent3[4] );
16138 	ALIGN16( float tangent4[4] );
16139 	ALIGN16( float tangent5[4] );
16140 	idVec3 localLightOrigin = lightOrigin;
16141 	idVec3 localViewOrigin = viewOrigin;
16142 
16143 	__asm {
16144 
16145 		xor			ecx, ecx
16146 		mov			eax, numVerts
16147 
16148 		mov			esi, used
16149 		add			esi, eax
16150 
16151 		mov			edi, verts
16152 		sub			edi, DRAWVERT_SIZE
16153 
16154 		neg			eax
16155 		dec			eax
16156 
16157 	loopVert4:
16158 		inc			eax
16159 		jge			done4
16160 
16161 		add			edi, DRAWVERT_SIZE
16162 
16163 		cmp			byte ptr [esi+eax], 0
16164 		je			loopVert4
16165 
16166 		mov			usedVertNums[ecx*4], eax
16167 
16168 		inc			ecx
16169 		cmp			ecx, 4
16170 
16171 		movss		xmm3, localLightOrigin[0]
16172 		movss		xmm4, localLightOrigin[4]
16173 		movss		xmm5, localLightOrigin[8]
16174 
16175 		subss		xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
16176 		subss		xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
16177 		subss		xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
16178 
16179 		movss		lightDir0[ecx*4-4], xmm3
16180 		movss		lightDir1[ecx*4-4], xmm4
16181 		movss		lightDir2[ecx*4-4], xmm5
16182 
16183 		movss		xmm0, localViewOrigin[0]
16184 		movss		xmm1, localViewOrigin[4]
16185 		movss		xmm2, localViewOrigin[8]
16186 
16187 		subss		xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
16188 		subss		xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
16189 		subss		xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
16190 
16191 		movss		viewDir0[ecx*4-4], xmm0
16192 		movss		viewDir1[ecx*4-4], xmm1
16193 		movss		viewDir2[ecx*4-4], xmm2
16194 
16195 		movss		xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
16196 		movss		xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
16197 		movss		xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
16198 
16199 		movss		normal0[ecx*4-4], xmm3
16200 		movss		normal1[ecx*4-4], xmm4
16201 		movss		normal2[ecx*4-4], xmm5
16202 
16203 		movss		xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
16204 		movss		xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
16205 		movss		xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
16206 
16207 		movss		tangent0[ecx*4-4], xmm0
16208 		movss		tangent1[ecx*4-4], xmm1
16209 		movss		tangent2[ecx*4-4], xmm2
16210 
16211 		movss		xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
16212 		movss		xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
16213 		movss		xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
16214 
16215 		movss		tangent3[ecx*4-4], xmm3
16216 		movss		tangent4[ecx*4-4], xmm4
16217 		movss		tangent5[ecx*4-4], xmm5
16218 
16219 		jl			loopVert4
16220 
16221 		movaps		xmm6, lightDir0
16222 		movaps		xmm0, xmm6
16223 		mulps		xmm6, xmm6
16224 		movaps		xmm7, lightDir1
16225 		movaps		xmm1, xmm7
16226 		mulps		xmm7, xmm7
16227 		addps		xmm6, xmm7
16228 		movaps		xmm5, lightDir2
16229 		movaps		xmm2, xmm5
16230 		mulps		xmm5, xmm5
16231 		addps		xmm6, xmm5
16232 		rsqrtps		xmm6, xmm6
16233 
16234 		mulps		xmm0, xmm6
16235 		mulps		xmm1, xmm6
16236 		mulps		xmm2, xmm6
16237 
16238 		movaps		xmm3, viewDir0
16239 		movaps		xmm7, xmm3
16240 		mulps		xmm7, xmm7
16241 		movaps		xmm4, viewDir1
16242 		movaps		xmm6, xmm4
16243 		mulps		xmm6, xmm6
16244 		addps		xmm7, xmm6
16245 		movaps		xmm5, viewDir2
16246 		movaps		xmm6, xmm5
16247 		mulps		xmm6, xmm6
16248 		addps		xmm7, xmm6
16249 		rsqrtps		xmm7, xmm7
16250 
16251 		mulps		xmm3, xmm7
16252 		addps		xmm0, xmm3
16253 		mulps		xmm4, xmm7
16254 		addps		xmm1, xmm4
16255 		mulps		xmm5, xmm7
16256 		addps		xmm2, xmm5
16257 
16258 		movaps		xmm3, tangent0
16259 		mulps		xmm3, xmm0
16260 		movaps		xmm4, tangent1
16261 		mulps		xmm4, xmm1
16262 		addps		xmm3, xmm4
16263 		movaps		xmm5, tangent2
16264 		mulps		xmm5, xmm2
16265 		addps		xmm5, xmm3
16266 
16267 		movaps		xmm3, tangent3
16268 		mulps		xmm3, xmm0
16269 		movaps		xmm4, tangent4
16270 		mulps		xmm4, xmm1
16271 		addps		xmm3, xmm4
16272 		movaps		xmm6, tangent5
16273 		mulps		xmm6, xmm2
16274 		addps		xmm6, xmm3
16275 
16276 		mulps		xmm0, normal0
16277 		mulps		xmm1, normal1
16278 		addps		xmm0, xmm1
16279 		mulps		xmm2, normal2
16280 		addps		xmm0, xmm2
16281 
16282 		mov			ecx, numVerts
16283 		shl			ecx, 4
16284 		mov			edx, usedVertNums[0]
16285 		add			ecx, texCoords
16286 		shl			edx, 4
16287 		movss		xmm3, SIMD_SP_one
16288 
16289 		movss		[ecx+edx+0], xmm5
16290 		movss		[ecx+edx+4], xmm6
16291 		movss		[ecx+edx+8], xmm0
16292 		movss		[ecx+edx+12], xmm3
16293 
16294 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16295 		mov			edx, usedVertNums[4]
16296 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16297 		shl			edx, 4
16298 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16299 
16300 		movss		[ecx+edx+0], xmm5
16301 		movss		[ecx+edx+4], xmm6
16302 		movss		[ecx+edx+8], xmm0
16303 		movss		[ecx+edx+12], xmm3
16304 
16305 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16306 		mov			edx, usedVertNums[8]
16307 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16308 		shl			edx, 4
16309 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16310 
16311 		movss		[ecx+edx+0], xmm5
16312 		movss		[ecx+edx+4], xmm6
16313 		movss		[ecx+edx+8], xmm0
16314 		movss		[ecx+edx+12], xmm3
16315 
16316 		shufps		xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
16317 		mov			edx, usedVertNums[12]
16318 		shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
16319 		shl			edx, 4
16320 		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
16321 
16322 		movss		[ecx+edx+0], xmm5
16323 		movss		[ecx+edx+4], xmm6
16324 		movss		[ecx+edx+8], xmm0
16325 		movss		[ecx+edx+12], xmm3
16326 
16327 		xor			ecx, ecx
16328 		jmp			loopVert4
16329 
16330 	done4:
16331 		test		ecx, ecx
16332 		jz			done
16333 		xor			eax, eax
16334 		mov			edi, numVerts
16335 		shl			edi, 4
16336 		add			edi, texCoords
16337 
16338 	loopVert1:
16339 		movss		xmm6, lightDir0[eax*4]
16340 		movss		xmm0, xmm6
16341 		mulss		xmm6, xmm6
16342 		movss		xmm7, lightDir1[eax*4]
16343 		movss		xmm1, xmm7
16344 		mulss		xmm7, xmm7
16345 		addss		xmm6, xmm7
16346 		movss		xmm5, lightDir2[eax*4]
16347 		movss		xmm2, xmm5
16348 		mulss		xmm5, xmm5
16349 		addss		xmm6, xmm5
16350 		rsqrtss		xmm6, xmm6
16351 
16352 		mulss		xmm0, xmm6
16353 		mulss		xmm1, xmm6
16354 		mulss		xmm2, xmm6
16355 
16356 		movss		xmm3, viewDir0[eax*4]
16357 		movss		xmm7, xmm3
16358 		mulss		xmm7, xmm7
16359 		movss		xmm4, viewDir1[eax*4]
16360 		movss		xmm6, xmm4
16361 		mulss		xmm6, xmm6
16362 		addss		xmm7, xmm6
16363 		movss		xmm5, viewDir2[eax*4]
16364 		movss		xmm6, xmm5
16365 		mulss		xmm6, xmm6
16366 		addss		xmm7, xmm6
16367 		rsqrtss		xmm7, xmm7
16368 
16369 		mulss		xmm3, xmm7
16370 		addss		xmm0, xmm3
16371 		mulss		xmm4, xmm7
16372 		addss		xmm1, xmm4
16373 		mulss		xmm5, xmm7
16374 		addss		xmm2, xmm5
16375 
16376 		mov			edx, usedVertNums[eax*4]
16377 		shl			edx, 4
16378 
16379 		movss		xmm3, tangent0[eax*4]
16380 		mulss		xmm3, xmm0
16381 		movss		xmm4, tangent1[eax*4]
16382 		mulss		xmm4, xmm1
16383 		addss		xmm3, xmm4
16384 		movss		xmm5, tangent2[eax*4]
16385 		mulss		xmm5, xmm2
16386 		addss		xmm5, xmm3
16387 		movss		[edi+edx+0], xmm5
16388 
16389 		movss		xmm3, tangent3[eax*4]
16390 		mulss		xmm3, xmm0
16391 		movss		xmm4, tangent4[eax*4]
16392 		mulss		xmm4, xmm1
16393 		addss		xmm3, xmm4
16394 		movss		xmm6, tangent5[eax*4]
16395 		mulss		xmm6, xmm2
16396 		addss		xmm6, xmm3
16397 		movss		[edi+edx+4], xmm6
16398 
16399 		mulss		xmm0, normal0[eax*4]
16400 		mulss		xmm1, normal1[eax*4]
16401 		addss		xmm0, xmm1
16402 		mulss		xmm2, normal2[eax*4]
16403 		addss		xmm0, xmm2
16404 		movss		[edi+edx+8], xmm0
16405 
16406 		movss		xmm3, SIMD_SP_one
16407 		movss		[edi+edx+12], xmm3
16408 
16409 		inc			eax
16410 		dec			ecx
16411 		jg			loopVert1
16412 
16413 	done:
16414 	}
16415 
16416 #else
16417 
16418 	ALIGN16( int usedVertNums[4] );
16419 	ALIGN16( float lightDir0[4] );
16420 	ALIGN16( float lightDir1[4] );
16421 	ALIGN16( float lightDir2[4] );
16422 	ALIGN16( float viewDir0[4] );
16423 	ALIGN16( float viewDir1[4] );
16424 	ALIGN16( float viewDir2[4] );
16425 	ALIGN16( float normal0[4] );
16426 	ALIGN16( float normal1[4] );
16427 	ALIGN16( float normal2[4] );
16428 	ALIGN16( float tangent0[4] );
16429 	ALIGN16( float tangent1[4] );
16430 	ALIGN16( float tangent2[4] );
16431 	ALIGN16( float tangent3[4] );
16432 	ALIGN16( float tangent4[4] );
16433 	ALIGN16( float tangent5[4] );
16434 	ALIGN16( float texCoords0[4] );
16435 	ALIGN16( float texCoords1[4] );
16436 	ALIGN16( float texCoords2[4] );
16437 	idVec3 localLightOrigin = lightOrigin;
16438 	idVec3 localViewOrigin = viewOrigin;
16439 	int numUsedVerts = 0;
16440 
16441 	for ( int i = 0; i < numVerts; i++ ) {
16442 		if ( !used[i] ) {
16443 			continue;
16444 		}
16445 
16446 		const idDrawVert *v = &verts[i];
16447 
16448 		lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
16449 		lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
16450 		lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
16451 
16452 		viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
16453 		viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
16454 		viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
16455 
16456 		normal0[numUsedVerts] = v->normal[0];
16457 		normal1[numUsedVerts] = v->normal[1];
16458 		normal2[numUsedVerts] = v->normal[2];
16459 
16460 		tangent0[numUsedVerts] = v->tangents[0][0];
16461 		tangent1[numUsedVerts] = v->tangents[0][1];
16462 		tangent2[numUsedVerts] = v->tangents[0][2];
16463 
16464 		tangent3[numUsedVerts] = v->tangents[1][0];
16465 		tangent4[numUsedVerts] = v->tangents[1][1];
16466 		tangent5[numUsedVerts] = v->tangents[1][2];
16467 
16468 		usedVertNums[numUsedVerts++] = i;
16469 		if ( numUsedVerts < 4 ) {
16470 			continue;
16471 		}
16472 
16473 		ALIGN16( float temp[4] );
16474 
16475 		temp[0] = lightDir0[0] * lightDir0[0];
16476 		temp[1] = lightDir0[1] * lightDir0[1];
16477 		temp[2] = lightDir0[2] * lightDir0[2];
16478 		temp[3] = lightDir0[3] * lightDir0[3];
16479 
16480 		temp[0] += lightDir1[0] * lightDir1[0];
16481 		temp[1] += lightDir1[1] * lightDir1[1];
16482 		temp[2] += lightDir1[2] * lightDir1[2];
16483 		temp[3] += lightDir1[3] * lightDir1[3];
16484 
16485 		temp[0] += lightDir2[0] * lightDir2[0];
16486 		temp[1] += lightDir2[1] * lightDir2[1];
16487 		temp[2] += lightDir2[2] * lightDir2[2];
16488 		temp[3] += lightDir2[3] * lightDir2[3];
16489 
16490 		temp[0] = idMath::RSqrt( temp[0] );
16491 		temp[1] = idMath::RSqrt( temp[1] );
16492 		temp[2] = idMath::RSqrt( temp[2] );
16493 		temp[3] = idMath::RSqrt( temp[3] );
16494 
16495 		lightDir0[0] *= temp[0];
16496 		lightDir0[1] *= temp[1];
16497 		lightDir0[2] *= temp[2];
16498 		lightDir0[3] *= temp[3];
16499 
16500 		lightDir1[0] *= temp[0];
16501 		lightDir1[1] *= temp[1];
16502 		lightDir1[2] *= temp[2];
16503 		lightDir1[3] *= temp[3];
16504 
16505 		lightDir2[0] *= temp[0];
16506 		lightDir2[1] *= temp[1];
16507 		lightDir2[2] *= temp[2];
16508 		lightDir2[3] *= temp[3];
16509 
16510 		temp[0] = viewDir0[0] * viewDir0[0];
16511 		temp[1] = viewDir0[1] * viewDir0[1];
16512 		temp[2] = viewDir0[2] * viewDir0[2];
16513 		temp[3] = viewDir0[3] * viewDir0[3];
16514 
16515 		temp[0] += viewDir1[0] * viewDir1[0];
16516 		temp[1] += viewDir1[1] * viewDir1[1];
16517 		temp[2] += viewDir1[2] * viewDir1[2];
16518 		temp[3] += viewDir1[3] * viewDir1[3];
16519 
16520 		temp[0] += viewDir2[0] * viewDir2[0];
16521 		temp[1] += viewDir2[1] * viewDir2[1];
16522 		temp[2] += viewDir2[2] * viewDir2[2];
16523 		temp[3] += viewDir2[3] * viewDir2[3];
16524 
16525 		temp[0] = idMath::RSqrt( temp[0] );
16526 		temp[1] = idMath::RSqrt( temp[1] );
16527 		temp[2] = idMath::RSqrt( temp[2] );
16528 		temp[3] = idMath::RSqrt( temp[3] );
16529 
16530 		viewDir0[0] *= temp[0];
16531 		viewDir0[1] *= temp[1];
16532 		viewDir0[2] *= temp[2];
16533 		viewDir0[3] *= temp[3];
16534 
16535 		viewDir1[0] *= temp[0];
16536 		viewDir1[1] *= temp[1];
16537 		viewDir1[2] *= temp[2];
16538 		viewDir1[3] *= temp[3];
16539 
16540 		viewDir2[0] *= temp[0];
16541 		viewDir2[1] *= temp[1];
16542 		viewDir2[2] *= temp[2];
16543 		viewDir2[3] *= temp[3];
16544 
16545 		lightDir0[0] += viewDir0[0];
16546 		lightDir0[1] += viewDir0[1];
16547 		lightDir0[2] += viewDir0[2];
16548 		lightDir0[3] += viewDir0[3];
16549 
16550 		lightDir1[0] += viewDir1[0];
16551 		lightDir1[1] += viewDir1[1];
16552 		lightDir1[2] += viewDir1[2];
16553 		lightDir1[3] += viewDir1[3];
16554 
16555 		lightDir2[0] += viewDir2[0];
16556 		lightDir2[1] += viewDir2[1];
16557 		lightDir2[2] += viewDir2[2];
16558 		lightDir2[3] += viewDir2[3];
16559 
16560 		texCoords0[0] = lightDir0[0] * tangent0[0];
16561 		texCoords0[1] = lightDir0[1] * tangent0[1];
16562 		texCoords0[2] = lightDir0[2] * tangent0[2];
16563 		texCoords0[3] = lightDir0[3] * tangent0[3];
16564 
16565 		texCoords0[0] += lightDir1[0] * tangent1[0];
16566 		texCoords0[1] += lightDir1[1] * tangent1[1];
16567 		texCoords0[2] += lightDir1[2] * tangent1[2];
16568 		texCoords0[3] += lightDir1[3] * tangent1[3];
16569 
16570 		texCoords0[0] += lightDir2[0] * tangent2[0];
16571 		texCoords0[1] += lightDir2[1] * tangent2[1];
16572 		texCoords0[2] += lightDir2[2] * tangent2[2];
16573 		texCoords0[3] += lightDir2[3] * tangent2[3];
16574 
16575 		texCoords1[0] = lightDir0[0] * tangent3[0];
16576 		texCoords1[1] = lightDir0[1] * tangent3[1];
16577 		texCoords1[2] = lightDir0[2] * tangent3[2];
16578 		texCoords1[3] = lightDir0[3] * tangent3[3];
16579 
16580 		texCoords1[0] += lightDir1[0] * tangent4[0];
16581 		texCoords1[1] += lightDir1[1] * tangent4[1];
16582 		texCoords1[2] += lightDir1[2] * tangent4[2];
16583 		texCoords1[3] += lightDir1[3] * tangent4[3];
16584 
16585 		texCoords1[0] += lightDir2[0] * tangent5[0];
16586 		texCoords1[1] += lightDir2[1] * tangent5[1];
16587 		texCoords1[2] += lightDir2[2] * tangent5[2];
16588 		texCoords1[3] += lightDir2[3] * tangent5[3];
16589 
16590 		texCoords2[0] = lightDir0[0] * normal0[0];
16591 		texCoords2[1] = lightDir0[1] * normal0[1];
16592 		texCoords2[2] = lightDir0[2] * normal0[2];
16593 		texCoords2[3] = lightDir0[3] * normal0[3];
16594 
16595 		texCoords2[0] += lightDir1[0] * normal1[0];
16596 		texCoords2[1] += lightDir1[1] * normal1[1];
16597 		texCoords2[2] += lightDir1[2] * normal1[2];
16598 		texCoords2[3] += lightDir1[3] * normal1[3];
16599 
16600 		texCoords2[0] += lightDir2[0] * normal2[0];
16601 		texCoords2[1] += lightDir2[1] * normal2[1];
16602 		texCoords2[2] += lightDir2[2] * normal2[2];
16603 		texCoords2[3] += lightDir2[3] * normal2[3];
16604 
16605 		for ( int j = 0; j < 4; j++ ) {
16606 			int n = usedVertNums[j];
16607 
16608 			texCoords[n][0] = texCoords0[j];
16609 			texCoords[n][1] = texCoords1[j];
16610 			texCoords[n][2] = texCoords2[j];
16611 			texCoords[n][3] = 1.0f;
16612 		}
16613 
16614 		numUsedVerts = 0;
16615 	}
16616 
16617 	for ( int i = 0; i < numUsedVerts; i++ ) {
16618 		float temp;
16619 
16620 		temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
16621 		temp = idMath::RSqrt( temp );
16622 
16623 		lightDir0[i] *= temp;
16624 		lightDir1[i] *= temp;
16625 		lightDir2[i] *= temp;
16626 
16627 		temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
16628 		temp = idMath::RSqrt( temp );
16629 
16630 		viewDir0[i] *= temp;
16631 		viewDir1[i] *= temp;
16632 		viewDir2[i] *= temp;
16633 
16634 		lightDir0[i] += viewDir0[i];
16635 		lightDir1[i] += viewDir1[i];
16636 		lightDir2[i] += viewDir2[i];
16637 
16638 		texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
16639 		texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
16640 		texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
16641 
16642 		int n = usedVertNums[i];
16643 		texCoords[n][0] = texCoords0;
16644 		texCoords[n][1] = texCoords1;
16645 		texCoords[n][2] = texCoords2;
16646 		texCoords[n][3] = 1.0f;
16647 	}
16648 
16649 #endif
16650 }
16651 
16652 /*
16653 ============
16654 idSIMD_SSE::CreateShadowCache
16655 ============
16656 */
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
	// SSE path. For each vertex whose vertRemap entry is zero (not yet
	// remapped, matching the 'if ( vertRemap[i] ) continue;' test in the C
	// reference below), appends two idVec4's to vertexCache:
	//   [0] the vertex position with w = 1.0
	//   [1] ( position - lightOrigin ) with w = 0.0  (projected away from the light)
	// and writes the index of the pair into vertRemap[i].
	// Returns the total number of idVec4's written.
	// NOTE(review): vertices with a non-zero vertRemap entry are skipped, so
	// callers presumably pre-clear the entries that should be cached — verify
	// against the callers.
	int outVerts;

	__asm {
		push		ebx

		// xmm5 = SIMD_SP_lastOne (w-component mask/one), xmm6 = xmm7 = lightOrigin
		// arranged as (x, y, z, ?) with w forced to 1.0; two copies are kept so
		// consecutive iterations don't serialize on the same register
		mov			esi, lightOrigin
		movaps		xmm5, SIMD_SP_lastOne
		movss		xmm6, [esi+0]
		movhps		xmm6, [esi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm6, SIMD_SP_lastOne
		movaps		xmm7, xmm6

		// ebx = 0 (compare value for "unremapped"), ecx = running output count
		xor			ebx, ebx
		xor			ecx, ecx

		mov			edx, vertRemap
		mov			esi, verts
		mov			edi, vertexCache
		mov			eax, numVerts
		and			eax, ~3						// process verts in groups of four
		jz			done4
		shl			eax, 2						// eax = byte size of the remap entries handled here
		add			edx, eax
		neg			eax							// negative index counting up to zero

	loop4:
		prefetchnta	[edx+128]
		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		// --- vertex 0 of the group ---
		cmp         dword ptr [edx+eax+0], ebx
		jne         skip1

		mov			dword ptr [edx+eax+0], ecx	// vertRemap[i] = outVerts
		// load xyz as (z, ?, x, y) then shuffle to (x, y, z, ?)
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps		xmm0, xmm5					// force w = 1.0
		movaps		[edi+0*16], xmm0
		subps		xmm0, xmm6					// w: 1 - 1 = 0 for the projected copy
		movaps		[edi+1*16], xmm0
		add			edi, 2*16

	skip1:
		// --- vertex 1 (alternate load order/registers for scheduling) ---
		cmp         dword ptr [edx+eax+4], ebx
		jne         skip2

		mov			dword ptr [edx+eax+4], ecx
		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add			ecx, 2
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm1, xmm5
		movaps		[edi+0*16], xmm1
		subps		xmm1, xmm7
		movaps		[edi+1*16], xmm1
		add			edi, 2*16

	skip2:
		// --- vertex 2 ---
		cmp         dword ptr [edx+eax+8], ebx
		jne         skip3

		mov			dword ptr [edx+eax+8], ecx
		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps		xmm2, xmm5
		movaps		[edi+0*16], xmm2
		subps		xmm2, xmm6
		movaps		[edi+1*16], xmm2
		add			edi, 2*16

	skip3:
		// --- vertex 3 ---
		cmp         dword ptr [edx+eax+12], ebx
		jne         skip4

		mov			dword ptr [edx+eax+12], ecx
		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add			ecx, 2
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps		xmm3, xmm5
		movaps		[edi+0*16], xmm3
		subps		xmm3, xmm7
		movaps		[edi+1*16], xmm3
		add			edi, 2*16

	skip4:
		add			esi, 4*DRAWVERT_SIZE
		add			eax, 4*4
		jl			loop4

	done4:
		// handle the remaining numVerts & 3 vertices one at a time
		mov			eax, numVerts
		and			eax, 3
		jz			done1
		shl			eax, 2
		add			edx, eax
		neg			eax

	loop1:
		cmp         dword ptr [edx+eax+0], ebx
		jne         skip0

		mov			dword ptr [edx+eax+0], ecx
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add			ecx, 2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps		xmm0, xmm5
		movaps		[edi+0*16], xmm0
		subps		xmm0, xmm6
		movaps		[edi+1*16], xmm0
		add			edi, 2*16

	skip0:

		add			esi, DRAWVERT_SIZE
		add			eax, 4
		jl			loop1

	done1:
		pop			ebx
		mov			outVerts, ecx				// final count of idVec4's written
	}
	return outVerts;

#else

	// C reference implementation of the algorithm above
	int outVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// R_SetupProjection() builds the projection matrix with a slight crunch
		// for depth, which keeps this w=0 division from rasterizing right at the
		// wrap around point and causing depth fighting with the rear caps
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;

#endif
}
16815 
16816 /*
16817 ============
16818 idSIMD_SSE::CreateVertexProgramShadowCache
16819 ============
16820 */
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1

	// Emits two idVec4's per input vertex (see the C fallback below):
	//   vertexCache[i*2+0] = ( x, y, z, 1.0f )   // near-cap copy
	//   vertexCache[i*2+1] = ( x, y, z, 0.0f )   // w = 0 copy, projected to infinity by the vertex program
	// SIMD_SP_lastOne is assumed to be ( 0, 0, 0, 1.0f ); OR-ing it in sets w to 1.0f.
	__asm {
		movaps		xmm4, SIMD_SP_lastOne		// keep ( 0, 0, 0, 1 ) in four registers so the
		movaps		xmm5, xmm4					// four independent streams below have no
		movaps		xmm6, xmm4					// register dependencies on each other
		movaps		xmm7, xmm4

		mov			esi, verts
		mov			edi, vertexCache
		mov			eax, numVerts
		and			eax, ~3						// main loop handles vertices in groups of 4
		jz			done4
		shl			eax, 5						// 32 bytes of output (two idVec4's) per vertex
		add			edi, eax
		neg			eax

	loop4:
		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		// movss loads z into lane 0 (upper lanes zeroed), movhps loads x,y into
		// lanes 2,3; the shuffle rearranges to ( x, y, z, 0 )
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+1*16], xmm0		// store the w = 0.0f copy first
		orps		xmm0, xmm4					// set w = 1.0f
		movaps		[edi+eax+0*16], xmm0

		// movss loads x, movhps loads y,z; shuffled to ( x, y, z, 0 )
		movss		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps		[edi+eax+3*16], xmm1
		orps		xmm1, xmm5
		movaps		[edi+eax+2*16], xmm1

		movss		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+5*16], xmm2
		orps		xmm2, xmm6
		movaps		[edi+eax+4*16], xmm2

		movss		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps		[edi+eax+7*16], xmm3
		orps		xmm3, xmm7
		movaps		[edi+eax+6*16], xmm3

		add			esi, 4*DRAWVERT_SIZE
		add			eax, 4*8*4					// 4 vertices * 32 output bytes
		jl			loop4

	done4:
		// tail loop for the remaining numVerts & 3 vertices, one at a time
		mov			eax, numVerts
		and			eax, 3
		jz			done1
		shl			eax, 5
		add			edi, eax
		neg			eax

	loop1:
		movss		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps		[edi+eax+1*16], xmm0
		orps		xmm0, xmm4
		movaps		[edi+eax+0*16], xmm0

		add			esi, DRAWVERT_SIZE
		add			eax, 8*4
		jl			loop1

	done1:
	}
	return numVerts * 2;

#else

	// straightforward C reference implementation of the assembly above
	for ( int i = 0; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;

		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;

#endif
}
16916 
16917 /*
16918 ============
16919 SSE_UpSample11kHzMonoPCMTo44kHz
16920 ============
16921 */
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 11kHz -> 44kHz: each 16-bit source sample is converted to float and
	// written 4 times in a row (sample duplication, no filtering).
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		and			eax, ~1				// main loop handles 2 samples per iteration
		jz			done2
		shl			eax, 1				// 2 bytes per source short
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*4*4			// 2 samples * 4 copies * 4 bytes

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast to all 4 lanes
		movlps		[edi-2*4*4+0], xmm0
		movhps		[edi-2*4*4+8], xmm0

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi-1*4*4+0], xmm1
		movhps		[edi-1*4*4+8], xmm1

		add			eax, 2*2
		jl			loop2

	done2:
		// handle a trailing odd sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0
		movhps		[edi+8], xmm0

	done:
	}
}
16967 
16968 /*
16969 ============
16970 SSE_UpSample11kHzStereoPCMTo44kHz
16971 ============
16972 */
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 11kHz -> 44kHz stereo: each interleaved L,R short pair is converted to
	// float and the (L,R) pair is written 4 times (8 output floats per pair).
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		test		eax, ~1				// NOTE: test does not clear bit 0 of eax (unlike the
		jz			done2				// 'and' in the mono variants) — presumably numSamples
		shl			eax, 1				// is always even for stereo data; there is no tail loop
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 8*4			// 8 output floats per L,R pair

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx

		unpcklps	xmm0, xmm1			// xmm0 = ( L, R, ?, ? )

		movlps		[edi-8*4+0], xmm0	// write ( L, R ) four times
		movlps		[edi-8*4+8], xmm0
		movlps		[edi-4*4+0], xmm0
		movlps		[edi-4*4+8], xmm0

		add			eax, 2*2
		jl			loop2

	done2:
	}
}
17008 
17009 /*
17010 ============
17011 SSE_UpSample22kHzMonoPCMTo44kHz
17012 ============
17013 */
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 22kHz -> 44kHz: each 16-bit source sample is converted to float and
	// written twice in a row.
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		and			eax, ~1				// main loop handles 2 samples per iteration
		jz			done2
		shl			eax, 1				// 2 bytes per source short
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 4*4			// 2 samples * 2 copies * 4 bytes

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx

		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = ( s0, s0, s1, s1 )
		movlps		[edi-4*4+0], xmm0
		movhps		[edi-4*4+8], xmm0

		add			eax, 2*2
		jl			loop2

	done2:
		// handle a trailing odd sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi], xmm0

	done:
	}
}
17056 
17057 /*
17058 ============
17059 SSE_UpSample22kHzStereoPCMTo44kHz
17060 ============
17061 */
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 22kHz -> 44kHz stereo: each interleaved L,R short pair is converted to
	// float and written twice: L R L R.
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		test		eax, ~1				// NOTE: test does not clear bit 0 (see 11kHz stereo
		jz			done2				// variant) — presumably numSamples is always even
		shl			eax, 1				// for stereo data; there is no tail loop
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 4*4			// 4 output floats per L,R pair

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx
		movss		[edi-4*4], xmm0		// L
		movss		[edi-2*4], xmm0		// L

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx
		movss		[edi-3*4], xmm1		// R
		movss		[edi-1*4], xmm1		// R

		add			eax, 2*2
		jl			loop2

	done2:
	}
}
17094 
17095 /*
17096 ============
17097 SSE_UpSample44kHzMonoPCMTo44kHz
17098 ============
17099 */
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	// 44kHz -> 44kHz: no duplication needed, this is a plain short -> float
	// conversion of every sample.
	__asm {
		mov			esi, src
		mov			edi, dest

		mov			eax, numSamples
		and			eax, ~1				// main loop handles 2 samples per iteration
		jz			done2
		shl			eax, 1				// 2 bytes per source short
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*4			// 2 output floats

		movsx		ecx, word ptr [esi+eax+0]
		cvtsi2ss	xmm0, ecx
		movss		[edi-2*4], xmm0

		movsx		edx, word ptr [esi+eax+2]
		cvtsi2ss	xmm1, edx
		movss		[edi-1*4], xmm1

		add			eax, 2*2
		jl			loop2

	done2:
		// handle a trailing odd sample, if any
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movsx		ecx, word ptr [esi]
		cvtsi2ss	xmm0, ecx
		movss		[edi], xmm0

	done:
	}
}
17139 
17140 /*
17141 ============
17142 idSIMD_SSE::UpSamplePCMTo44kHz
17143 
17144   Duplicate samples for 44kHz output.
17145 ============
17146 */
UpSamplePCMTo44kHz(float * dest,const short * src,const int numSamples,const int kHz,const int numChannels)17147 void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
17148 	if ( kHz == 11025 ) {
17149 		if ( numChannels == 1 ) {
17150 			SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
17151 		} else {
17152 			SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
17153 		}
17154 	} else if ( kHz == 22050 ) {
17155 		if ( numChannels == 1 ) {
17156 			SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
17157 		} else {
17158 			SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
17159 		}
17160 	} else if ( kHz == 44100 ) {
17161 		SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
17162 	} else {
17163 		assert( 0 );
17164 	}
17165 }
17166 
17167 
// DG: at least in the 22kHz stereo OGG case with numSamples % 4 != 0 this is broken (it writes 4 floats too many, which can corrupt the stack, see #303),
//     so let's just not use it anymore. It's MSVC+32bit only anyway, and I doubt it gives noticeable speedups, so I don't feel like trying to understand and fix it.
17170 #if 0
17171 /*
17172 ============
17173 SSE_UpSample11kHzMonoOGGTo44kHz
17174 ============
17175 */
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// DEAD CODE: the whole OGG upsample path is compiled out (#if 0 above).
	// Scales each decoded OGG float by 32768 (to PCM range) and writes it 4 times.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast the scale factor

		mov			eax, numSamples
		and			eax, ~1				// 2 samples per iteration
		jz			done2
		shl			eax, 2				// 4 bytes per source float
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*16

		movss		xmm0, [esi+eax+0]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi-32], xmm0
		movlps		[edi-24], xmm0

		movss		xmm1, [esi+eax+4]
		mulss		xmm1, xmm7
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi-16], xmm1
		movlps		[edi- 8], xmm1

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing odd sample
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [esi]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0
		movlps		[edi+8], xmm0

	done:
	}
}
17224 
17225 /*
17226 ============
17227 SSE_UpSample11kHzStereoOGGTo44kHz
17228 ============
17229 */
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// DEAD CODE (#if 0 above). OGG decodes into separate channel arrays:
	// src[0] = left, src[1] = right. Scales by 32768 and writes each (L,R)
	// pair 4 times.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]		// left channel array
		mov			edx, [esi+4]		// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1				// NOTE: 2 bytes per sample here although the sources
		add			ecx, eax			// are floats — this mismatched stride is likely part
		add			edx, eax			// of why this path was disabled; do not re-enable
		neg			eax					// without auditing (see #303)

		align		16
	loop2:
		add			edi, 4*16

		movlps		xmm0, [ecx+eax]		// two left samples
		movlps		xmm1, [edx+eax]		// two right samples
		unpcklps	xmm0, xmm1			// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-8*8], xmm0		// ( L0, R0 ) x 4
		movlps		[edi-7*8], xmm0
		movlps		[edi-6*8], xmm0
		movlps		[edi-5*8], xmm0
		movhps		[edi-4*8], xmm0		// ( L1, R1 ) x 4
		movhps		[edi-3*8], xmm0
		movhps		[edi-2*8], xmm0
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing odd sample pair
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0
		movlps		[edi+1*8], xmm0
		movlps		[edi+2*8], xmm0
		movlps		[edi+3*8], xmm0

	done:
	}
}
17285 
17286 /*
17287 ============
17288 SSE_UpSample22kHzMonoOGGTo44kHz
17289 ============
17290 */
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// DEAD CODE (#if 0 above). Scales each decoded OGG float by 32768 and
	// writes it twice (22kHz -> 44kHz duplication).
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// broadcast the scale factor

		mov			eax, numSamples
		and			eax, ~1				// 2 samples per iteration
		jz			done2
		shl			eax, 2				// 4 bytes per source float
		add			esi, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*8

		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// ( s0, s0, s1, s1 )
		mulps		xmm0, xmm7
		movlps		[edi-16], xmm0
		movhps		[edi- 8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing odd sample
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [esi]
		mulss		xmm0, xmm7
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps		[edi+0], xmm0

	done:
	}
}
17333 
17334 /*
17335 ============
17336 SSE_UpSample22kHzStereoOGGTo44kHz
17337 ============
17338 */
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// DEAD CODE (#if 0 above) — this is the variant reported to overrun the
	// destination buffer (see #303 in the note above the #if 0).
	// Scales by 32768 and writes each (L,R) pair twice.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]		// left channel array
		mov			edx, [esi+4]		// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1				// NOTE: 2 bytes per sample despite float sources —
		add			ecx, eax			// suspect stride, do not re-enable without auditing
		add			edx, eax
		neg			eax

		align		16
	loop2:
		add			edi, 2*16

		movlps		xmm0, [ecx+eax]		// two left samples
		movlps		xmm1, [edx+eax]		// two right samples
		unpcklps	xmm0, xmm1			// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-4*8], xmm0		// ( L0, R0 ) twice
		movlps		[edi-3*8], xmm0
		movhps		[edi-2*8], xmm0		// ( L1, R1 ) twice
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing odd sample pair
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0
		movlps		[edi+1*8], xmm0

	done:
	}
}
17388 
17389 /*
17390 ============
17391 SSE_UpSample44kHzMonoOGGTo44kHz
17392 ============
17393 */
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	// DEAD CODE (#if 0 above). No rate change needed at 44kHz; just scale every
	// sample by 32768 via the KFLOAT_CA multiply-by-constant helper macro.
	float constant = 32768.0f;
	KFLOAT_CA( mul, dest, src, constant, numSamples )
}
17398 
17399 /*
17400 ============
17401 SSE_UpSample44kHzStereoOGGTo44kHz
17402 ============
17403 */
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	// DEAD CODE (#if 0 above). Interleaves the separate left/right OGG channel
	// arrays into the output, scaling by 32768; no duplication at 44kHz.
	float constant = 32768.0f;
	__asm {
		mov			esi, src
		mov			ecx, [esi+0]		// left channel array
		mov			edx, [esi+4]		// right channel array
		mov			edi, dest
		movss		xmm7, constant
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		mov			eax, numSamples
		and			eax, ~1
		jz			done2
		shl			eax, 1				// NOTE: 2 bytes per sample despite float sources —
		add			ecx, eax			// suspect stride, do not re-enable without auditing
		add			edx, eax
		neg			eax

		align		16
	loop2:
		add			edi, 16

		movlps		xmm0, [ecx+eax]		// two left samples
		movlps		xmm1, [edx+eax]		// two right samples
		unpcklps	xmm0, xmm1			// ( L0, R0, L1, R1 )
		mulps		xmm0, xmm7
		movlps		[edi-2*8], xmm0
		movhps		[edi-1*8], xmm0

		add			eax, 2*4
		jl			loop2

	done2:
		// trailing odd sample pair
		mov			eax, numSamples
		and			eax, 1
		jz			done

		movss		xmm0, [ecx]
		movss		xmm1, [edx]
		unpcklps	xmm0, xmm1
		mulps		xmm0, xmm7
		movlps		[edi+0*8], xmm0

	done:
	}
}
17450 
17451 /*
17452 ============
17453 idSIMD_SSE::UpSampleOGGTo44kHz
17454 
17455   Duplicate samples for 44kHz output.
17456 ============
17457 */
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	// DEAD CODE (#if 0 above): dispatcher for the disabled OGG upsamplers.
	// Mono variants take the single channel array ogg[0]; stereo variants take
	// the whole array-of-channel-pointers.
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else {
		assert( 0 );
	}
}
17481 #endif
17482 
17483 /*
17484 ============
17485 idSIMD_SSE::MixSoundTwoSpeakerMono
17486 ============
17487 */
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume ramp from lastV to currentV over the whole mix buffer
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2				// 4 bytes per mono input sample
		add			esi, eax
		neg			eax

		// xmm6 = interleaved volumes for two consecutive frames ( L0, R0, L1, R1 ),
		// xmm7 = the matching two-frame increment ( 2*incL, 2*incR, ... )
		mov			ecx, lastV
		movlps		xmm6, [ecx]
		xorps		xmm7, xmm7
		movhps		xmm7, incs
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps		xmm6, xmm7			// advance the second frame's volumes by one step
		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps		xmm7, xmm7			// step two frames per addps below

	loop16:
		// each iteration mixes 8 mono input samples into 16 interleaved floats
		add			edi, 4*4*4

		movaps		xmm0, [esi+eax+0*4*4]
		movaps		xmm1, xmm0
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// ( s0, s0, s1, s1 )
		mulps		xmm0, xmm6
		addps		xmm0, [edi-4*4*4]
		addps		xmm6, xmm7
		movaps		[edi-4*4*4], xmm0

		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )	// ( s2, s2, s3, s3 )
		mulps		xmm1, xmm6
		addps		xmm1, [edi-3*4*4]
		addps		xmm6, xmm7
		movaps		[edi-3*4*4], xmm1

		movaps		xmm2, [esi+eax+1*4*4]
		movaps		xmm3, xmm2
		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps		xmm2, xmm6
		addps		xmm2, [edi-2*4*4]
		addps		xmm6, xmm7
		movaps		[edi-2*4*4], xmm2

		shufps		xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps		xmm3, xmm6
		addps		xmm3, [edi-1*4*4]
		addps		xmm6, xmm7
		movaps		[edi-1*4*4], xmm3

		add			eax, 2*4*4

		jl			loop16
	}

#else

	// C reference: mix a mono source into an interleaved stereo mix buffer,
	// ramping the left/right volumes linearly across the buffer
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i+0] * sL0;
		mixBuffer[i*2+1] += samples[i+0] * sR0;
		mixBuffer[i*2+2] += samples[i+1] * sL1;
		mixBuffer[i*2+3] += samples[i+1] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}

#endif
}
17585 
17586 /*
17587 ============
17588 idSIMD_SSE::MixSoundTwoSpeakerStereo
17589 ============
17590 */
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume ramp from lastV to currentV over the whole mix buffer
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 3				// 8 bytes per interleaved stereo input frame
		add			esi, eax
		neg			eax

		// xmm6 = interleaved volumes for two consecutive frames ( L0, R0, L1, R1 ),
		// xmm7 = the matching two-frame increment (same setup as the mono variant)
		mov			ecx, lastV
		movlps		xmm6, [ecx]
		xorps		xmm7, xmm7
		movhps		xmm7, incs
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps		xmm6, xmm7
		shufps		xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps		xmm7, xmm7

	loop16:
		// input is already interleaved ( L, R, L, R, ... ), so each 4-float
		// load covers two frames and multiplies straight against the ramp;
		// 8 frames (16 floats) mixed per iteration
		add			edi, 4*4*4

		movaps		xmm0, [esi+eax+0*4*4]
		mulps		xmm0, xmm6
		addps		xmm0, [edi-4*4*4]
		addps		xmm6, xmm7
		movaps		[edi-4*4*4], xmm0

		movaps		xmm2, [esi+eax+1*4*4]
		mulps		xmm2, xmm6
		addps		xmm2, [edi-3*4*4]
		addps		xmm6, xmm7
		movaps		[edi-3*4*4], xmm2

		movaps		xmm3, [esi+eax+2*4*4]
		mulps		xmm3, xmm6
		addps		xmm3, [edi-2*4*4]
		addps		xmm6, xmm7
		movaps		[edi-2*4*4], xmm3

		movaps		xmm4, [esi+eax+3*4*4]
		mulps		xmm4, xmm6
		addps		xmm4, [edi-1*4*4]
		addps		xmm6, xmm7
		movaps		[edi-1*4*4], xmm4

		add			eax, 4*4*4

		jl			loop16
	}

#else

	// C reference: mix an interleaved stereo source into an interleaved stereo
	// mix buffer, ramping the left/right volumes linearly across the buffer
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}

#endif
}
17684 
17685 /*
17686 ============
17687 idSIMD_SSE::MixSoundSixSpeakerMono
17688 ============
17689 */
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	// per-sample volume ramp for each of the 6 speaker channels
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2				// 4 bytes per mono input sample
		add			esi, eax
		neg			eax

		// The 6-channel volumes for two consecutive frames (12 floats) are held
		// across three registers:
		//   xmm2 = ( v0, v1, v2, v3 )               frame n, channels 0-3
		//   xmm3 = ( v4, v5, v0', v1' )             frame n ch 4-5, frame n+1 ch 0-1
		//   xmm4 = ( v2', v3', v4', v5' )           frame n+1, channels 2-5
		// where primed values are one ramp step ahead.
		mov			ecx, lastV
		movlps		xmm2, [ecx+ 0]
		movhps		xmm2, [ecx+ 8]
		movlps		xmm3, [ecx+16]
		movaps		xmm4, xmm2
		shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps		xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )

		// xmm5/xmm6/xmm7 = the matching per-two-frame increments for xmm2/3/4
		xorps		xmm5, xmm5
		movhps		xmm5, incs
		movlps		xmm7, incs+8
		movhps		xmm7, incs+16
		addps		xmm3, xmm5			// advance the frame n+1 halves by one step
		addps		xmm4, xmm7
		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps		xmm6, xmm7
		shufps		xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps		xmm5, xmm5			// step two frames per addps below
		addps		xmm6, xmm6
		addps		xmm7, xmm7

	loop24:
		// each iteration mixes 4 mono samples into 24 interleaved floats
		add			edi, 6*16

		movaps		xmm0, [esi+eax]		// ( s0, s1, s2, s3 )

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// s0 -> frame 0 ch 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-6*16]
		addps		xmm2, xmm5
		movaps		[edi-6*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// s0 -> frame 0 ch 4-5, s1 -> frame 1 ch 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-5*16]
		addps		xmm3, xmm6
		movaps		[edi-5*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )	// s1 -> frame 1 ch 2-5
		mulps		xmm1, xmm4
		addps		xmm1, [edi-4*16]
		addps		xmm4, xmm7
		movaps		[edi-4*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )	// s2 -> frame 2 ch 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-3*16]
		addps		xmm2, xmm5
		movaps		[edi-3*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )	// s2 -> frame 2 ch 4-5, s3 -> frame 3 ch 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-2*16]
		addps		xmm3, xmm6
		movaps		[edi-2*16], xmm1

		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )	// s3 -> frame 3 ch 2-5
		mulps		xmm0, xmm4
		addps		xmm0, [edi-1*16]
		addps		xmm4, xmm7
		movaps		[edi-1*16], xmm0

		add			eax, 4*4

		jl			loop24
	}

#else

	// C reference: mix a mono source into an interleaved 6-channel mix buffer,
	// ramping each channel's volume linearly across the buffer
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0  = lastV[0];
	sL1  = lastV[1];
	sL2  = lastV[2];
	sL3  = lastV[3];
	sL4  = lastV[4];
	sL5  = lastV[5];

	sL6  = lastV[0] + incL0;
	sL7  = lastV[1] + incL1;
	sL8  = lastV[2] + incL2;
	sL9  = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i+0] * sL3;

		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
		mixBuffer[i*6+ 7] += samples[i+1] * sL7;

		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
		mixBuffer[i*6+10] += samples[i+1] * sL10;
		mixBuffer[i*6+11] += samples[i+1] * sL11;

		sL0  += incL0;
		sL1  += incL1;
		sL2  += incL2;
		sL3  += incL3;

		sL4  += incL4;
		sL5  += incL5;
		sL6  += incL0;
		sL7  += incL1;

		sL8  += incL2;
		sL9  += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}

#endif
}
17854 
17855 /*
17856 ============
17857 idSIMD_SSE::MixSoundSixSpeakerStereo
17858 ============
17859 */
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// per-sample volume ramp for each of the 6 speaker channels
	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov			eax, MIXBUFFER_SAMPLES
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 3				// 8 bytes per interleaved stereo input frame
		add			esi, eax
		neg			eax

		// Same volume register layout as MixSoundSixSpeakerMono:
		//   xmm2 = ( v0, v1, v2, v3 )     frame n ch 0-3
		//   xmm3 = ( v4, v5, v0', v1' )   frame n ch 4-5, frame n+1 ch 0-1
		//   xmm4 = ( v2', v3', v4', v5' ) frame n+1 ch 2-5
		mov			ecx, lastV
		movlps		xmm2, [ecx+ 0]
		movhps		xmm2, [ecx+ 8]
		movlps		xmm3, [ecx+16]
		movaps		xmm4, xmm2
		shufps		xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps		xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )

		// xmm5/xmm6/xmm7 = matching per-two-frame increments for xmm2/3/4
		xorps		xmm5, xmm5
		movhps		xmm5, incs
		movlps		xmm7, incs+ 8
		movhps		xmm7, incs+16
		addps		xmm3, xmm5
		addps		xmm4, xmm7
		shufps		xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps		xmm6, xmm7
		shufps		xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps		xmm5, xmm5
		addps		xmm6, xmm6
		addps		xmm7, xmm7

	loop12:
		// each iteration mixes 2 stereo input frames ( L0, R0, L1, R1 ) into
		// 12 interleaved output floats; the shuffles route L to the left/center
		// channels and R to the right channels, matching the C fallback below
		add			edi, 3*16

		movaps		xmm0, [esi+eax+0]	// ( L0, R0, L1, R1 )

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )	// ( L0, R0, L0, L0 ) for ch 0-3
		mulps		xmm1, xmm2
		addps		xmm1, [edi-3*16]
		addps		xmm2, xmm5
		movaps		[edi-3*16], xmm1

		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )	// ( L0, R0, L1, R1 ) for ch 4-5 / next ch 0-1
		mulps		xmm1, xmm3
		addps		xmm1, [edi-2*16]
		addps		xmm3, xmm6
		movaps		[edi-2*16], xmm1

		add			eax, 4*4

		shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )	// ( L1, L1, L1, R1 ) for next ch 2-5
		mulps		xmm0, xmm4
		addps		xmm0, [edi-1*16]
		addps		xmm4, xmm7
		movaps		[edi-1*16], xmm0

		jl			loop12

		// NOTE(review): no MMX instructions are used in this block, so this emms
		// looks unnecessary — presumably left over; confirm before removing
		emms
	}

#else

	// C reference: mix an interleaved stereo source into an interleaved
	// 6-channel mix buffer; left channels/center/LFE take L, rights take R
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0  = lastV[0];
	sL1  = lastV[1];
	sL2  = lastV[2];
	sL3  = lastV[3];
	sL4  = lastV[4];
	sL5  = lastV[5];

	sL6  = lastV[0] + incL0;
	sL7  = lastV[1] + incL1;
	sL8  = lastV[2] + incL2;
	sL9  = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;

		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;

		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;

		sL0  += incL0;
		sL1  += incL1;
		sL2  += incL2;
		sL3  += incL3;

		sL4  += incL4;
		sL5  += incL5;
		sL6  += incL0;
		sL7  += incL1;

		sL8  += incL2;
		sL9  += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}

#endif
}
18009 
18010 /*
18011 ============
18012 idSIMD_SSE::MixedSoundToSamples
18013 ============
18014 */
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1

	// The asm loop below consumes 16 floats per iteration, so numSamples must
	// be a multiple of 16; this assert presumes MIXBUFFER_SAMPLES guarantees
	// that -- TODO confirm MIXBUFFER_SAMPLES is a multiple of 16.
	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	// Convert numSamples floats to signed 16 bit samples with saturation
	// (the #else branch below is the portable reference behavior).
	// mixBuffer must be 16 byte aligned: movaps faults on unaligned loads.
	__asm {

		mov			eax, numSamples
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2				// eax = numSamples * sizeof( float )
		add			edi, eax			// edi = one past the end of mixBuffer
		neg			eax					// eax = negative byte offset, counts up towards zero

	loop16:		// each iteration: 16 floats in, 16 shorts out

		// load 16 floats, 4 per SSE register
		movaps		xmm0, [edi+eax+0*16]
		movaps		xmm2, [edi+eax+1*16]
		movaps		xmm4, [edi+eax+2*16]
		movaps		xmm6, [edi+eax+3*16]

		add			esi, 4*4*2			// advance output by 16 shorts now; stores below use negative offsets

		// copy the upper two floats of each source register into the low half
		// of a scratch register, since cvtps2pi only converts the low two
		movhlps		xmm1, xmm0
		movhlps		xmm3, xmm2
		movhlps		xmm5, xmm4
		movhlps		xmm7, xmm6

		prefetchnta	[edi+eax+64]

		// convert the low two floats of each register to two 32 bit integers
		// in an MMX register. NOTE(review): cvtps2pi rounds per the current
		// MXCSR rounding mode (round-to-nearest by default) while the C
		// fallback truncates -- results may differ by one; presumably
		// acceptable for audio output, confirm if bit-exactness matters.
		cvtps2pi	mm0, xmm0
		cvtps2pi	mm2, xmm2
		cvtps2pi	mm4, xmm4
		cvtps2pi	mm6, xmm6

		prefetchnta	[edi+eax+128]

		cvtps2pi	mm1, xmm1
		cvtps2pi	mm3, xmm3
		cvtps2pi	mm5, xmm5
		cvtps2pi	mm7, xmm7

		add			eax, 4*16			// sets the flags tested by jl below (packssdw/movq do not modify flags)

		// pack pairs of 2x32 bit ints into 4x16 bit with signed saturation,
		// clamping to [-32768, 32767] just like the C fallback
		packssdw	mm0, mm1
		packssdw	mm2, mm3
		packssdw	mm4, mm5
		packssdw	mm6, mm7

		// store the 16 output samples (esi already points past them)
		movq		[esi-4*4*2], mm0
		movq		[esi-3*4*2], mm2
		movq		[esi-2*4*2], mm4
		movq		[esi-1*4*2], mm6

		jl			loop16				// loop while the byte offset is still negative

		emms							// leave MMX state so x87 floating point works afterwards
	}

#else

	// portable reference implementation: clamp to the 16 bit range, then truncate
	for ( int i = 0; i < numSamples; i++ ) {
		if ( mixBuffer[i] <= -32768.0f ) {
			samples[i] = -32768;
		} else if ( mixBuffer[i] >= 32767.0f ) {
			samples[i] = 32767;
		} else {
			samples[i] = (short) mixBuffer[i];
		}
	}

#endif
}
18088 
18089 #endif /* _MSC_VER */
18090