1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #include <math.h>
30 #include <float.h>
31 
32 #include "sys/platform.h"
33 
34 #include "idlib/math/Simd_AltiVec.h"
35 
36 // Doom3 SIMD Library version 0.5
37 // Patrick Flanagan (pflanagan@apple.com)
38 // Sanjay Patel (spatel@apple.com)
39 // Architecture & Performance Group, Apple Computer
40 
41 
42 //===============================================================
43 //
44 //	AltiVec implementation of idSIMDProcessor
45 //
46 //===============================================================
47 
48 #if defined(MACOS_X) && defined(__GNUC__) && defined(__ALTIVEC__)
49 
50 #ifdef PPC_INTRINSICS
51 	// for square root estimate instruction
52 	#include <ppc_intrinsics.h>
53 #endif
54 
55 // Data struct sizes
56 
57 #ifndef DRAWVERT_PADDED
58 	// 60 bytes, 15 floats at 4 bytes each
59 	#define DRAWVERT_OFFSET 15
60 #else
61 	// 64 bytes, 16 floats
62 	#define DRAWVERT_OFFSET 16
63 #endif
64 // 16 bytes each, 4 floats
65 #define PLANE_OFFSET 4
66 // 16 bytes each, 4 floats
67 #define IDVEC4_OFFSET 4
68 
69 // Alignment tests
70 #define IS_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F ) == 0 )
71 #define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F) != 0 )
72 
73 // Aligned storing floats
74 #define ALIGNED_STORE2( ADDR, V0, V1 )			\
75 	vec_st( V0, 0, ADDR );						\
76 	vec_st( V1, 16, ADDR )
77 
78 #define ALIGNED_STORE3( ADDR, V0, V1, V2 )		\
79 	vec_st( V0, 0, ADDR );						\
80 	vec_st( V1, 16, ADDR );						\
81 	vec_st( V2, 32, ADDR )
82 
83 #define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 )  \
84 	vec_st( V0, 0, ADDR );						\
85 	vec_st( V1, 16, ADDR );						\
86 	vec_st( V2, 32, ADDR );						\
87 	vec_st( V3, 48, ADDR )
88 
89 #define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 )  \
90 	vec_st( V0, 0, ADDR );						\
91 	vec_st( V1, 16, ADDR );						\
92 	vec_st( V2, 32, ADDR );						\
93 	vec_st( V3, 48, ADDR );						\
94 	vec_st( V4, 64, ADDR );						\
95 	vec_st( V5, 80, ADDR )
96 
97 #define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 )  \
98 	vec_st( V0, 0, ADDR );						\
99 	vec_st( V1, 16, ADDR );						\
100 	vec_st( V2, 32, ADDR );						\
101 	vec_st( V3, 48, ADDR );						\
102 	vec_st( V4, 64, ADDR );						\
103 	vec_st( V5, 80, ADDR );						\
104 	vec_st( V6, 96, ADDR );						\
105 	vec_st( V7, 112, ADDR )
106 
107 // Unaligned storing floats. These assume that we can trash the input
108 #define UNALIGNED_STORE1( ADDR, V0 ) { \
109 	/* use store element */				\
110 	vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR );	\
111 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );						\
112 	vec_ste( V0, 0, ADDR );		\
113 	vec_ste( V0, 4, ADDR );		\
114 	vec_ste( V0, 8, ADDR );		\
115 	vec_ste( V0, 12, ADDR );	\
116 	}
117 
118 #define UNALIGNED_STORE2( ADDR, V0, V1 )	{		\
119 	/* load up the values that are there now */			\
120 	vector float ULStoreMacro1 = vec_ld( 0, ADDR );		\
121 	vector float ULStoreMacro2 = vec_ld( 31, ADDR );	\
122 	/* generate permute vector and mask	*/				\
123 	vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
124 	vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
125 	/* right rotate input data	*/   \
126 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );	\
127 	V1 = vec_perm( V1, V1, ULStoreMacroPerm );	\
128 	/* setup the output vectors		*/			\
129 	vector float ULStoreVal1, ULStoreVal2, ULStoreVal3;	\
130 	ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask );	\
131 	ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask );	\
132 	ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask );	\
133 	/* store results	*/					\
134 	vec_st( ULStoreVal1, 0, ADDR );			\
135 	vec_st( ULStoreVal2, 15, ADDR );		\
136 	vec_st( ULStoreVal3, 31, ADDR ); }
137 
138 #define UNALIGNED_STORE3( ADDR, V0, V1, V2 )	{		\
139 	/* load up the values that are there now */			\
140 	vector float ULStoreMacro1 = vec_ld( 0, ADDR );		\
141 	vector float ULStoreMacro2 = vec_ld( 47, ADDR );	\
142 	/* generate permute vector and mask	*/				\
143 	vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
144 	vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
145 	/* right rotate input data	*/   \
146 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );	\
147 	V1 = vec_perm( V1, V1, ULStoreMacroPerm );	\
148 	V2 = vec_perm( V2, V2, ULStoreMacroPerm );	\
149 	/* setup the output vectors		*/			\
150 	vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4;	\
151 	ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask );	\
152 	ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask );	\
153 	ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask );	\
154 	ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask );	\
155 	/* store results	*/					\
156 	vec_st( ULStoreVal1, 0, ADDR );			\
157 	vec_st( ULStoreVal2, 15, ADDR );		\
158 	vec_st( ULStoreVal3, 31, ADDR );		\
159 	vec_st( ULStoreVal4, 47, ADDR ); }
160 
161 #define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 )	{		\
162 	/* load up the values that are there now */			\
163 	vector float ULStoreMacro1 = vec_ld( 0, ADDR );		\
164 	vector float ULStoreMacro2 = vec_ld( 63, ADDR );	\
165 	/* generate permute vector and mask	*/				\
166 	vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
167 	vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
168 	/* right rotate input data	*/   \
169 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );	\
170 	V1 = vec_perm( V1, V1, ULStoreMacroPerm );	\
171 	V2 = vec_perm( V2, V2, ULStoreMacroPerm );	\
172 	V3 = vec_perm( V3, V3, ULStoreMacroPerm );	\
173 	/* setup the output vectors		*/			\
174 	vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5;	\
175 	ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask );	\
176 	ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask );	\
177 	ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask );	\
178 	ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask );	\
179 	ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask );	\
180 	/* store results	*/					\
181 	vec_st( ULStoreVal1, 0, ADDR );			\
182 	vec_st( ULStoreVal2, 15, ADDR );		\
183 	vec_st( ULStoreVal3, 31, ADDR );		\
184 	vec_st( ULStoreVal4, 47, ADDR );		\
185 	vec_st( ULStoreVal5, 63, ADDR );	}
186 
187 #define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 )	{		\
188 	/* load up the values that are there now */			\
189 	vector float ULStoreMacro1 = vec_ld( 0, ADDR );		\
190 	vector float ULStoreMacro2 = vec_ld( 95, ADDR );	\
191 	/* generate permute vector and mask	*/				\
192 	vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
193 	vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
194 	/* right rotate input data	*/   \
195 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );	\
196 	V1 = vec_perm( V1, V1, ULStoreMacroPerm );	\
197 	V2 = vec_perm( V2, V2, ULStoreMacroPerm );	\
198 	V3 = vec_perm( V3, V3, ULStoreMacroPerm );	\
199 	V4 = vec_perm( V4, V4, ULStoreMacroPerm );	\
200 	V5 = vec_perm( V5, V5, ULStoreMacroPerm );	\
201 	/* setup the output vectors		*/			\
202 	vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7;	\
203 	ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask );	\
204 	ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask );	\
205 	ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask );	\
206 	ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask );	\
207 	ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask );	\
208 	ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask );	\
209 	ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask );	\
210 	/* store results	*/					\
211 	vec_st( ULStoreVal1, 0, ADDR );			\
212 	vec_st( ULStoreVal2, 15, ADDR );		\
213 	vec_st( ULStoreVal3, 31, ADDR );		\
214 	vec_st( ULStoreVal4, 47, ADDR );		\
215 	vec_st( ULStoreVal5, 63, ADDR );		\
216 	vec_st( ULStoreVal6, 79, ADDR );		\
217 	vec_st( ULStoreVal7, 95, ADDR );	}
218 
219 #define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 )	{		\
220 	/* load up the values that are there now */			\
221 	vector float ULStoreMacro1 = vec_ld( 0, ADDR );		\
222 	vector float ULStoreMacro2 = vec_ld( 143, ADDR );	\
223 	/* generate permute vector and mask	*/				\
224 	vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
225 	vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
226 	/* right rotate input data	*/   \
227 	V0 = vec_perm( V0, V0, ULStoreMacroPerm );	\
228 	V1 = vec_perm( V1, V1, ULStoreMacroPerm );	\
229 	V2 = vec_perm( V2, V2, ULStoreMacroPerm );	\
230 	V3 = vec_perm( V3, V3, ULStoreMacroPerm );	\
231 	V4 = vec_perm( V4, V4, ULStoreMacroPerm );	\
232 	V5 = vec_perm( V5, V5, ULStoreMacroPerm );	\
233 	V6 = vec_perm( V6, V6, ULStoreMacroPerm );	\
234 	V7 = vec_perm( V7, V7, ULStoreMacroPerm );	\
235 	V8 = vec_perm( V8, V8, ULStoreMacroPerm );	\
236 	/* setup the output vectors		*/			\
237 	vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7;	\
238 	vector float ULStoreVal8, ULStoreVal9, ULStoreVal10;	\
239 	ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask );	\
240 	ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask );	\
241 	ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask );	\
242 	ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask );	\
243 	ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask );	\
244 	ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask );	\
245 	ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask );	\
246 	ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask );	\
247 	ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask );	\
248 	ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask );	\
249 	/* store results	*/					\
250 	vec_st( ULStoreVal1, 0, ADDR );			\
251 	vec_st( ULStoreVal2, 15, ADDR );		\
252 	vec_st( ULStoreVal3, 31, ADDR );		\
253 	vec_st( ULStoreVal4, 47, ADDR );		\
254 	vec_st( ULStoreVal5, 63, ADDR );		\
255 	vec_st( ULStoreVal6, 79, ADDR );		\
256 	vec_st( ULStoreVal7, 95, ADDR );		\
257 	vec_st( ULStoreVal8, 111, ADDR );		\
258 	vec_st( ULStoreVal9, 127, ADDR );		\
259 	vec_st( ULStoreVal10, 143, ADDR );	}
260 
261 /*
262 ============
263 idSIMD_AltiVec::GetName
264 ============
265 */
GetName(void) const266 const char *idSIMD_AltiVec::GetName( void ) const {
267 	return "AltiVec";
268 }
269 
270 /*
271 	Helper Functions
272 */
273 #if 0
274 // Prints the values of a vector, useful for debugging but
275 // should never be called in real code
276 inline void debugPrintVector( vector float v, char *msg ) {
277 	printf("%s -- %vf\n", msg, v );
278 }
279 
280 inline void debugPrintVector( vector unsigned int v, char *msg ) {
281 	printf("%s -- %vd\n", msg, v );
282 }
283 
284 inline void debugPrintVector( vector bool int v, char *msg ) {
285 	printf("%s -- %vi\n", msg, v );
286 }
287 
288 inline void debugPrintVector( vector unsigned char v, char *msg ) {
289 	printf("%s -- %vuc\n", msg, v );
290 }
291 
292 inline void debugPrintVector( vector unsigned short v, char *msg ) {
293 	printf("%s -- %vs\n", msg, v );
294 }
295 #endif
296 /*
297 ===============
298   Reciprocal
299 
300   For each element in vector:
301 	n = 1 / n
302 ===============
303 */
304 
305 // Use Newton-Raphson to calculate reciprocal of a vector
Reciprocal(vector float v)306 inline vector float Reciprocal( vector float v ) {
307   //Get the reciprocal estimate
308   vector float estimate = vec_re( v );
309   //One round of Newton-Raphson refinement
310   return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
311 }
312 
313 /*
314 ===============
315   ReciprocalSquareRoot
316 
317   For each element in vector:
318 	n = 1 / sqrt(n)
319 ===============
320 */
321 // Reciprocal square root estimate of a vector
ReciprocalSquareRoot(vector float v)322 inline vector float ReciprocalSquareRoot( vector float v ) {
323 	//Get the square root reciprocal estimate
324 	vector float zero = (vector float)(0);
325 	vector float oneHalf = (vector float)(0.5);
326 	vector float one = (vector float)(1.0);
327 	vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
328 
329 	//One round of Newton-Raphson refinement
330 	vector float estimateSquared = vec_madd( estimate, estimate, zero );
331 	vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
332 	return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
333 }
334 
335 
336 /*
337 ===============
338   Divide
339 
340   For each element in vectors:
341 	n = a / b
342 ===============
343 */
344 // Use reciprocal estimate and multiply to divide a vector
Divide(vector float a,vector float b)345 inline vector float Divide( vector float a, vector float b ) {
346    return vec_madd( a, Reciprocal( b ), (vector float)(0) );
347 }
348 
349 /*
350 ===============
351   loadSplatUnalignedScalar
352 
353   For each element in vector:
354 	n = s
355 ===============
356 */
loadSplatUnalignedScalar(const float * s)357 inline vector float loadSplatUnalignedScalar( const float *s ) {
358 	vector unsigned char splatMap = vec_lvsl( 0, s );
359 	vector float v = vec_ld( 0, s );
360 	splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
361 	return vec_perm( v, v, splatMap );
362 }
363 
364 /*
365 ===============
366   VectorATan16
367 
368   For each element in vector:
369 	n = idMath::ATan16( x, y )
370 ===============
371 */
372 // calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
VectorATan16(vector float x,vector float y)373 inline vector float VectorATan16( vector float x, vector float y ) {
374 
375 	vector float xDivY = Divide( x, y );
376 	vector float yDivX = Divide( y, x );
377 	vector float zeroVector = (vector float)(0);
378 
379 	vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
380 	vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
381 	vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
382 	vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
383 
384 	// do calculation for S
385 	vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
386 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
387 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
388 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
389 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
390 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
391 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
392 	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
393 
394 	// get the regular S value
395 	vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
396 
397 	// calculate what to return if y > x
398 	vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
399 	vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
400 	vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
401 
402 	return vec_sel( modRet, vecS, vecCmp );
403 }
404 
405 /*
406 ===============
407   VectorSin16
408 
409   For each element in vector:
410 	n = idMath::Sin16( v )
411 ===============
412 */
VectorSin16(vector float v)413 inline vector float VectorSin16( vector float v ) {
414 	vector float zero = (vector float)(0);
415 
416 #if 0
417 	// load up half PI and use it to calculate the rest of the values. This is
418 	// sometimes cheaper than loading them from memory
419 
420 	vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
421 	vector float PI = vec_add( halfPI, halfPI );
422 	vector float oneandhalfPI = vec_add( PI, halfPI );
423 	vector float twoPI = vec_add( oneandhalfPI, halfPI );
424 #else
425 	vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
426 	vector float PI = (vector float)(3.14159265358979323846f);
427 	vector float oneandhalfPI = (vector float)(3.14159265358979323846f + (  0.5f * 3.14159265358979323846f ) );
428 	vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
429 #endif
430 
431 	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
432 
433 	vector float vecMod;
434 	vector float vecResult;
435 
436 	// fix the range if needbe
437 	vecMod = vec_floor( Divide( v, twoPI ) );
438 	vecResult = vec_nmsub( vecMod, twoPI, v );
439 
440 	vector float vecPIminusA = vec_sub( PI, vecResult );
441 	vector float vecAminus2PI = vec_sub( vecResult, twoPI );
442 
443 	vecCmp1 = vec_cmplt( vecResult, PI );
444 	vecCmp2 = vec_cmpgt( vecResult, halfPI );
445 
446 	// these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
447 	vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
448 
449 	// we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
450 	vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor(  vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
451 
452 	// these are ones where a < PI and a > HALF_PI so we set a = PI - a
453 	vecCmp1 = vec_and( vecCmp1, vecCmp2 );
454 	vecCmp1 = vec_or( vecCmp1, vecCmp4 );
455 
456 	// put the correct values into place
457 	vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
458 	vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
459 
460 	// calculate answer
461 	vector float vecASquared = vec_madd( vecResult, vecResult, zero );
462 	vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
463 	vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
464 	vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
465 	vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
466 	vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
467 	return vec_madd( vecResult, vecEst, zero );
468 }
469 
470 /*
471 ===============
472   vecSplatWithRunTime
473 
474   For each element in vector:
475 	n = v(i)
476 ===============
477 */
478 // splats an element across a vector using a runtime variable
vecSplatWithRunTime(vector float v,int i)479 inline vector float vecSplatWithRunTime( vector float v, int i ) {
480 		vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
481 		v = vec_perm( v, v, rotate );
482 		return vec_splat( v, 0 );
483 }
484 
485 
486 /*
487 ===============
488   FastScalarInvSqrt
489 
490 	n = 1 / sqrt( f )
491 ===============
492 */
FastScalarInvSqrt(float f)493 inline float FastScalarInvSqrt( float f ) {
494 #ifdef PPC_INTRINSICS
495 	float estimate;
496 	const float kSmallestFloat = FLT_MIN;
497 
498 	//Calculate a 5 bit starting estimate for the reciprocal sqrt
499 	estimate = __frsqrte ( f + kSmallestFloat );
500 
501 	//if you require less precision, you may reduce the number of loop iterations.
502 	// This will do 2 rounds of NR
503 	estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
504 	estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
505 	return estimate;
506 #else
507 	return idMath::InvSqrt( f );
508 #endif
509 }
510 
511 /*
512 ===============
513   FastScalarInvSqrt_x3
514 
515 	arg1 = 1 / sqrt( arg1 )
516 	arg2 = 1 / sqrt( arg2 )
517 	arg3 = 1 / sqrt( arg3 )
518 ===============
519 */
FastScalarInvSqrt_x3(float * arg1,float * arg2,float * arg3)520 inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
521 #ifdef PPC_INTRINSICS
522 	register float estimate1, estimate2, estimate3;
523 	const float kSmallestFloat = FLT_MIN;
524 
525 	//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
526 	estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
527 	estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
528 	estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
529 
530 	// two rounds newton-raphson
531 	estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
532 	estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
533 	estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
534 	estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
535 	estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
536 	estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
537 
538 	*arg1 = estimate1;
539 	*arg2 = estimate2;
540 	*arg3 = estimate3;
541 #else
542 	*arg1 = idMath::InvSqrt( *arg1 );
543 	*arg2 = idMath::InvSqrt( *arg2 );
544 	*arg3 = idMath::InvSqrt( *arg3 );
545 #endif
546 }
547 
548 /*
549 ===============
550   FastScalarInvSqrt_x6
551 
552 	arg1 = 1 / sqrt( arg1 )
553 	arg2 = 1 / sqrt( arg2 )
554 	arg3 = 1 / sqrt( arg3 )
555 	arg4 = 1 / sqrt( arg4 )
556 	arg5 = 1 / sqrt( arg5 )
557 	arg6 = 1 / sqrt( arg6 )
558 
559 	On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
560 ===============
561 */
FastScalarInvSqrt_x6(float * arg1,float * arg2,float * arg3,float * arg4,float * arg5,float * arg6)562 inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
563 #ifdef PPC_INTRINSICS
564 	register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
565 	const float kSmallestFloat = FLT_MIN;
566 
567 	//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
568 	estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
569 	estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
570 	estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
571 	estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
572 	estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
573 	estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
574 
575 	// two rounds newton-raphson
576 	estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
577 	estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
578 	estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
579 	estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
580 	estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
581 	estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
582 
583 	estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
584 	estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
585 	estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
586 	estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
587 	estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
588 	estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
589 
590 	*arg1 = estimate1;
591 	*arg2 = estimate2;
592 	*arg3 = estimate3;
593 	*arg4 = estimate4;
594 	*arg5 = estimate5;
595 	*arg6 = estimate6;
596 #else
597 	*arg1 = idMath::InvSqrt( *arg1 );
598 	*arg2 = idMath::InvSqrt( *arg2 );
599 	*arg3 = idMath::InvSqrt( *arg3 );
600 	*arg4 = idMath::InvSqrt( *arg4 );
601 	*arg5 = idMath::InvSqrt( *arg5 );
602 	*arg6 = idMath::InvSqrt( *arg6 );
603 #endif
604 }
605 
606 
607 // End Helper Functions
608 
609 #ifdef ENABLE_SIMPLE_MATH
610 
611 /*
612 ============
613 idSIMD_AltiVec::Add
614 
615   dst[i] = constant + src[i];
616 ============
617 */
Add(float * dst,const float constant,const float * src,const int count)618 void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
619 	vector float v0, v1, v2, v3;
620 	vector float v0_low, v0_hi, v1_hi;
621 	vector unsigned char permVec;
622 	vector float constVec;
623 	int i;
624 
625 	// handle unaligned cases at beginning
626 	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
627 	   dst[i] = constant + src[i];
628 	}
629 
630 	//splat constant into a vector
631 	constVec = loadSplatUnalignedScalar( &constant );
632 
633 	//calculate permute and do first load
634 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
635 	v1_hi = vec_ld( 0, &src[i] );
636 
637 	//vectorize!
638 	for ( ; i+7 < count; i += 8 ) {
639 		//load source
640 		v0_low = v1_hi;
641 		v0_hi = vec_ld( 15, &src[i] );
642 		v1_hi = vec_ld( 31, &src[i] );
643 
644 		v0 = vec_perm( v0_low, v0_hi, permVec );
645 		v1 = vec_perm( v0_hi, v1_hi, permVec );
646 
647 		v2 = vec_add( v0, constVec );
648 		v3 = vec_add( v1, constVec );
649 
650 		// store results
651 		ALIGNED_STORE2( &dst[i], v2, v3 );
652 	}
653 
654 	//handle cleanup
655 	 for ( ; i < count ; i++ ) {
656 		dst[i] = constant + src[i];
657 	}
658 }
659 
660 /*
661 ============
662 idSIMD_AltiVec::Add
663 
664   dst[i] = src0[i] + src1[i];
665 ============
666 */
Add(float * dst,const float * src0,const float * src1,const int count)667 void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
668 
669 	register vector float v0, v1, v2, v3, v4, v5;
670 	//src0
671 	register vector float v0_low, v0_hi, v2_low, v2_hi;
672 	//src1
673 	register vector float v1_low, v1_hi, v3_low, v3_hi;
674 	//permute vectors
675 	register vector unsigned char permVec1, permVec2;
676 	vector unsigned char oneCharVector = (vector unsigned char)(1);
677 
678 	int i;
679 
680 	//unaligned at start
681 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
682 		dst[i] = src0[i] + src1[i];
683 	}
684 
685 	//calculate permute and do loads
686 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
687 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
688 	v2_hi = vec_ld( 0, &src0[i] );
689 	v3_hi = vec_ld( 0, &src1[i] );
690 
691 	//vectorize!
692 	for ( ; i+7 < count; i += 8 ) {
693 		//load source
694 		v0_low = v2_hi;
695 		v0_hi = vec_ld( 15, &src0[i] );
696 		v2_low = v0_hi;
697 		v2_hi = vec_ld( 31, &src0[i] );
698 
699 		v1_low = v3_hi;
700 		v1_hi = vec_ld( 15, &src1[i] );
701 		v3_low = v1_hi;
702 		v3_hi = vec_ld( 31, &src1[i] );
703 
704 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
705 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
706 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
707 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
708 
709 		v4 = vec_add( v0, v1 );
710 		v5 = vec_add( v2, v3 );
711 
712 		ALIGNED_STORE2( &dst[i], v4, v5 );
713 
714 	}
715 
716 	//handle cleanup
717 	 for ( ; i < count ; i++ ) {
718 		dst[i] = src0[i] + src1[i];
719 	}
720 }
721 
722 /*
723 ============
724 idSIMD_AltiVec::Sub
725 
726   dst[i] = constant - src[i];
727 ============
728 */
Sub(float * dst,const float constant,const float * src,const int count)729 void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
730 
731 	register vector float v0, v1, v2, v3;
732 	register vector float v0_low, v0_hi, v1_low, v1_hi;
733 	register vector unsigned char permVec;
734 	register vector float constVec;
735 	vector unsigned char oneCharVector = (vector unsigned char)(1);
736 	int i;
737 
738 	//handle unaligned at start
739 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
740 		dst[i] = constant - src[i];
741 	}
742 
743 	//splat constant into a vector
744 	constVec = loadSplatUnalignedScalar( &constant );
745 
746 	//calculate permute vector and do first load
747 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
748 	v1_hi = vec_ld( 0, &src[i] );
749 
750 	//vectorize!
751 	for ( ; i+7 < count; i += 8 ) {
752 		//load source
753 		v0_low = v1_hi;
754 		v0_hi = vec_ld( 15, &src[i] );
755 		v1_low = v0_hi;
756 		v1_hi = vec_ld( 31, &src[i] );
757 
758 		v0 = vec_perm( v0_low, v0_hi, permVec );
759 		v1 = vec_perm( v1_low, v1_hi, permVec );
760 
761 		v2 = vec_sub( constVec, v0 );
762 		v3 = vec_sub( constVec, v1 );
763 
764 		ALIGNED_STORE2( &dst[i], v2, v3 );
765 	}
766 
767 	//handle cleanup
768 	 for ( ; i < count ; i++ ) {
769 		dst[i] = constant - src[i];
770 	}
771 }
772 
773 /*
774 ============
775 idSIMD_AltiVec::Sub
776 
777   dst[i] = src0[i] - src1[i];
778 ============
779 */
Sub(float * dst,const float * src0,const float * src1,const int count)780 void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
781 	register vector float v0, v1, v2, v3, v4, v5;
782 	//src0
783 	register vector float v0_low, v0_hi, v2_low, v2_hi;
784 	//src1
785 	register vector float v1_low, v1_hi, v3_low, v3_hi;
786 	register vector unsigned char permVec1, permVec2;
787 	vector unsigned char oneCharVector = (vector unsigned char)(1);
788 	int i;
789 
790 	//handle unaligned at start
791 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
792 		dst[i] = src0[i] - src1[i];
793 	}
794 
795 	//calculate permute and do first loads
796 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
797 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
798 	v2_hi = vec_ld( 0, &src0[i] );
799 	v3_hi = vec_ld( 0, &src1[i] );
800 
801 	//vectorize!
802 	for ( ; i+7 < count; i += 8 ) {
803 		//load source
804 		v0_low = v2_hi;
805 		v0_hi = vec_ld( 15, &src0[i] );
806 		v2_low = v0_hi;
807 		v2_hi = vec_ld( 31, &src0[i] );
808 
809 		v1_low = v3_hi;
810 		v1_hi = vec_ld( 15, &src1[i] );
811 		v3_low = v1_hi;
812 		v3_hi = vec_ld( 31, &src1[i] );
813 
814 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
815 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
816 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
817 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
818 
819 		v4 = vec_sub( v0, v1 );
820 		v5 = vec_sub( v2, v3 );
821 
822 		ALIGNED_STORE2( &dst[i], v4, v5 );
823 	}
824 
825 	//handle cleanup
826 	 for ( ; i < count ; i++ ) {
827 		dst[i] = src0[i] - src1[i];
828 	}
829 }
830 
831 /*
832 ============
833 idSIMD_AltiVec::Mul
834 
835   dst[i] = constant * src[i];
836 ============
837 */
Mul(float * dst,const float constant,const float * src,const int count)838 void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
839 	register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
840 	register vector float constVec;
841 	register vector unsigned char permVec;
842 	vector unsigned char oneCharVector = (vector unsigned char)(1);
843 	register vector float zeroVector = (vector float)(0.0);
844 	int i;
845 
846 	// handle unaligned data at start
847 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
848 		dst[i] = constant * src[i];
849 	}
850 
851 	//splat constant into a vector
852 	constVec = loadSplatUnalignedScalar( &constant );
853 
854 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
855 	v1_hi = vec_ld( 0, &src[i] );
856 
857 	//vectorize!
858 	for ( ; i+7 < count; i += 8 ) {
859 		//load source
860 		v0_low = v1_hi;
861 		v0_hi = vec_ld( 15, &src[i] );
862 		v1_low = v0_hi;
863 		v1_hi = vec_ld( 31, &src[i] );
864 
865 		v0 = vec_perm( v0_low, v0_hi, permVec );
866 		v1 = vec_perm( v1_low, v1_hi, permVec );
867 
868 		v2 = vec_madd( constVec, v0, zeroVector );
869 		v3 = vec_madd( constVec, v1, zeroVector );
870 
871 		ALIGNED_STORE2( &dst[i], v2, v3 );
872 	}
873 
874 	//handle cleanup
875 	 for ( ; i < count ; i++ ) {
876 		dst[i] = constant * src[i];
877 	}
878 }
879 
880 /*
881 ============
882 idSIMD_AltiVec::Mul
883 
884   dst[i] = src0[i] * src1[i];
885 ============
886 */
Mul(float * dst,const float * src0,const float * src1,const int count)887 void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
888 	register vector float v0, v1, v2, v3, v4, v5;
889 	//src0
890 	register vector float v0_low, v0_hi, v2_low, v2_hi;
891 	//src1
892 	register vector float v1_low, v1_hi, v3_low, v3_hi;
893 	//permute vectors
894 	register vector unsigned char permVec1, permVec2;
895 	register vector float constVec = (vector float)(0.0);
896 	vector unsigned char oneCharVector = (vector unsigned char)(1);
897 	int i;
898 
899 	//handle unaligned at start
900 	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
901 		dst[i] = src0[i] * src1[i];
902 	}
903 
904 	//calculate permute and do loads
905 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
906 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
907 	v2_hi = vec_ld( 0, &src0[i] );
908 	v3_hi = vec_ld( 0, &src1[i] );
909 
910 	//vectorize!
911 	for ( ; i+7 < count; i += 8 ) {
912 		//load source
913 		v0_low = v2_hi;
914 		v0_hi = vec_ld( 15, &src0[i] );
915 		v2_low = v0_hi;
916 		v2_hi = vec_ld( 31, &src0[i] );
917 
918 		v1_low = v3_hi;
919 		v1_hi = vec_ld( 15, &src1[i] );
920 		v3_low = v1_hi;
921 		v3_hi = vec_ld( 31, &src1[i] );
922 
923 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
924 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
925 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
926 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
927 
928 		//no such thing as regular multiply so we do
929 		//multiply then add zero
930 		v4 = vec_madd( v0, v1, constVec );
931 		v5 = vec_madd( v2, v3, constVec );
932 
933 		ALIGNED_STORE2( &dst[i], v4, v5 );
934 	}
935 
936 	//handle cleanup
937 	 for ( ; i < count ; i++ ) {
938 		dst[i] = src0[i] * src1[i];
939 	}
940 }
941 
942 /*
943 ============
944 idSIMD_AltiVec::Div
945 
946   dst[i] = constant / divisor[i];
947 ============
948 */
Div(float * dst,const float constant,const float * divisor,const int count)949 void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
950 	register vector float v0, v1, v2, v3;
951 	register vector float v0_low, v0_hi, v1_low, v1_hi;
952 	register vector unsigned char permVec;
953 	register vector float constVec;
954 	vector unsigned char oneCharVector = (vector unsigned char)(1);
955 	int i;
956 
957 	//handle unaligned at start
958 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
959 		dst[i] = constant / divisor[i];
960 	}
961 
962 	//splat constant into a vector
963 	constVec = loadSplatUnalignedScalar( &constant );
964 
965 	//calculate permute and do first loads
966 	permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
967 	v1_hi = vec_ld( 0, &divisor[i] );
968 
969 	//vectorize!
970 	for ( ; i+7 < count; i += 8 ) {
971 		//load source
972 		v0_low = v1_hi;
973 		v0_hi = vec_ld( 15, &divisor[i] );
974 		v1_low = v0_hi;
975 		v1_hi = vec_ld( 31, &divisor[i] );
976 
977 		v0 = vec_perm( v0_low, v0_hi, permVec );
978 		v1 = vec_perm( v1_low, v1_hi, permVec );
979 
980 		v2 = Divide( constVec, v0 );
981 		v3 = Divide( constVec, v1 );
982 
983 		ALIGNED_STORE2( &dst[i], v2, v3 );
984 	}
985 
986 	//handle cleanup
987 	 for ( ; i < count ; i++ ) {
988 		dst[i] = constant / divisor[i];
989 	}
990 }
991 
992 /*
993 ============
994 idSIMD_AltiVec::Div
995 
996   dst[i] = src0[i] / src1[i];
997 ============
998 */
Div(float * dst,const float * src0,const float * src1,const int count)999 void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
1000 	register vector float v0, v1, v2, v3, v4, v5;
1001 	 //src0
1002 	register vector float v0_low, v0_hi, v2_low, v2_hi;
1003 	//src1
1004 	register vector float v1_low, v1_hi, v3_low, v3_hi;
1005 	//permute vectors
1006 	register vector unsigned char permVec1, permVec2;
1007 	vector unsigned char oneCharVector = (vector unsigned char)(1);
1008 	int i;
1009 
1010 	//handle unaligned at start
1011 	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1012 		dst[i] = src0[i] / src1[i];
1013 	}
1014 
1015 	//calculate permute and do loads
1016 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1017 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1018 	v2_hi = vec_ld( 0, &src0[i] );
1019 	v3_hi = vec_ld( 0, &src1[i] );
1020 
1021 	//vectorize!
1022 	for ( ; i+7 < count; i += 8 ) {
1023 		//load source
1024 		v0_low = v2_hi;
1025 		v0_hi = vec_ld( 15, &src0[i] );
1026 		v2_low = v0_hi;
1027 		v2_hi = vec_ld( 31, &src0[i] );
1028 
1029 		v1_low = v3_hi;
1030 		v1_hi = vec_ld( 15, &src1[i] );
1031 		v3_low = v1_hi;
1032 		v3_hi = vec_ld( 31, &src1[i] );
1033 
1034 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
1035 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
1036 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
1037 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
1038 
1039 		v4 = Divide( v0, v1 );
1040 		v5 = Divide( v2, v3 );
1041 
1042 		ALIGNED_STORE2( &dst[i], v4, v5 );
1043 	}
1044 
1045 	//handle cleanup
1046 	 for ( ; i < count ; i++ ) {
1047 		dst[i] = src0[i] / src1[i];
1048 	}
1049 }
1050 
1051 /*
1052 ============
1053 idSIMD_AltiVec::MulAdd
1054 
1055   dst[i] += constant * src[i];
1056 ============
1057 */
MulAdd(float * dst,const float constant,const float * src,const int count)1058 void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
1059 
1060 	register vector float v0, v1, v2, v3, v4, v5;
1061 	register vector float constVec;
1062 	 //src
1063 	register vector float v0_low, v0_hi, v2_low, v2_hi;
1064 	//permute vectors
1065 	register vector unsigned char permVec1;
1066 	vector unsigned char oneCharVector = (vector unsigned char)(1);
1067 	int i;
1068 
1069 	//handle unaligned at start
1070 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1071 		dst[i] += constant * src[i];
1072 	}
1073 
1074 	//splat constant into a vector
1075 	constVec = loadSplatUnalignedScalar( &constant );
1076 
1077 	//calculate permute and do loads
1078 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1079 	v2_hi = vec_ld( 0, &src[i] );
1080 
1081 	//vectorize!
1082 	for ( ; i+7 < count; i += 8 ) {
1083 		v0_low = v2_hi;
1084 		v0_hi = vec_ld( 15, &src[i] );
1085 		v2_low = v0_hi;
1086 		v2_hi = vec_ld( 31, &src[i] );
1087 
1088 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
1089 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
1090 
1091 		// at this point, dst is known to be aligned
1092 		v1 = vec_ld( 0, &dst[i] );
1093 		v3 = vec_ld( 16, &dst[i] );
1094 
1095 		v4 = vec_madd( constVec, v0, v1 );
1096 		v5 = vec_madd( constVec, v2, v3 );
1097 
1098 		ALIGNED_STORE2( &dst[i], v4, v5 );
1099 	}
1100 
1101 	//handle cleanup
1102 	 for ( ; i < count ; i++ ) {
1103 		dst[i] += constant * src[i];
1104 	}
1105 }
1106 
1107 /*
1108 ============
1109 idSIMD_AltiVec::MulAdd
1110 
1111   dst[i] += src0[i] * src1[i];
1112 ============
1113 */
MulAdd(float * dst,const float * src0,const float * src1,const int count)1114 void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
1115 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1116 	//src0
1117 	register vector float v0_low, v0_hi, v2_low, v2_hi;
1118 	//src1
1119 	register vector float v1_low, v1_hi, v3_low, v3_hi;
1120 	//permute vectors
1121 	register vector unsigned char permVec1, permVec2;
1122 	vector unsigned char oneCharVector = (vector unsigned char)(1);
1123 
1124 	int i;
1125 
1126 	//unaligned at start
1127 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1128 		dst[i] += src0[i] * src1[i];
1129 	}
1130 
1131 	//calculate permute and do loads
1132 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1133 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1134 	v2_hi = vec_ld( 0, &src0[i] );
1135 	v3_hi = vec_ld( 0, &src1[i] );
1136 
1137 	//vectorize!
1138 	for ( ; i+7 < count; i += 8 ) {
1139 		// load sources
1140 		v0_low = v2_hi;
1141 		v0_hi = vec_ld( 15, &src0[i] );
1142 		v2_low = v0_hi;
1143 		v2_hi = vec_ld( 31, &src0[i] );
1144 
1145 		v1_low = v3_hi;
1146 		v1_hi = vec_ld( 15, &src1[i] );
1147 		v3_low = v1_hi;
1148 		v3_hi = vec_ld( 31, &src1[i] );
1149 
1150 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
1151 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
1152 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
1153 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
1154 
1155 		//we know dst is aligned because we handled unaligned cases
1156 		//up front
1157 		v4 = vec_ld( 0, &dst[i] );
1158 		v5 = vec_ld( 16, &dst[i] );
1159 
1160 		v6 = vec_madd( v0, v1, v4 );
1161 		v7 = vec_madd( v2, v3, v5 );
1162 
1163 		ALIGNED_STORE2( &dst[i], v6, v7 );
1164 	}
1165 
1166 	//handle cleanup
1167 	 for ( ; i < count ; i++ ) {
1168 		dst[i] += src0[i] * src1[i];
1169 	}
1170 }
1171 
1172 /*
1173 ============
1174 idSIMD_AltiVec::MulSub
1175 
1176   dst[i] -= constant * src[i];
1177 ============
1178 */
MulSub(float * dst,const float constant,const float * src,const int count)1179 void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
1180 	register vector float v0, v1, v2, v3, v4, v5;
1181 	register vector float constVec;
1182 	 //src
1183 	register vector float v0_low, v0_hi, v2_low, v2_hi;
1184 	//permute vectors
1185 	register vector unsigned char permVec1;
1186 	vector unsigned char oneCharVector = (vector unsigned char)(1);
1187 	int i;
1188 
1189 	//handle unaligned at start
1190 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1191 		dst[i] -= constant * src[i];
1192 	}
1193 
1194 	//splat constant into a vector
1195 	constVec = loadSplatUnalignedScalar( &constant );
1196 
1197 	//calculate permute and do loads
1198 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1199 	v2_hi = vec_ld( 0, &src[i] );
1200 
1201 	//vectorize!
1202 	for ( ; i+7 < count; i += 8 ) {
1203 		v0_low = v2_hi;
1204 		v0_hi = vec_ld( 15, &src[i] );
1205 		v2_low = v0_hi;
1206 		v2_hi = vec_ld( 31, &src[i] );
1207 
1208 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
1209 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
1210 
1211 		//we know dst will be aligned here because we already handled the preceeding
1212 		//unaligned cases
1213 		v1 = vec_ld( 0, &dst[i] );
1214 		v3 = vec_ld( 16, &dst[i] );
1215 
1216 		v4 = vec_nmsub( v0, constVec, v1 );
1217 		v5 = vec_nmsub( v2, constVec, v3 );
1218 
1219 		ALIGNED_STORE2( &dst[i], v4, v5 );
1220 	}
1221 
1222 	//handle cleanup
1223 	 for ( ; i < count ; i++ ) {
1224 		dst[i] -= constant * src[i];
1225 	}
1226 }
1227 
1228 /*
1229 ============
1230 idSIMD_AltiVec::MulSub
1231 
1232   dst[i] -= src0[i] * src1[i];
1233 ============
1234 */
MulSub(float * dst,const float * src0,const float * src1,const int count)1235 void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
1236 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1237 	//src0
1238 	register vector float v0_low, v0_hi, v2_low, v2_hi;
1239 	//src1
1240 	register vector float v1_low, v1_hi, v3_low, v3_hi;
1241 	//permute vectors
1242 	register vector unsigned char permVec1, permVec2;
1243 	vector unsigned char oneCharVector = (vector unsigned char)(1);
1244 	int i;
1245 
1246 	//unaligned at start
1247 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1248 		dst[i] -= src0[i] * src1[i];
1249 	}
1250 
1251 	//calculate permute and do loads
1252 	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1253 	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1254 	v2_hi = vec_ld( 0, &src0[i] );
1255 	v3_hi = vec_ld( 0, &src1[i] );
1256 
1257 
1258 	//vectorize!
1259 	for ( ; i+7 < count; i += 8 ) {
1260 		// load sources
1261 		v0_low = v2_hi;
1262 		v0_hi = vec_ld( 15, &src0[i] );
1263 		v2_low = v0_hi;
1264 		v2_hi = vec_ld( 31, &src0[i] );
1265 
1266 		v1_low = v3_hi;
1267 		v1_hi = vec_ld( 15, &src1[i] );
1268 		v3_low = v1_hi;
1269 		v3_hi = vec_ld( 31, &src1[i] );
1270 
1271 		v0 = vec_perm( v0_low, v0_hi, permVec1 );
1272 		v1 = vec_perm( v1_low, v1_hi, permVec2 );
1273 		v2 = vec_perm( v2_low, v2_hi, permVec1 );
1274 		v3 = vec_perm( v3_low, v3_hi, permVec2 );
1275 
1276 		//we know dst is aligned because we handled unaligned cases
1277 		//up front
1278 		v4 = vec_ld( 0, &dst[i] );
1279 		v5 = vec_ld( 16, &dst[i] );
1280 
1281 		v6 = vec_nmsub( v0, v1, v4 );
1282 		v7 = vec_nmsub( v2, v3, v5 );
1283 
1284 		ALIGNED_STORE2( &dst[i], v6, v7 );
1285 	}
1286 
1287 	//handle cleanup
1288 	 for ( ; i < count ; i++ ) {
1289 		dst[i] -= src0[i] * src1[i];
1290 	}
1291 }
1292 
1293 #endif /* ENABLE_SIMPLE_MATH */
1294 
1295 #ifdef ENABLE_DOT
1296 /*
1297 ============
1298 idSIMD_AltiVec::Dot
1299 
1300   dst[i] = constant * src[i];
1301 ============
1302 */
Dot(float * dst,const idVec3 & constant,const idVec3 * src,const int count)1303 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
1304 
1305 		register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1306 		register vector float vecX, vecY, vecZ;
1307 		vector float vecX2, vecY2, vecZ2;
1308 		const float *addr = src[0].ToFloatPtr();
1309 		float tempVal[4];
1310 		float constVal[4];
1311 		register vector float zeroVector = (vector float)(0.0);
1312 		register vector float vecConstX, vecConstY, vecConstZ;
1313 
1314 		// permute vectors
1315 		register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1316 		register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1317 
1318 		register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1319 		register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1320 
1321 		register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1322 		register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
1323 
1324 		int i;
1325 
1326 		// for scalar cleanup, if necessary
1327 		constVal[0] = constant[0];
1328 		constVal[1] = constant[1];
1329 		constVal[2] = constant[2];
1330 		constVal[3] = 0;
1331 
1332 		vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1333 		vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1334 		vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
1335 		vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1336 
1337 
1338 		// populate const vectors
1339 		vecConstX = vec_splat( vecLd1, 0 );
1340 		vecConstY = vec_splat( vecLd1, 1 );
1341 		vecConstZ = vec_splat( vecLd1, 2 );
1342 
1343 		vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1344 		vector float vecOld = vec_ld( 0, addr );
1345 
1346 		// handle unaligned case at beginning
1347 		for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1348 			dst[i] = constant * src[i];
1349 		}
1350 
1351 		for ( ; i + 7 < count; i += 8 ) {
1352 			float *vecPtr = (float*)( addr + (i*3) );
1353 			vector float v0, v1, v2, v3, v4, v5;
1354 
1355 			v0 = vecOld; //vec_ld( 0, vecPtr );
1356 			v1 = vec_ld( 15, vecPtr );
1357 			v2 = vec_ld( 31, vecPtr );
1358 			v3 = vec_ld( 47, vecPtr );
1359 			v4 = vec_ld( 63, vecPtr );
1360 			v5 = vec_ld( 79, vecPtr );
1361 			vecOld = vec_ld( 95, vecPtr );
1362 
1363 			vecLd1 = vec_perm( v0, v1, permVec );
1364 			vecLd2 = vec_perm( v1, v2, permVec );
1365 			vecLd3 = vec_perm( v2, v3, permVec );
1366 
1367 			vecLd4 = vec_perm( v3, v4, permVec );
1368 			vecLd5 = vec_perm( v4, v5, permVec );
1369 			vecLd6 = vec_perm( v5, vecOld, permVec );
1370 
1371 			// permute into X Y Z vectors
1372 			vecX = vec_perm( vecLd1, vecLd2, permX1 );
1373 			vecY = vec_perm( vecLd1, vecLd2, permY1 );
1374 			vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1375 			vecX = vec_perm( vecX, vecLd3, permX2 );
1376 			vecY = vec_perm( vecY, vecLd3, permY2 );
1377 			vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1378 
1379 			vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1380 			vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1381 			vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1382 			vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1383 			vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1384 			vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1385 
1386 			// do multiply
1387 			vecX = vec_madd( vecX, vecConstX, zeroVector );
1388 			vecY = vec_madd( vecY, vecConstY, vecX );
1389 			vecZ = vec_madd( vecZ, vecConstZ, vecY );
1390 
1391 			vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1392 			vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1393 			vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1394 
1395 			// store out results
1396 			ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1397 		}
1398 
1399 		//cleanup
1400 		for ( ; i < count; i++ ) {
1401 			// look up whats at the address we want, cast it as float pointer, then
1402 			// dereference that pointer
1403 			tempVal[0] =  *( addr + (i*3) + 0 );
1404 			tempVal[1] = *( addr + (i*3) + 1 );
1405 			tempVal[2] = *( addr + (i*3) + 2 );
1406 			dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
1407 	}
1408 }
1409 
1410 
1411 /*
1412 ============
1413 idSIMD_AltiVec::Dot
1414 
1415   dst[i] = constant * src[i].Normal() + src[i][3];
1416 ============
1417 */
Dot(float * dst,const idVec3 & constant,const idPlane * src,const int count)1418 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
1419 //#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
1420 
1421 	assert( sizeof(idPlane)  == PLANE_OFFSET * sizeof(float) );
1422 
1423 	int i;
1424 	float constVal[4];
1425 	float srcVal[3];
1426 	float srcI3;
1427 	float tempVal;
1428 
1429 	vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1430 	vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1431 	vector float vecX, vecY, vecZ, vecI3;
1432 	vector float vecX2, vecY2, vecZ2, vecI32;
1433 	vector float vecConstX, vecConstY, vecConstZ;
1434 
1435 	constVal[0] = constant[0];
1436 	constVal[1] = constant[1];
1437 	constVal[2] = constant[2];
1438 	constVal[3] = 1;
1439 
1440 	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1441 	vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1442 	vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
1443 	vector float vecConst = vec_perm( v0, v1, constPerm );
1444 
1445 	vecConstX = vec_splat( vecConst, 0 );
1446 	vecConstY = vec_splat( vecConst, 1 );
1447 	vecConstZ = vec_splat( vecConst, 2 );
1448 
1449 	// handle unaligned case at beginning
1450 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1451 		dst[i] = constant * src[i].Normal() + src[i][3];
1452 	}
1453 
1454 	const float *addr = src[i].ToFloatPtr();
1455 	vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1456 	vector float vecOld = vec_ld( 0, addr );
1457 
1458 	for ( ; i + 7 < count; i += 8 ) {
1459 			float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
1460 			vector float v0, v1, v2, v3, v4, v5, v6, v7;
1461 
1462 			v0 = vecOld; //vec_ld( 0, planePtr );
1463 			v1 = vec_ld( 15, planePtr );
1464 			v2 = vec_ld( 31, planePtr );
1465 			v3 = vec_ld( 47, planePtr );
1466 			v4 = vec_ld( 63, planePtr );
1467 			v5 = vec_ld( 79, planePtr );
1468 			v6 = vec_ld( 95, planePtr );
1469 			v7 = vec_ld( 111, planePtr );
1470 			vecOld = vec_ld( 127, planePtr );
1471 
1472 			vecPlaneLd1 = vec_perm( v0, v1, permVec );
1473 			vecPlaneLd2 = vec_perm( v1, v2, permVec );
1474 			vecPlaneLd3 = vec_perm( v2, v3, permVec );
1475 			vecPlaneLd4 = vec_perm( v3, v4, permVec );
1476 
1477 			vecPlaneLd5 = vec_perm( v4, v5, permVec );
1478 			vecPlaneLd6 = vec_perm( v5, v6, permVec );
1479 			vecPlaneLd7 = vec_perm( v6, v7, permVec );
1480 			vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1481 
1482 			// permute into X Y Z vectors, since this is square its basically
1483 			// a matrix transpose
1484 			v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1485 			v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1486 			v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1487 			v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1488 
1489 			vecX = vec_mergeh( v0, v1 );
1490 			vecY = vec_mergel( v0, v1 );
1491 			vecZ = vec_mergeh( v2, v3 );
1492 			vecI3 = vec_mergel( v2, v3 );
1493 
1494 			v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1495 			v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1496 			v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1497 			v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1498 
1499 			vecX2 = vec_mergeh( v4, v5 );
1500 			vecY2 = vec_mergel( v4, v5 );
1501 			vecZ2 = vec_mergeh( v6, v7 );
1502 			vecI32 = vec_mergel( v6, v7 );
1503 
1504 			// do calculation
1505 			v6 = vec_madd( vecZ, vecConstZ, vecI3 );
1506 			v5 = vec_madd( vecY, vecConstY, v6 );
1507 			v4 = vec_madd( vecX, vecConstX, v5 );
1508 
1509 			v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
1510 			v1 = vec_madd( vecY2, vecConstY, v0 );
1511 			v2 = vec_madd( vecX2, vecConstX, v1 );
1512 
1513 			// store results
1514 			ALIGNED_STORE2( &dst[i], v4, v2 );
1515 	}
1516 
1517 	// cleanup
1518 	for ( ; i < count; i++ ) {
1519 		// populate srcVal with src X Y Z
1520 		srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
1521 		srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
1522 		srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
1523 
1524 		// put src[i][3] into srcI3
1525 		srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
1526 
1527 		tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
1528 		dst[i] = tempVal + srcI3;
1529 	}
1530 }
1531 
1532 #ifndef DRAWVERT_PADDED
1533 /*
1534 ============
1535 idSIMD_AltiVec::Dot
1536 
1537   dst[i] = constant * src[i].xyz;
1538 ============
1539 */
Dot(float * dst,const idVec3 & constant,const idDrawVert * src,const int count)1540 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1541 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1542 
1543 		// idDrawVert size is 60 bytes
1544 		assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1545 
1546 		register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1547 		int i;
1548 		register vector float vecConstX, vecConstY, vecConstZ;
1549 		register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1550 		register vector float zeroVector = (vector float)(0.0);
1551 		vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1552 
1553 		vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1554 		v0 = vec_ld( 0, constant.ToFloatPtr() );
1555 		v1 = vec_ld( 11, constant.ToFloatPtr() );
1556 		v0 = vec_perm( v0, v1, constPerm );
1557 
1558 		// permute into constant vectors
1559 		vecConstX = vec_splat( v0, 0 );
1560 		vecConstY = vec_splat( v0, 1 );
1561 		vecConstZ = vec_splat( v0, 2 );
1562 
1563 		// handle unaligned case at beginning
1564 		for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1565 			dst[i] = constant * src[i].xyz;
1566 		}
1567 
1568 		// every fourth one will have the same alignment. Make sure we've got enough here
1569 		if ( i+3 < count ) {
1570 			vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1571 			vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1572 			vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1573 			vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1574 		}
1575 
1576 		for ( ; i+3 < count; i += 4 ) {
1577 			const float *vertPtr = src[i].xyz.ToFloatPtr();
1578 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1579 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1580 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1581 
1582 			v0 = vec_ld( 0, vertPtr );
1583 			v1 = vec_ld( 11, vertPtr );
1584 			v2 = vec_ld( 0, vertPtr2 );
1585 			v3 = vec_ld( 11, vertPtr2 );
1586 			v4 = vec_ld( 0, vertPtr3 );
1587 			v5 = vec_ld( 11, vertPtr3 );
1588 			v6 = vec_ld( 0, vertPtr4 );
1589 			v7 = vec_ld( 11, vertPtr4 );
1590 
1591 			v0 = vec_perm( v0, v1, vertPerm1 );
1592 			v2 = vec_perm( v2, v3, vertPerm2 );
1593 			v4 = vec_perm( v4, v5, vertPerm3 );
1594 			v6 = vec_perm( v6, v7, vertPerm4 );
1595 
1596 			// transpose into X Y Z vectors
1597 			v1 = vec_mergeh( v0, v4 );
1598 			v3 = vec_mergeh( v2, v6 );
1599 			v5 = vec_mergel( v0, v4 );
1600 			v7 = vec_mergel( v2, v6 );
1601 
1602 			vecSrcX1 = vec_mergeh( v1, v3 );
1603 			vecSrcY1 = vec_mergel( v1, v3 );
1604 			vecSrcZ1 = vec_mergeh( v5, v7 );
1605 
1606 			// now calculate dot product
1607 			vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1608 			vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1609 			vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
1610 
1611 			// store results
1612 			vec_st( vecSrcZ1, 0, &dst[i] );
1613 		}
1614 
1615 		for ( ; i < count; i++ ) {
1616 			dst[i] = constant * src[i].xyz;
1617 		}
1618 }
1619 #else
1620 /*
1621 ============
1622 idSIMD_AltiVec::Dot
1623 
1624   dst[i] = constant * src[i].xyz;
1625 ============
1626 */
Dot(float * dst,const idVec3 & constant,const idDrawVert * src,const int count)1627 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1628 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1629 
1630 		// idDrawVert size is 64 bytes
1631 		assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1632 
1633 		register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1634 		int i;
1635 		register vector float vecConstX, vecConstY, vecConstZ;
1636 		register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1637 		register vector float zeroVector = (vector float)(0.0);
1638 		vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1639 
1640 		vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1641 		v0 = vec_ld( 0, constant.ToFloatPtr() );
1642 		v1 = vec_ld( 11, constant.ToFloatPtr() );
1643 		v0 = vec_perm( v0, v1, constPerm );
1644 
1645 		// permute into constant vectors
1646 		vecConstX = vec_splat( v0, 0 );
1647 		vecConstY = vec_splat( v0, 1 );
1648 		vecConstZ = vec_splat( v0, 2 );
1649 
1650 		// handle unaligned case at beginning
1651 		for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1652 			dst[i] = constant * src[i].xyz;
1653 		}
1654 
1655 		for ( ; i+3 < count; i += 4 ) {
1656 			const float *vertPtr = src[i].xyz.ToFloatPtr();
1657 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1658 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1659 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1660 
1661 			v0 = vec_ld( 0, vertPtr );
1662 			v2 = vec_ld( 0, vertPtr2 );
1663 			v4 = vec_ld( 0, vertPtr3 );
1664 			v6 = vec_ld( 0, vertPtr4 );
1665 
1666 			// transpose into X Y Z vectors
1667 			v1 = vec_mergeh( v0, v4 );
1668 			v3 = vec_mergeh( v2, v6 );
1669 			v5 = vec_mergel( v0, v4 );
1670 			v7 = vec_mergel( v2, v6 );
1671 
1672 			vecSrcX1 = vec_mergeh( v1, v3 );
1673 			vecSrcY1 = vec_mergel( v1, v3 );
1674 			vecSrcZ1 = vec_mergeh( v5, v7 );
1675 
1676 			// now calculate dot product
1677 			vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1678 			vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1679 			vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
1680 
1681 			// store results
1682 			vec_st( vecSrcZ1, 0, &dst[i] );
1683 		}
1684 
1685 		for ( ; i < count; i++ ) {
1686 			dst[i] = constant * src[i].xyz;
1687 		}
1688 }
1689 
1690 #endif /* DRAWVERT_PADDED */
1691 
1692 /*
1693 ============
1694 idSIMD_AltiVec::Dot
1695 
1696   dst[i] = constant.Normal() * src[i] + constant[3];
1697 ============
1698 */
Dot(float * dst,const idPlane & constant,const idVec3 * src,const int count)1699 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
1700 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
1701 
1702 		register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1703 		register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
1704 		register vector float zeroVector = (vector float)(0.0);
1705 		register vector float vecConstX, vecConstY, vecConstZ;
1706 		register vector float vecConst3;
1707 
1708 		idVec3 constNormal = constant.Normal();
1709 		float const3 = constant[3];
1710 
1711 		// permute vectors
1712 		register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1713 		register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1714 
1715 		register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1716 		register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1717 
1718 		register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1719 		register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
1720 
1721 		int i;
1722 
1723 		vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1724 		vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1725 		vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
1726 		vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1727 
1728 		// populate const vec
1729 		vecConstX = vec_splat( vecLd1, 0 );
1730 		vecConstY = vec_splat( vecLd1, 1 );
1731 		vecConstZ = vec_splat( vecLd1, 2 );
1732 
1733 		// put constant to add in vector
1734 		vecConst3 = loadSplatUnalignedScalar( &const3 );
1735 
1736 		// handle unaligned case at beginning
1737 		for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1738 			dst[i] = constant.Normal() * src[i] + constant[3];
1739 		}
1740 
1741 		const float *addr = src[i].ToFloatPtr();
1742 		vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1743 		vector float vecOld = vec_ld( 0, addr );
1744 
1745 		for ( ; i+7 < count; i += 8 ) {
1746 			float *vecPtr = (float*)( addr + (i*3) );
1747 			vector float v0, v1, v2, v3, v4, v5;
1748 
1749 			v0 = vecOld; //vec_ld( 0, vecPtr );
1750 			v1 = vec_ld( 15, vecPtr );
1751 			v2 = vec_ld( 31, vecPtr );
1752 			v3 = vec_ld( 47, vecPtr );
1753 			v4 = vec_ld( 63, vecPtr );
1754 			v5 = vec_ld( 79, vecPtr );
1755 			vecOld = vec_ld( 95, vecPtr );
1756 
1757 			vecLd1 = vec_perm( v0, v1, permVec );
1758 			vecLd2 = vec_perm( v1, v2, permVec );
1759 			vecLd3 = vec_perm( v2, v3, permVec );
1760 
1761 			vecLd4 = vec_perm( v3, v4, permVec );
1762 			vecLd5 = vec_perm( v4, v5, permVec );
1763 			vecLd6 = vec_perm( v5, vecOld, permVec );
1764 
1765 			// permute into X Y Z vectors
1766 			vecX = vec_perm( vecLd1, vecLd2, permX1 );
1767 			vecY = vec_perm( vecLd1, vecLd2, permY1 );
1768 			vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1769 			vecX = vec_perm( vecX, vecLd3, permX2 );
1770 			vecY = vec_perm( vecY, vecLd3, permY2 );
1771 			vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1772 
1773 			vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1774 			vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1775 			vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1776 			vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1777 			vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1778 			vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1779 
1780 			// calculate dot product
1781 			vecX = vec_madd( vecX, vecConstX, zeroVector );
1782 			vecY = vec_madd( vecY, vecConstY, vecX );
1783 			vecZ = vec_madd( vecZ, vecConstZ, vecY );
1784 
1785 			vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1786 			vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1787 			vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1788 
1789 			// add in constant[3]
1790 			vecZ = vec_add( vecZ, vecConst3 );
1791 			vecZ2 = vec_add( vecZ2, vecConst3 );
1792 
1793 			// store out results
1794 			ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1795 		}
1796 
1797 		//cleanup
1798 		for ( ; i < count; i++ ) {
1799 			dst[i] = constNormal * src[i] + const3;
1800 		}
1801 }
1802 
1803 /*
1804 ============
1805 idSIMD_AltiVec::Dot
1806 
1807   dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1808 ============
1809 */
Dot(float * dst,const idPlane & constant,const idPlane * src,const int count)1810 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
1811 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
1812 
1813 	// check plane size
1814 	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1815 
1816 	float constVal[4];
1817 	float srcVal[4];
1818 
1819 	int i;
1820 	const float *constPtr = constant.ToFloatPtr();
1821 
1822 	register vector float vecX, vecY, vecZ, vecI3;
1823 	register vector float vecX2, vecY2, vecZ2, vecI32;
1824 
1825 	vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1826 	vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1827 	register vector float zeroVector = (vector float)(0.0);
1828 	register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1829 
1830 	constVal[0] = *(constPtr);
1831 	constVal[1] = *(constPtr+1);
1832 	constVal[2] = *(constPtr+2);
1833 	constVal[3] = *(constPtr+3);
1834 
1835 	// populate const vector
1836 	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1837 	vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1838 	vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
1839 	vector float vecConst = vec_perm( v0, v1, constPerm );
1840 
1841 	vecConstX = vec_splat( vecConst, 0 );
1842 	vecConstY = vec_splat( vecConst, 1 );
1843 	vecConstZ = vec_splat( vecConst, 2 );
1844 	vecConstI3 = vec_splat( vecConst, 3 );
1845 
1846 	// handle unaligned case at beginning
1847 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1848 		dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1849 	}
1850 
1851 	const float *srcPtr = src[i].ToFloatPtr();
1852 	vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
1853 	vector float vecOld = vec_ld( 0, srcPtr );
1854 
1855 	for ( ; i+7 < count; i += 8 ) {
1856 		float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
1857 		vector float v0, v1, v2, v3, v4, v5, v6, v7;
1858 
1859 		v0 = vecOld; // vec_ld( 0, planePtr );
1860 		v1 = vec_ld( 15, planePtr );
1861 		v2 = vec_ld( 31, planePtr );
1862 		v3 = vec_ld( 47, planePtr );
1863 		v4 = vec_ld( 63, planePtr );
1864 		v5 = vec_ld( 79, planePtr );
1865 		v6 = vec_ld( 95, planePtr );
1866 		v7 = vec_ld( 111, planePtr );
1867 		vecOld = vec_ld( 127, planePtr );
1868 
1869 		vecPlaneLd1 = vec_perm( v0, v1, permVec );
1870 		vecPlaneLd2 = vec_perm( v1, v2, permVec );
1871 		vecPlaneLd3 = vec_perm( v2, v3, permVec );
1872 		vecPlaneLd4 = vec_perm( v3, v4, permVec );
1873 
1874 		vecPlaneLd5 = vec_perm( v4, v5, permVec );
1875 		vecPlaneLd6 = vec_perm( v5, v6, permVec );
1876 		vecPlaneLd7 = vec_perm( v6, v7, permVec );
1877 		vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1878 
1879 		// permute into X Y Z vectors, since this is square its basically
1880 		// a matrix transpose
1881 		v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1882 		v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1883 		v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1884 		v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1885 
1886 		vecX = vec_mergeh( v0, v1 );
1887 		vecY = vec_mergel( v0, v1 );
1888 		vecZ = vec_mergeh( v2, v3 );
1889 		vecI3 = vec_mergel( v2, v3 );
1890 
1891 		v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1892 		v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1893 		v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1894 		v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1895 
1896 		vecX2 = vec_mergeh( v4, v5 );
1897 		vecY2 = vec_mergel( v4, v5 );
1898 		vecZ2 = vec_mergeh( v6, v7 );
1899 		vecI32 = vec_mergel( v6, v7 );
1900 
1901 		// do calculation
1902 		v4 = vec_madd( vecConstX, vecX, zeroVector );
1903 		v5 = vec_madd( vecConstY, vecY, v4 );
1904 		v6 = vec_madd( vecConstZ, vecZ, v5 );
1905 		v7 = vec_madd( vecConstI3, vecI3, v6 );
1906 
1907 		v0 = vec_madd( vecConstX, vecX2, zeroVector );
1908 		v1 = vec_madd( vecConstY, vecY2, v0 );
1909 		v2 = vec_madd( vecConstZ, vecZ2, v1 );
1910 		v3 = vec_madd( vecConstI3, vecI32, v2 );
1911 
1912 		//store result
1913 		ALIGNED_STORE2( &dst[i], v7, v3 );
1914 	}
1915 
1916 	// cleanup
1917 	for ( ; i < count; i++ ) {
1918 		//dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1919 		srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
1920 		srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
1921 		srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
1922 		srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
1923 		dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
1924 	}
1925 }
1926 
1927 
1928 #ifndef DRAWVERT_PADDED
1929 /*
1930 ============
1931 idSIMD_AltiVec::Dot
1932 
1933   dst[i] = constant.Normal() * src[i].xyz + constant[3];
1934 ============
1935 */
Dot(float * dst,const idPlane & constant,const idDrawVert * src,const int count)1936 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
1937 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
1938 
1939 	// idDrawVert size is 60 bytes
1940 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1941 
1942 	int i;
1943 	const float *constPtr = constant.ToFloatPtr();
1944 	const float *srcPtr = src[0].xyz.ToFloatPtr();
1945 
1946 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1947 	register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1948 	register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1949 	register vector float vecDest1;
1950 	register vector float zeroVector = (vector float)(0.0);
1951 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1952 
1953 	float constVal[4];
1954 	float srcVal[3];
1955 
1956 	constVal[0] = *(constPtr+0);
1957 	constVal[1] = *(constPtr+1);
1958 	constVal[2] = *(constPtr+2);
1959 	constVal[3] = *(constPtr+3);
1960 
1961 	// populate const vec
1962 	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1963 	v0 = vec_ld( 0, constant.ToFloatPtr() );
1964 	v1 = vec_ld( 15, constant.ToFloatPtr() );
1965 	v0 = vec_perm( v0, v1, constPerm );
1966 
1967 	vecConstX = vec_splat( v0, 0 );
1968 	vecConstY = vec_splat( v0, 1 );
1969 	vecConstZ = vec_splat( v0, 2 );
1970 	vecConstI3 = vec_splat( v0, 3 );
1971 
1972 	// handle unaligned case at beginning
1973 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1974 		dst[i] = constant.Normal() * src[i].xyz + constant[3];
1975 	}
1976 
1977 	// every fourth one will have the same alignment, so can store these. Make sure we
1978 	// have enough so we don't run off the end of the array
1979 	if ( i+3 < count ) {
1980 			vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1981 			vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1982 			vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1983 			vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1984 	}
1985 
1986 	for ( ; i+3 < count; i+=4 ) {
1987 			const float *vertPtr = src[i].xyz.ToFloatPtr();
1988 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1989 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1990 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1991 
1992 			v0 = vec_ld( 0, vertPtr );
1993 			v1 = vec_ld( 11, vertPtr );
1994 			v2 = vec_ld( 0, vertPtr2 );
1995 			v3 = vec_ld( 11, vertPtr2 );
1996 			v4 = vec_ld( 0, vertPtr3 );
1997 			v5 = vec_ld( 11, vertPtr3 );
1998 			v6 = vec_ld( 0, vertPtr4 );
1999 			v7 = vec_ld( 11, vertPtr4 );
2000 
2001 			v0 = vec_perm( v0, v1, vertPerm1 );
2002 			v2 = vec_perm( v2, v3, vertPerm2 );
2003 			v4 = vec_perm( v4, v5, vertPerm3 );
2004 			v6 = vec_perm( v6, v7, vertPerm4 );
2005 
2006 			// transpose into X Y Z vectors
2007 			v1 = vec_mergeh( v0, v4 );
2008 			v3 = vec_mergeh( v2, v6 );
2009 			v5 = vec_mergel( v0, v4 );
2010 			v7 = vec_mergel( v2, v6 );
2011 
2012 			vecSrcX1 = vec_mergeh( v1, v3 );
2013 			vecSrcY1 = vec_mergel( v1, v3 );
2014 			vecSrcZ1 = vec_mergeh( v5, v7 );
2015 
2016 			// now calculate dot product
2017 			vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2018 			vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2019 			vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2020 			vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2021 
2022 			// store results
2023 			vec_st( vecDest1, 0, &dst[i] );
2024 	}
2025 
2026 	// cleanup
2027 	for ( ; i < count; i++ ) {
2028 		srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2029 		srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2030 		srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2031 		//	dst[i] = constant.Normal() * src[i].xyz + constant[3];
2032 
2033 		dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2034 		dst[i] += constVal[3];
2035 	}
2036 }
2037 #else
2038 /*
2039 ============
2040 idSIMD_AltiVec::Dot
2041 
2042   dst[i] = constant.Normal() * src[i].xyz + constant[3];
2043 ============
2044 */
Dot(float * dst,const idPlane & constant,const idDrawVert * src,const int count)2045 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
2046 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
2047 
2048 	// idDrawVert size is 60 bytes
2049 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
2050 
2051 	int i;
2052 	const float *constPtr = constant.ToFloatPtr();
2053 	const float *srcPtr = src[0].xyz.ToFloatPtr();
2054 
2055 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
2056 	register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
2057 	register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
2058 	register vector float vecDest1;
2059 	register vector float zeroVector = (vector float)(0.0);
2060 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
2061 
2062 	float constVal[4];
2063 	float srcVal[3];
2064 
2065 	constVal[0] = *(constPtr+0);
2066 	constVal[1] = *(constPtr+1);
2067 	constVal[2] = *(constPtr+2);
2068 	constVal[3] = *(constPtr+3);
2069 
2070 	// populate const vec
2071 	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
2072 	v0 = vec_ld( 0, constant.ToFloatPtr() );
2073 	v1 = vec_ld( 15, constant.ToFloatPtr() );
2074 	v0 = vec_perm( v0, v1, constPerm );
2075 
2076 	vecConstX = vec_splat( v0, 0 );
2077 	vecConstY = vec_splat( v0, 1 );
2078 	vecConstZ = vec_splat( v0, 2 );
2079 	vecConstI3 = vec_splat( v0, 3 );
2080 
2081 	// handle unaligned case at beginning
2082 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2083 		dst[i] = constant.Normal() * src[i].xyz + constant[3];
2084 	}
2085 
2086 		for ( ; i+3 < count; i+=4 ) {
2087 			const float *vertPtr = src[i].xyz.ToFloatPtr();
2088 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
2089 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
2090 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
2091 
2092 			v0 = vec_ld( 0, vertPtr );
2093 			v2 = vec_ld( 0, vertPtr2 );
2094 			v4 = vec_ld( 0, vertPtr3 );
2095 			v6 = vec_ld( 0, vertPtr4 );
2096 
2097 			// transpose into X Y Z vectors
2098 			v1 = vec_mergeh( v0, v4 );
2099 			v3 = vec_mergeh( v2, v6 );
2100 			v5 = vec_mergel( v0, v4 );
2101 			v7 = vec_mergel( v2, v6 );
2102 
2103 			vecSrcX1 = vec_mergeh( v1, v3 );
2104 			vecSrcY1 = vec_mergel( v1, v3 );
2105 			vecSrcZ1 = vec_mergeh( v5, v7 );
2106 
2107 			// now calculate dot product
2108 			vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2109 			vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2110 			vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2111 			vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2112 
2113 			// store results
2114 			vec_st( vecDest1, 0, &dst[i] );
2115 	}
2116 
2117 	// cleanup
2118 	for ( ; i < count; i++ ) {
2119 		srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2120 		srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2121 		srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2122 		//	dst[i] = constant.Normal() * src[i].xyz + constant[3];
2123 
2124 		dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2125 		dst[i] += constVal[3];
2126 	}
2127 }
2128 
2129 #endif /* DRAWVERT_PADDED */
2130 
2131 /*
2132 ============
2133 idSIMD_AltiVec::Dot
2134 
2135   dst[i] = src0[i] * src1[i];
2136 ============
2137 */
Dot(float * dst,const idVec3 * src0,const idVec3 * src1,const int count)2138 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
2139 //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
2140 
2141 	int i;
2142 	float src0Val[3];
2143 	float src1Val[3];
2144 
2145 	register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
2146 	vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
2147 	register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
2148 	register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
2149 	register vector float zeroVector = (vector float)(0.0);
2150 	// permute vectors
2151 	register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
2152 	register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2153 	register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
2154 	register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2155 	register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
2156 	register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2157 
2158 	// handle unaligned case at beginning
2159 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2160 		dst[i] = src0[i] * src1[i];
2161 	}
2162 
2163 	const float *src0Ptr = src0[i].ToFloatPtr();
2164 	const float *src1Ptr = src1[i].ToFloatPtr();
2165 	vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
2166 	vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
2167 	vector float vecOld0 = vec_ld( 0, src0Ptr );
2168 	vector float vecOld1 = vec_ld( 0, src1Ptr );
2169 
2170 	for ( i = 0; i+7 < count; i += 8 ) {
2171 			float *s0Ptr = (float*)( src0Ptr + (i*3) );
2172 			float *s1Ptr = (float*)( src1Ptr + (i*3) );
2173 
2174 			vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
2175 			v0 = vecOld0;
2176 			v1 = vec_ld( 15, s0Ptr );
2177 			v2 = vec_ld( 31, s0Ptr );
2178 			v3 = vec_ld( 47, s0Ptr );
2179 			v4 = vec_ld( 63, s0Ptr );
2180 			v5 = vec_ld( 79, s0Ptr );
2181 			vecOld0 = vec_ld( 95, s0Ptr );
2182 
2183 			v6 = vecOld1;
2184 			v7 = vec_ld( 15, s1Ptr );
2185 			v8 = vec_ld( 31, s1Ptr );
2186 			v9 = vec_ld( 47, s1Ptr );
2187 			v10 = vec_ld( 63, s1Ptr );
2188 			v11 = vec_ld( 79, s1Ptr );
2189 			vecOld1 = vec_ld( 95, s1Ptr );
2190 
2191 			vecLd1 = vec_perm( v0, v1, permVec1 );
2192 			vecLd2 = vec_perm( v1, v2, permVec1 );
2193 			vecLd3 = vec_perm( v2, v3, permVec1 );
2194 			vecLd4 = vec_perm( v3, v4, permVec1 );
2195 			vecLd5 = vec_perm( v4, v5, permVec1 );
2196 			vecLd6 = vec_perm( v5, vecOld0, permVec1 );
2197 
2198 			vecLd7 = vec_perm( v6, v7, permVec2 );
2199 			vecLd8 = vec_perm( v7, v8, permVec2 );
2200 			vecLd9 = vec_perm( v8, v9, permVec2 );
2201 			vecLd10 = vec_perm( v9, v10, permVec2 );
2202 			vecLd11 = vec_perm( v10, v11, permVec2 );
2203 			vecLd12 = vec_perm( v11, vecOld1, permVec2 );
2204 
2205 			// permute into X Y Z vectors
2206 			vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
2207 			vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
2208 			vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
2209 			vecX0 = vec_perm( vecX0, vecLd3, permX2 );
2210 			vecY0 = vec_perm( vecY0, vecLd3, permY2 );
2211 			vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
2212 
2213 			vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
2214 			vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
2215 			vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
2216 			vecX02 = vec_perm( vecX02, vecLd6, permX2 );
2217 			vecY02 = vec_perm( vecY02, vecLd6, permY2 );
2218 			vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
2219 
2220 			vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
2221 			vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
2222 			vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
2223 			vecX1 = vec_perm( vecX1, vecLd9, permX2 );
2224 			vecY1 = vec_perm( vecY1, vecLd9, permY2 );
2225 			vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
2226 
2227 			vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
2228 			vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
2229 			vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
2230 			vecX12 = vec_perm( vecX12, vecLd12, permX2 );
2231 			vecY12 = vec_perm( vecY12, vecLd12, permY2 );
2232 			vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
2233 
2234 			// do multiply
2235 			vecX0 = vec_madd( vecX0, vecX1, zeroVector );
2236 			vecY0 = vec_madd( vecY0, vecY1, vecX0 );
2237 			vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
2238 			vecX02 = vec_madd( vecX02, vecX12, zeroVector );
2239 			vecY02 = vec_madd( vecY02, vecY12, vecX02 );
2240 			vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
2241 
2242 			// store out results
2243 			ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
2244 	}
2245 
2246 	// cleanup
2247 	for ( ; i < count; i++ ) {
2248 		//	dst[i] = src0[i] * src1[i];
2249 		src0Val[0] = *( src0Ptr + (i*3) + 0 );
2250 		src0Val[1] = *( src0Ptr + (i*3) + 1 );
2251 		src0Val[2] = *( src0Ptr + (i*3) + 2 );
2252 
2253 		src1Val[0] = *( src1Ptr + (i*3) + 0 );
2254 		src1Val[1] = *( src1Ptr + (i*3) + 1 );
2255 		src1Val[2] = *( src1Ptr + (i*3) + 2 );
2256 
2257 		dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
2258 	}
2259 }
2260 
2261 /*
2262 ============
2263 idSIMD_AltiVec::Dot
2264 
2265   dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
2266 ============
2267 */
Dot(float & dot,const float * src1,const float * src2,const int count)2268 void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
2269 	dot = 0.0f;
2270 
2271 	register vector float v0, v1, v2, v3;
2272 	register vector float zeroVector;
2273 	register vector float runningTotal1, runningTotal2;
2274 	//src0
2275 	register vector float v0_low, v0_hi, v2_low, v2_hi;
2276 	//src1
2277 	register vector float v1_low, v1_hi, v3_low, v3_hi;
2278 	//permute vectors
2279 	register vector unsigned char permVec1, permVec2;
2280 	vector unsigned char oneCharVector = (vector unsigned char)(1);
2281 
2282 	int i = 0;
2283 
2284 	 runningTotal1 = (vector float)(0.0);
2285 	 runningTotal2 = (vector float)(0.0);
2286 	 zeroVector = (vector float)(0.0);
2287 
2288 	if ( count >= 8 ) {
2289 		//calculate permute and do loads
2290 		permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
2291 		permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
2292 		v2_hi = vec_ld( 0, &src1[i] );
2293 		v3_hi = vec_ld( 0, &src2[i] );
2294 
2295 		//vectorize!
2296 		for ( ; i+7 < count; i += 8 ) {
2297 			//load sources
2298 			v0_low = v2_hi;
2299 			v0_hi = vec_ld( 15, &src1[i] );
2300 			v2_low = v0_hi;
2301 			v2_hi = vec_ld( 31, &src1[i] );
2302 
2303 			v1_low = v3_hi;
2304 			v1_hi = vec_ld( 15, &src2[i] );
2305 			v3_low = v1_hi;
2306 			v3_hi = vec_ld( 31, &src2[i] );
2307 
2308 			v0 = vec_perm( v0_low, v0_hi, permVec1 );
2309 			v1 = vec_perm( v1_low, v1_hi, permVec2 );
2310 			v2 = vec_perm( v2_low, v2_hi, permVec1 );
2311 			v3 = vec_perm( v3_low, v3_hi, permVec2 );
2312 
2313 			//multiply together and keep running sum
2314 			runningTotal1 = vec_madd( v0, v1, runningTotal1 );
2315 			runningTotal2 = vec_madd( v2, v3, runningTotal2 );
2316 		}
2317 
2318 		runningTotal1 = vec_add( runningTotal1, runningTotal2 );
2319 
2320 		// sum accross vector
2321 		v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
2322 		v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
2323 		runningTotal1 = vec_splat( v1, 0 );
2324 		vec_ste( runningTotal1, 0, &dot );
2325 	}
2326 
2327 	//handle cleanup. when profiling the game, we found that most of the counts to this function were small, so it
2328 	// spends a lot of time in this scalar code. It's already really really fast (eg 1 TB tick) for scalar code for
2329 	// counts less than 50, so not much point in trying to get vector code in on the action
2330 	for ( ; i < count ; i++ ) {
2331 		dot += src1[i] * src2[i];
2332 	}
2333 
2334 }
2335 #endif /* ENABLE_DOT */
2336 
2337 #ifdef ENABLE_COMPARES
2338 
2339 /*
2340 ============
2341 idSIMD_AltiVec::CmpGT
2342 
2343   dst[i] = src0[i] > constant;
2344 ============
2345 */
2346 
CmpGT(byte * dst,const float * src0,const float constant,const int count)2347 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
2348 //#define OPER(X) dst[(X)] = src0[(X)] > constant;
2349 
2350 	register vector float v0, v1, v2, v3;
2351 	register vector bool int vr1, vr2, vr3, vr4;
2352 	register vector bool short vs1, vs2;
2353 	register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2354 	register vector unsigned char vc1;
2355 	register vector bool char vbc1;
2356 	register vector float constVec;
2357 	register vector unsigned char oneVector = (vector unsigned char)(1);
2358 	register vector unsigned char permVec;
2359 	int i;
2360 
2361 	//handle unaligned at start
2362 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2363 		dst[i] = src0[i] > constant;
2364 	}
2365 
2366 	//splat constant into a vector
2367 	constVec = loadSplatUnalignedScalar( &constant );
2368 
2369 	//calculate permute and do loads
2370 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2371 	v3_hi = vec_ld( 0, &src0[i] );
2372 
2373 	//vectorize!
2374 	for ( ; i+15 < count; i += 16 ) {
2375 		// load values
2376 		v0_low = v3_hi;
2377 		v0_hi = vec_ld( 15, &src0[i] );
2378 		v1_low = v0_hi;
2379 		v1_hi = vec_ld( 31, &src0[i] );
2380 		v2_low = v1_hi;
2381 		v2_hi = vec_ld( 47, &src0[i] );
2382 		v3_low = v2_hi;
2383 		v3_hi = vec_ld( 63, &src0[i] );
2384 
2385 		//permute into the vectors we want
2386 		v0 = vec_perm( v0_low, v0_hi, permVec );
2387 		v1 = vec_perm( v1_low, v1_hi, permVec );
2388 		v2 = vec_perm( v2_low, v2_hi, permVec );
2389 		v3 = vec_perm( v3_low, v3_hi, permVec );
2390 
2391 		//do comparison
2392 		vr1 = vec_cmpgt( v0, constVec );
2393 		vr2 = vec_cmpgt( v1, constVec );
2394 		vr3 = vec_cmpgt( v2, constVec );
2395 		vr4 = vec_cmpgt( v3, constVec );
2396 
2397 		// pack results into shorts
2398 		vs1 = vec_pack(vr1, vr2);
2399 		vs2 = vec_pack(vr3, vr4);
2400 
2401 		// pack results into byte
2402 		vbc1 = vec_pack(vs1, vs2);
2403 
2404 		//AND with 1 to get true=1 not true=255
2405 		vc1 = vec_and( vbc1, oneVector );
2406 
2407 		//store results
2408 		vec_st( vc1, 0, &dst[i] );
2409 	}
2410 
2411 	//handle cleanup
2412 	 for ( ; i < count ; i++ ) {
2413 		dst[i] = src0[i] > constant;
2414 	}
2415 }
2416 
2417 
2418 /*
2419 ============
2420 idSIMD_AltiVec::CmpGT
2421 
2422   dst[i] |= ( src0[i] > constant ) << bitNum;
2423 ============
2424 */
CmpGT(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)2425 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2426 //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
2427 
2428 	// Temp vector registers
2429 	register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2430 	register vector bool short vtbs0, vtbs1;
2431 	register vector bool char vtbc0;
2432 	register vector unsigned char vtuc0;
2433 	register vector unsigned char permVec, permVec2;
2434 
2435 	// dest vectors
2436 	register vector unsigned char vd;
2437 	// bitNum vectors
2438 	register vector unsigned char bitNumVec;
2439 	// src0 vectors
2440 	register vector float vs0, vs1, vs2, vs3;
2441 	register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2442 	// constant vector
2443 	register vector float constVec;
2444 	// all one's
2445 	register vector unsigned char oneVector = (vector unsigned char)(1);
2446 	int i = 0;
2447 
2448 	//handle unaligned at start
2449 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2450 		dst[i] |= ( src0[i] > constant ) << bitNum;
2451 	}
2452 
2453 	//splat constant into a vector
2454 	constVec = loadSplatUnalignedScalar( &constant );
2455 
2456 	//bitNum is unaligned.
2457 	permVec2 = vec_lvsl( 0, &bitNum );
2458 	vtuc0 = vec_ld( 0, &bitNum );
2459 	bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2460 	bitNumVec = vec_splat( bitNumVec, 0 );
2461 
2462 	//calculate permute and do loads
2463 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2464 	vs3_hi = vec_ld( 0, &src0[i] );
2465 
2466 	//vectorize!
2467 	for ( ; i+15 < count; i += 16 ) {
2468 		//load sources (floats)
2469 		vs0_low = vs3_hi;
2470 		vs0_hi = vec_ld( 15, &src0[i] );
2471 		vs1_low = vs0_hi;
2472 		vs1_hi = vec_ld( 31, &src0[i] );
2473 		vs2_low = vs1_hi;
2474 		vs2_hi = vec_ld( 47, &src0[i] );
2475 		vs3_low = vs2_hi;
2476 		vs3_hi = vec_ld( 63, &src0[i] );
2477 
2478 		//permute into the vectors we want
2479 		vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2480 		vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2481 		vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2482 		vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2483 
2484 		//load dest (bytes) as unsigned char
2485 		vd = vec_ld( 0, &dst[i] );
2486 
2487 		// do comparison and get bool int result
2488 		vtbi0 = vec_cmpgt( vs0, constVec );
2489 		vtbi1 = vec_cmpgt( vs1, constVec );
2490 		vtbi2 = vec_cmpgt( vs2, constVec );
2491 		vtbi3 = vec_cmpgt( vs3, constVec );
2492 
2493 		// pack results into shorts
2494 		vtbs0 = vec_pack(vtbi0, vtbi1);
2495 		vtbs1 = vec_pack(vtbi2, vtbi3);
2496 
2497 		// pack results into byte
2498 		vtbc0 = vec_pack(vtbs0, vtbs1);
2499 
2500 		//and with 1 to get true=1 instead of true=255
2501 		vtuc0 = vec_and(vtbc0, oneVector);
2502 		vtuc0 = vec_sl(vtuc0, bitNumVec );
2503 
2504 		//or with original
2505 		vd = vec_or( vd, vtuc0 );
2506 
2507 		vec_st( vd, 0, &dst[i] );
2508 	}
2509 
2510 	//handle cleanup
2511 	for ( ; i < count ; i++ ) {
2512 		dst[i] |= ( src0[i] > constant ) << bitNum;
2513 	}
2514 }
2515 
2516 /*
2517 ============
2518 idSIMD_AltiVec::CmpGE
2519 
2520   dst[i] = src0[i] >= constant;
2521 ============
2522 */
CmpGE(byte * dst,const float * src0,const float constant,const int count)2523 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
2524 
2525 	register vector float v0, v1, v2, v3;
2526 	register vector bool int vr1, vr2, vr3, vr4;
2527 	register vector bool short vs1, vs2;
2528 	register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2529 	register vector unsigned char vc1;
2530 	register vector bool char vbc1;
2531 	register vector float constVec;
2532 	register vector unsigned char oneVector = (vector unsigned char)(1);
2533 	register vector unsigned char permVec;
2534 	int i = 0;
2535 
2536 	//handle unaligned at start
2537 	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2538 		dst[i] = src0[i] >= constant;
2539 	}
2540 
2541 	//splat constant into a vector
2542 	constVec = loadSplatUnalignedScalar( &constant );
2543 
2544 	//calculate permute and do loads
2545 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2546 	v3_hi = vec_ld( 0, &src0[i] );
2547 
2548 	//vectorize!
2549 	for ( ; i+15 < count; i += 16 ) {
2550 		// load values
2551 		v0_low = v3_hi;
2552 		v0_hi = vec_ld( 15, &src0[i] );
2553 		v1_low = v0_hi;
2554 		v1_hi = vec_ld( 31, &src0[i] );
2555 		v2_low = v1_hi;
2556 		v2_hi = vec_ld( 47, &src0[i] );
2557 		v3_low = v2_hi;
2558 		v3_hi = vec_ld( 63, &src0[i] );
2559 
2560 		//permute into the vectors we want
2561 		v0 = vec_perm( v0_low, v0_hi, permVec );
2562 		v1 = vec_perm( v1_low, v1_hi, permVec );
2563 		v2 = vec_perm( v2_low, v2_hi, permVec );
2564 		v3 = vec_perm( v3_low, v3_hi, permVec );
2565 
2566 		//do comparison
2567 		vr1 = vec_cmpge( v0, constVec );
2568 		vr2 = vec_cmpge( v1, constVec );
2569 		vr3 = vec_cmpge( v2, constVec );
2570 		vr4 = vec_cmpge( v3, constVec );
2571 
2572 		// pack results into shorts
2573 		vs1 = vec_pack(vr1, vr2);
2574 		vs2 = vec_pack(vr3, vr4);
2575 
2576 		// pack results into byte
2577 		vbc1 = vec_pack(vs1, vs2);
2578 
2579 		//AND with 1 to get true=1 not true=255
2580 		vc1 = vec_and( vbc1, oneVector );
2581 
2582 		//store results
2583 		vec_st( vc1, 0, &dst[i] );
2584 	}
2585 
2586 	//handle cleanup
2587 	 for ( ; i < count ; i++ ) {
2588 		dst[i] = src0[i] >= constant;
2589 	}
2590 }
2591 
2592 /*
2593 ============
2594 idSIMD_AltiVec::CmpGE
2595 
2596   dst[i] |= ( src0[i] >= constant ) << bitNum;
2597 ============
2598 */
CmpGE(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)2599 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2600 	register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2601 	register vector bool short vtbs0, vtbs1;
2602 	register vector bool char vtbc0;
2603 	register vector unsigned char vtuc0;
2604 	register vector unsigned char permVec, permVec2;
2605 
2606 	// dest vectors
2607 	register vector unsigned char vd;
2608 	// bitNum vectors
2609 	register vector unsigned char bitNumVec;
2610 	// src0 vectors
2611 	register vector float vs0, vs1, vs2, vs3;
2612 	register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2613 	// constant vector
2614 	register vector float constVec;
2615 	// all one's
2616 	register vector unsigned char oneVector = (vector unsigned char)(1);
2617 	int i = 0;
2618 
2619 	//handle unaligned at start
2620 	for (  ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2621 		dst[i] |= ( src0[i] >= constant ) << bitNum;
2622 	}
2623 
2624 	//splat constant into a vector
2625 	constVec = loadSplatUnalignedScalar( &constant );
2626 
2627 	//bitNum is unaligned.
2628 	permVec2 = vec_lvsl( 0, &bitNum );
2629 	vtuc0 = vec_ld( 0, &bitNum );
2630 	bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2631 	bitNumVec = vec_splat( bitNumVec, 0 );
2632 
2633 	//calculate permute and do loads
2634 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2635 	vs3_hi = vec_ld( 0, &src0[i] );
2636 
2637 	//vectorize!
2638 	for ( ; i+15 < count; i += 16 ) {
2639 		//load sources (floats)
2640 		vs0_low = vs3_hi;
2641 		vs0_hi = vec_ld( 15, &src0[i] );
2642 		vs1_low = vs0_hi;
2643 		vs1_hi = vec_ld( 31, &src0[i] );
2644 		vs2_low = vs1_hi;
2645 		vs2_hi = vec_ld( 47, &src0[i] );
2646 		vs3_low = vs2_hi;
2647 		vs3_hi = vec_ld( 63, &src0[i] );
2648 
2649 		//permute into the vectors we want
2650 		vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2651 		vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2652 		vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2653 		vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2654 
2655 		//load dest (bytes) as unsigned char
2656 		vd = vec_ld( 0, &dst[i] );
2657 
2658 		// do comparison and get bool int result
2659 		vtbi0 = vec_cmpge( vs0, constVec );
2660 		vtbi1 = vec_cmpge( vs1, constVec );
2661 		vtbi2 = vec_cmpge( vs2, constVec );
2662 		vtbi3 = vec_cmpge( vs3, constVec );
2663 
2664 		// pack results into shorts
2665 		vtbs0 = vec_pack(vtbi0, vtbi1);
2666 		vtbs1 = vec_pack(vtbi2, vtbi3);
2667 
2668 		// pack results into byte
2669 		vtbc0 = vec_pack(vtbs0, vtbs1);
2670 
2671 		//and with 1L to get true=1 instead of true=255
2672 		vtuc0 = vec_and(vtbc0, oneVector);
2673 		vtuc0 = vec_sl(vtuc0, bitNumVec );
2674 
2675 		//or with original
2676 		vd = vec_or( vd, vtuc0 );
2677 
2678 		vec_st( vd, 0, &dst[i] );
2679 	}
2680 
2681 	//handle cleanup
2682 	for ( ; i < count ; i++ ) {
2683 		dst[i] |= ( src0[i] >= constant ) << bitNum;
2684 	}
2685 }
2686 
2687 
2688 /*
2689 ============
2690 idSIMD_AltiVec::CmpLT
2691 
2692   dst[i] = src0[i] < constant;
2693 ============
2694 */
CmpLT(byte * dst,const float * src0,const float constant,const int count)2695 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
2696 //#define OPER(X) dst[(X)] = src0[(X)] < constant;
2697 	register vector float v0, v1, v2, v3;
2698 	register vector bool int vr1, vr2, vr3, vr4;
2699 	register vector bool short vs1, vs2;
2700 	register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2701 	register vector unsigned char vc1;
2702 	register vector bool char vbc1;
2703 	register vector float constVec;
2704 	register vector unsigned char oneVector = (vector unsigned char)(1);
2705 	register vector unsigned char permVec;
2706 	int i = 0;
2707 
2708 	//handle unaligned at start
2709 	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2710 		dst[i] = src0[i] < constant;
2711 	}
2712 
2713 	//splat constant into a vector
2714 	constVec = loadSplatUnalignedScalar( &constant );
2715 
2716 	//calculate permute and do loads
2717 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2718 	v3_hi = vec_ld( 0, &src0[i] );
2719 
2720 	//vectorize!
2721 	for ( ; i+15 < count; i += 16 ) {
2722 		// load values
2723 		v0_low = v3_hi;
2724 		v0_hi = vec_ld( 15, &src0[i] );
2725 		v1_low = v0_hi;
2726 		v1_hi = vec_ld( 31, &src0[i] );
2727 		v2_low = v1_hi;
2728 		v2_hi = vec_ld( 47, &src0[i] );
2729 		v3_low = v2_hi;
2730 		v3_hi = vec_ld( 63, &src0[i] );
2731 
2732 		//permute into the vectors we want
2733 		v0 = vec_perm( v0_low, v0_hi, permVec );
2734 		v1 = vec_perm( v1_low, v1_hi, permVec );
2735 		v2 = vec_perm( v2_low, v2_hi, permVec );
2736 		v3 = vec_perm( v3_low, v3_hi, permVec );
2737 
2738 		//do comparison
2739 		vr1 = vec_cmplt( v0, constVec );
2740 		vr2 = vec_cmplt( v1, constVec );
2741 		vr3 = vec_cmplt( v2, constVec );
2742 		vr4 = vec_cmplt( v3, constVec );
2743 
2744 		// pack results into shorts
2745 		vs1 = vec_pack(vr1, vr2);
2746 		vs2 = vec_pack(vr3, vr4);
2747 
2748 		// pack results into byte
2749 		vbc1 = vec_pack(vs1, vs2);
2750 
2751 		//AND with 1 to get true=1 not true=255
2752 		vc1 = vec_and( vbc1, oneVector );
2753 
2754 		//store results
2755 		vec_st( vc1, 0, &dst[i] );
2756 	}
2757 
2758 	//handle cleanup
2759 	 for ( ; i < count ; i++ ) {
2760 		dst[i] = src0[i] < constant;
2761 	}
2762 }
2763 
2764 /*
2765 ============
2766 idSIMD_AltiVec::CmpLT
2767 
2768   dst[i] |= ( src0[i] < constant ) << bitNum;
2769 ============
2770 */
CmpLT(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)2771 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2772 //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
2773 	register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2774 	register vector bool short vtbs0, vtbs1;
2775 	register vector bool char vtbc0;
2776 	register vector unsigned char vtuc0;
2777 	register vector unsigned char permVec, permVec2;
2778 
2779 	// dest vectors
2780 	register vector unsigned char vd;
2781 	// bitNum vectors
2782 	register vector unsigned char bitNumVec;
2783 	// src0 vectors
2784 	register vector float vs0, vs1, vs2, vs3;
2785 	register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2786 	// constant vector
2787 	register vector float constVec;
2788 	// all one's
2789 	register vector unsigned char oneVector = (vector unsigned char)(1);
2790 	int i = 0;
2791 
2792 	//handle unaligned at start
2793 	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2794 		dst[i] |= ( src0[i] < constant ) << bitNum;
2795 	}
2796 
2797 	//splat constant into a vector
2798 	constVec = loadSplatUnalignedScalar( &constant );
2799 
2800 	//bitNum is unaligned.
2801 	permVec2 = vec_lvsl( 0, &bitNum );
2802 	vtuc0 = vec_ld( 0, &bitNum );
2803 	bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2804 	bitNumVec = vec_splat( bitNumVec, 0 );
2805 
2806 	//calculate permute and do loads
2807 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2808 	vs3_hi = vec_ld( 0, &src0[i] );
2809 
2810 	//vectorize!
2811 	for ( ; i+15 < count; i += 16 ) {
2812 		//load sources (floats)
2813 		vs0_low = vs3_hi;
2814 		vs0_hi = vec_ld( 15, &src0[i] );
2815 		vs1_low = vs0_hi;
2816 		vs1_hi = vec_ld( 31, &src0[i] );
2817 		vs2_low = vs1_hi;
2818 		vs2_hi = vec_ld( 47, &src0[i] );
2819 		vs3_low = vs2_hi;
2820 		vs3_hi = vec_ld( 63, &src0[i] );
2821 
2822 		//permute into the vectors we want
2823 		vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2824 		vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2825 		vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2826 		vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2827 
2828 		//load dest (bytes) as unsigned char
2829 		vd = vec_ld( 0, &dst[i] );
2830 
2831 		// do comparison and get bool int result
2832 		vtbi0 = vec_cmplt( vs0, constVec );
2833 		vtbi1 = vec_cmplt( vs1, constVec );
2834 		vtbi2 = vec_cmplt( vs2, constVec );
2835 		vtbi3 = vec_cmplt( vs3, constVec );
2836 
2837 		// pack results into shorts
2838 		vtbs0 = vec_pack(vtbi0, vtbi1);
2839 		vtbs1 = vec_pack(vtbi2, vtbi3);
2840 
2841 		// pack results into byte
2842 		vtbc0 = vec_pack(vtbs0, vtbs1);
2843 
2844 		//and with 1L to get true=1 instead of true=255
2845 		vtuc0 = vec_and(vtbc0, oneVector);
2846 		vtuc0 = vec_sl(vtuc0, bitNumVec );
2847 
2848 		//or with original
2849 		vd = vec_or( vd, vtuc0 );
2850 
2851 		vec_st( vd, 0, &dst[i] );
2852 	}
2853 
2854 	//handle cleanup
2855 	for ( ; i < count ; i++ ) {
2856 		dst[i] |= ( src0[i] < constant ) << bitNum;
2857 	}
2858 
2859 }
2860 //#endif
2861 
2862 /*
2863 ============
2864 idSIMD_AltiVec::CmpLE
2865 
2866   dst[i] = src0[i] <= constant;
2867 ============
2868 */
CmpLE(byte * dst,const float * src0,const float constant,const int count)2869 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
2870 //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
2871 	register vector float v0, v1, v2, v3;
2872 	register vector bool int vr1, vr2, vr3, vr4;
2873 	register vector bool short vs1, vs2;
2874 	register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2875 	register vector unsigned char vc1;
2876 	register vector bool char vbc1;
2877 	register vector float constVec;
2878 	register vector unsigned char oneVector = (vector unsigned char)(1);
2879 	register vector unsigned char permVec;
2880 	int i = 0;
2881 
2882 	//handle unaligned at start
2883 	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2884 		dst[i] = src0[i] <= constant;
2885 	}
2886 
2887 	//splat constant into a vector
2888 	constVec = loadSplatUnalignedScalar( &constant );
2889 
2890 	//calculate permute and do loads
2891 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2892 	v3_hi = vec_ld( 0, &src0[i] );
2893 
2894 	//vectorize!
2895 	for ( ; i+15 < count; i += 16 ) {
2896 		// load values
2897 		v0_low = v3_hi;
2898 		v0_hi = vec_ld( 15, &src0[i] );
2899 		v1_low = v0_hi;
2900 		v1_hi = vec_ld( 31, &src0[i] );
2901 		v2_low = v1_hi;
2902 		v2_hi = vec_ld( 47, &src0[i] );
2903 		v3_low = v2_hi;
2904 		v3_hi = vec_ld( 63, &src0[i] );
2905 
2906 		//permute into the vectors we want
2907 		v0 = vec_perm( v0_low, v0_hi, permVec );
2908 		v1 = vec_perm( v1_low, v1_hi, permVec );
2909 		v2 = vec_perm( v2_low, v2_hi, permVec );
2910 		v3 = vec_perm( v3_low, v3_hi, permVec );
2911 
2912 		//do comparison
2913 		vr1 = vec_cmple( v0, constVec );
2914 		vr2 = vec_cmple( v1, constVec );
2915 		vr3 = vec_cmple( v2, constVec );
2916 		vr4 = vec_cmple( v3, constVec );
2917 
2918 		// pack results into shorts
2919 		vs1 = vec_pack(vr1, vr2);
2920 		vs2 = vec_pack(vr3, vr4);
2921 
2922 		// pack results into byte
2923 		vbc1 = vec_pack(vs1, vs2);
2924 
2925 		//AND with 1 to get true=1 not true=255
2926 		vc1 = vec_and( vbc1, oneVector );
2927 
2928 		//store results
2929 		vec_st( vc1, 0, &dst[i] );
2930 	}
2931 
2932 	//handle cleanup
2933 	 for ( ; i < count ; i++ ) {
2934 		dst[i] = src0[i] <= constant;
2935 	}
2936 }
2937 
2938 /*
2939 ============
2940 idSIMD_AltiVec::CmpLE
2941 
2942   dst[i] |= ( src0[i] <= constant ) << bitNum;
2943 ============
2944 */
CmpLE(byte * dst,const byte bitNum,const float * src0,const float constant,const int count)2945 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2946 //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
2947 	register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2948 	register vector bool short vtbs0, vtbs1;
2949 	register vector bool char vtbc0;
2950 	register vector unsigned char vtuc0;
2951 	register vector unsigned char permVec, permVec2;
2952 
2953 	// dest vectors
2954 	register vector unsigned char vd;
2955 	// bitNum vectors
2956 	register vector unsigned char bitNumVec;
2957 	// src0 vectors
2958 	register vector float vs0, vs1, vs2, vs3;
2959 	register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2960 	// constant vector
2961 	register vector float constVec;
2962 	// all one's
2963 	register vector unsigned char oneVector = (vector unsigned char)(1);
2964 	int i = 0;
2965 
2966 	//handle unaligned at start
2967 	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2968 		dst[i] |= ( src0[i] <= constant ) << bitNum;
2969 	}
2970 
2971 	//splat constant into a vector
2972 	constVec = loadSplatUnalignedScalar( &constant );
2973 
2974 	//bitNum is unaligned.
2975 	permVec2 = vec_lvsl( 0, &bitNum );
2976 	vtuc0 = vec_ld( 0, &bitNum );
2977 	bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2978 	bitNumVec = vec_splat( bitNumVec, 0 );
2979 
2980 	//calculate permute and do loads
2981 	permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2982 	vs3_hi = vec_ld( 0, &src0[i] );
2983 
2984 	//vectorize!
2985 	for ( ; i+15 < count; i += 16 ) {
2986 		//load sources (floats)
2987 		vs0_low = vs3_hi;
2988 		vs0_hi = vec_ld( 15, &src0[i] );
2989 		vs1_low = vs0_hi;
2990 		vs1_hi = vec_ld( 31, &src0[i] );
2991 		vs2_low = vs1_hi;
2992 		vs2_hi = vec_ld( 47, &src0[i] );
2993 		vs3_low = vs2_hi;
2994 		vs3_hi = vec_ld( 63, &src0[i] );
2995 
2996 		//permute into the vectors we want
2997 		vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2998 		vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2999 		vs2 = vec_perm( vs2_low, vs2_hi, permVec );
3000 		vs3 = vec_perm( vs3_low, vs3_hi, permVec );
3001 
3002 		//load dest (bytes) as unsigned char
3003 		vd = vec_ld( 0, &dst[i] );
3004 
3005 		// do comparison and get bool int result
3006 		vtbi0 = vec_cmple( vs0, constVec );
3007 		vtbi1 = vec_cmple( vs1, constVec );
3008 		vtbi2 = vec_cmple( vs2, constVec );
3009 		vtbi3 = vec_cmple( vs3, constVec );
3010 
3011 		// pack results into shorts
3012 		vtbs0 = vec_pack(vtbi0, vtbi1);
3013 		vtbs1 = vec_pack(vtbi2, vtbi3);
3014 
3015 		// pack results into byte
3016 		vtbc0 = vec_pack(vtbs0, vtbs1);
3017 
3018 		//and with 1L to get true=1 instead of true=255
3019 		vtuc0 = vec_and(vtbc0, oneVector);
3020 		vtuc0 = vec_sl(vtuc0, bitNumVec );
3021 
3022 		//or with original
3023 		vd = vec_or( vd, vtuc0 );
3024 
3025 		vec_st( vd, 0, &dst[i] );
3026 	}
3027 
3028 	//handle cleanup
3029 	for ( ; i < count ; i++ ) {
3030 		dst[i] |= ( src0[i] <= constant ) << bitNum;
3031 	}
3032 }
3033 #endif /* ENABLE_COMPARES */
3034 
3035 #ifdef ENABLE_MINMAX
3036 
3037 /*
3038 ============
3039 idSIMD_AltiVec::MinMax
3040 ============
3041 */
MinMax(float & min,float & max,const float * src,const int count)3042 void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
3043 	min = idMath::INFINITY; max = -idMath::INFINITY;
3044 //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
3045 
3046 	register vector float v0, v1, v2, v3;
3047 	register vector float maxVec, minVec, tempMin, tempMax;
3048 	register vector unsigned char permVec;
3049 	register vector float v0_low, v0_hi, v1_low, v1_hi;
3050 	vector unsigned char oneCharVector = (vector unsigned char)(1);
3051 	int i = 0;
3052 
3053 	if ( count >= 4 ) {
3054 
3055 		//calculate permute and do first load to
3056 		//get a starting point for min and max
3057 		permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
3058 		v1_hi = vec_ld( 0, &src[0] );
3059 
3060 		maxVec = loadSplatUnalignedScalar( &max );
3061 		minVec = loadSplatUnalignedScalar( &min );
3062 
3063 		//vectorize!
3064 		for ( ; i+7 < count; i += 8 ) {
3065 			//load sources
3066 			v0_low = v1_hi;
3067 			v0_hi = vec_ld( 15, &src[i] );
3068 			v1_low = v0_hi;
3069 			v1_hi = vec_ld( 31, &src[i] );
3070 			v0 = vec_perm( v0_low, v0_hi, permVec );
3071 			v1 = vec_perm( v1_low, v1_hi, permVec );
3072 
3073 			// minimum
3074 			v2 = vec_min( v0, v1 );
3075 			minVec = vec_min( minVec, v2 );
3076 			// maximum
3077 			v3 = vec_max( v0, v1 );
3078 			maxVec = vec_max( maxVec, v3 );
3079 		}
3080 
3081 		//minVec and maxVec hold the min/max elements from the array, but now
3082 		//we need to figure out which particular element it is
3083 
3084 		tempMin = minVec;
3085 		tempMax = maxVec;
3086 
3087 		// rotate vector around and compare to itself to find the real min/max
3088 		tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
3089 		tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
3090 		tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
3091 		tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
3092 		minVec = vec_splat( tempMin, 0 );
3093 		maxVec = vec_splat( tempMax, 0 );
3094 		vec_ste( minVec, 0, &min );
3095 		vec_ste( maxVec, 0, &max );
3096 	}
3097 
3098 	//cleanup
3099 	for ( ; i < count; i++ ) {
3100 		if ( src[i] < min ) {
3101 			min = src[i];
3102 		}
3103 		if ( src[i] > max ) {
3104 			max = src[i];
3105 		}
3106 	}
3107 }
3108 
3109 /*
3110 ============
3111 idSIMD_AltiVec::MinMax
3112 ============
3113 */
MinMax(idVec2 & min,idVec2 & max,const idVec2 * src,const int count)3114 void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
3115 	min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
3116 //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
3117 
3118 	idVec2 v;
3119 	int i = 0;
3120 	int j;
3121 
3122 	const float *srcPtr = src[0].ToFloatPtr();
3123 	register vector float vecLd1, vecLd2, vecLd3, vecLd4;
3124 	register vector float vecMin, vecMax;
3125 
3126 	register vector float v0, v1, v2, v3;
3127 
3128 	if ( count > 4 ) {
3129 
3130 		vecMin = (vector float)(FLT_MAX);
3131 		vecMax = (vector float)(FLT_MIN);
3132 
3133 		vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
3134 		vector float vecOld = vec_ld( 0, srcPtr );
3135 
3136 		for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
3137 			// load data
3138 			float *vecPtr = (float*)( srcPtr + (j*4) );
3139 			vector float v0, v1, v2, v3;
3140 
3141 			v0 = vecOld;
3142 			v1 = vec_ld( 15, vecPtr );
3143 			v2 = vec_ld( 31, vecPtr );
3144 			v3 = vec_ld( 47, vecPtr );
3145 			vecOld = vec_ld( 63, vecPtr );
3146 
3147 			vecLd1 = vec_perm( v0, v1, permVec );
3148 			vecLd2 = vec_perm( v1, v2, permVec );
3149 			vecLd3 = vec_perm( v2, v3, permVec );
3150 			vecLd4 = vec_perm( v3, vecOld, permVec );
3151 
3152 			// each of these vectors contains 2 elements
3153 			// looks like | X Y X Y | X Y X Y
3154 			v0 = vec_min( vecLd1, vecLd2 );
3155 			v1 = vec_min( vecLd3, vecLd4 );
3156 			v0 = vec_min( v0, v1 );
3157 
3158 			v2 = vec_max( vecLd1, vecLd2 );
3159 			v3 = vec_max( vecLd3, vecLd4 );
3160 			v2 = vec_max( v2, v3 );
3161 
3162 			// since its always X Y X Y we don't have to re-merge each time. we can wait
3163 			// until the end
3164 			vecMin = vec_min( v0, vecMin );
3165 			vecMax = vec_max( v2, vecMax );
3166 		}
3167 
3168 		vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
3169 		vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
3170 		v0 = vec_splat( vecMin, 0 );
3171 		v1 = vec_splat( vecMin, 1 );
3172 		v2 = vec_splat( vecMax, 0 );
3173 		v3 = vec_splat( vecMax, 1 );
3174 
3175 		vec_ste( v0, 0, &min[0] );
3176 		vec_ste( v1, 0, &min[1] );
3177 		vec_ste( v2, 0, &max[0] );
3178 		vec_ste( v3, 0, &max[1] );
3179 	}
3180 
3181 	// cleanup
3182 	for ( ; i < count; i++ ) {
3183 		v = src[i];
3184 
3185 		if ( v[0] < min[0] ) {
3186 			min[0] = v[0];
3187 		}
3188 		if ( v[0] > max[0] ) {
3189 			max[0] = v[0];
3190 		}
3191 
3192 		if ( v[1] < min[1] ) {
3193 			min[1] = v[1];
3194 		}
3195 		if ( v[1] > max[1] ) {
3196 			max[1] = v[1];
3197 		}
3198 	}
3199 }
3200 
3201 /*
3202 ============
3203 idSIMD_AltiVec::MinMax
3204 ============
3205 */
MinMax(idVec3 & min,idVec3 & max,const idVec3 * src,const int count)3206 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
3207 	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3208 //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
3209 
3210 	int i = 0;
3211 	const float *srcPtr = src[0].ToFloatPtr();
3212 	idVec3 v;
3213 
3214 	register vector float vecLd1, vecLd2, vecLd3;
3215 	register vector float vecMin, vecMax;
3216 	register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
3217 	register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3218 
3219 	if ( count >= 4 ) {
3220 
3221 		vecMin = (vector float)(FLT_MAX);
3222 		vecMax = (vector float)(FLT_MIN);
3223 
3224 		vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
3225 		vector float vecOld = vec_ld( 0, srcPtr );
3226 
3227 		// 4 elements at a time
3228 		for ( ; i+3 < count; i += 4 ) {
3229 			float *vecPtr = (float*)( srcPtr + (i*3) );
3230 			vector float v0, v1, v2;
3231 
3232 			v0 = vecOld;
3233 			v1 = vec_ld( 15, vecPtr );
3234 			v2 = vec_ld( 31, vecPtr );
3235 			vecOld = vec_ld( 47, vecPtr );
3236 
3237 			vecLd1 = vec_perm( v0, v1, permVec );
3238 			vecLd2 = vec_perm( v1, v2, permVec );
3239 			vecLd3 = vec_perm( v2, vecOld, permVec );
3240 
3241 			// put each idVec3 into its own vector as X Y Z (crap)
3242 			vecSrc1 = vecLd1;
3243 			vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
3244 			vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
3245 			vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
3246 
3247 			// do min and max
3248 			vecMin1 = vec_min( vecSrc1, vecSrc2 );
3249 			vecMin2 = vec_min( vecSrc3, vecSrc4 );
3250 			vecMin1 = vec_min( vecMin1, vecMin2 );
3251 			vecMin = vec_min( vecMin, vecMin1 );
3252 
3253 			vecMax1 = vec_max( vecSrc1, vecSrc2 );
3254 			vecMax2 = vec_max( vecSrc3, vecSrc4 );
3255 			vecMax1 = vec_max( vecMax1, vecMax2 );
3256 			vecMax = vec_max( vecMax1, vecMax );
3257 		}
3258 
3259 		// store results
3260 		vector float v0, v1, v2, v3, v4, v5;
3261 		v0 = vec_splat( vecMin, 0 );
3262 		v1 = vec_splat( vecMin, 1 );
3263 		v2 = vec_splat( vecMin, 2 );
3264 		v3 = vec_splat( vecMax, 0 );
3265 		v4 = vec_splat( vecMax, 1 );
3266 		v5 = vec_splat( vecMax, 2 );
3267 
3268 		vec_ste( v0, 0, &min[0] );
3269 		vec_ste( v1, 0, &min[1] );
3270 		vec_ste( v2, 0, &min[2] );
3271 		vec_ste( v3, 0, &max[0] );
3272 		vec_ste( v4, 0, &max[1] );
3273 		vec_ste( v5, 0, &max[2] );
3274 	}
3275 
3276 	// cleanup
3277 	for ( ; i < count; i ++ ) {
3278 		v = src[i];
3279 
3280 		if ( v[0] < min[0] ) {
3281 			min[0] = v[0];
3282 		}
3283 		if ( v[0] > max[0] ) {
3284 			max[0] = v[0];
3285 		}
3286 		if ( v[1] < min[1] ) {
3287 			min[1] = v[1];
3288 		}
3289 		if ( v[1] > max[1] ) {
3290 			max[1] = v[1];
3291 		}
3292 		if ( v[2] < min[2] ) {
3293 			min[2] = v[2];
3294 		}
3295 		if ( v[2] > max[2] ) {
3296 			max[2] = v[2];
3297 		}
3298 	}
3299 }
3300 
3301 #ifndef DRAWVERT_PADDED
3302 /*
3303 ============
3304 idSIMD_AltiVec::MinMax
3305 ============
3306 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int count)3307 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3308 
3309 	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3310 	idVec3 v;
3311 	int i = 0;
3312 	register vector float vecMin, vecMax;
3313 
3314 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3315 	register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3316 
3317 	if ( count >= 4 ) {
3318 		vecMin = (vector float)(FLT_MAX);
3319 		vecMax = (vector float)(FLT_MIN);
3320 
3321 		vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3322 		vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3323 		vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3324 		vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3325 
3326 		for ( ; i+3 < count; i += 4) {
3327 			const float *vertPtr = src[i].xyz.ToFloatPtr();
3328 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3329 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3330 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3331 
3332 			v0 = vec_ld( 0, vertPtr );
3333 			v1 = vec_ld( 11, vertPtr );
3334 			v2 = vec_ld( 0, vertPtr2 );
3335 			v3 = vec_ld( 11, vertPtr2 );
3336 			v4 = vec_ld( 0, vertPtr3 );
3337 			v5 = vec_ld( 11, vertPtr3 );
3338 			v6 = vec_ld( 0, vertPtr4 );
3339 			v7 = vec_ld( 11, vertPtr4 );
3340 
3341 			v0 = vec_perm( v0, v1, vertPerm1 );
3342 			v2 = vec_perm( v2, v3, vertPerm2 );
3343 			v4 = vec_perm( v4, v5, vertPerm3 );
3344 			v6 = vec_perm( v6, v7, vertPerm4 );
3345 
3346 			vecMin1 = vec_min( v0, v2 );
3347 			vecMin2 = vec_min( v4, v6 );
3348 			vecMin1 = vec_min( vecMin1, vecMin2 );
3349 			vecMin = vec_min( vecMin, vecMin1 );
3350 
3351 			vecMax1 = vec_max( v0, v2 );
3352 			vecMax2 = vec_max( v4, v6 );
3353 			vecMax1 = vec_max( vecMax1, vecMax2 );
3354 			vecMax = vec_max( vecMax, vecMax1 );
3355 		}
3356 
3357 		// now we have min/max vectors in X Y Z form, store out
3358 		v0 = vec_splat( vecMin, 0 );
3359 		v1 = vec_splat( vecMin, 1 );
3360 		v2 = vec_splat( vecMin, 2 );
3361 		v3 = vec_splat( vecMax, 0 );
3362 		v4 = vec_splat( vecMax, 1 );
3363 		v5 = vec_splat( vecMax, 2 );
3364 
3365 		vec_ste( v0, 0, &min[0] );
3366 		vec_ste( v1, 0, &min[1] );
3367 		vec_ste( v2, 0, &min[2] );
3368 		vec_ste( v3, 0, &max[0] );
3369 		vec_ste( v4, 0, &max[1] );
3370 		vec_ste( v5, 0, &max[2] );
3371 	}
3372 
3373 	// cleanup
3374 	for ( ; i < count; i++ ) {
3375 		v = src[i].xyz;
3376 
3377 		if ( v[0] < min[0] ) {
3378 			min[0] = v[0];
3379 		}
3380 		if ( v[0] > max[0] ) {
3381 			max[0] = v[0];
3382 		}
3383 
3384 		if ( v[1] < min[1] ) {
3385 			min[1] = v[1];
3386 		}
3387 		if ( v[1] > max[1] ) {
3388 			max[1] = v[1];
3389 		}
3390 
3391 		if ( v[2] > max[2] ) {
3392 			max[2] = v[2];
3393 		}
3394 
3395 		if ( v[2] < min[2] ) {
3396 			min[2] = v[2];
3397 		}
3398 	}
3399 }
3400 #else
3401 /*
3402 ============
3403 idSIMD_AltiVec::MinMax
3404 ============
3405 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int count)3406 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3407 
3408 	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3409 	idVec3 v;
3410 	int i = 0;
3411 	register vector float vecMin, vecMax;
3412 
3413 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3414 	register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3415 
3416 	if ( count >= 4 ) {
3417 		vecMin = (vector float)(FLT_MAX);
3418 		vecMax = (vector float)(FLT_MIN);
3419 
3420 		for ( ; i+3 < count; i += 4) {
3421 			const float *vertPtr = src[i].xyz.ToFloatPtr();
3422 			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3423 			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3424 			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3425 
3426 			v0 = vec_ld( 0, vertPtr );
3427 			v2 = vec_ld( 0, vertPtr2 );
3428 			v4 = vec_ld( 0, vertPtr3 );
3429 			v6 = vec_ld( 0, vertPtr4 );
3430 
3431 			vecMin1 = vec_min( v0, v2 );
3432 			vecMin2 = vec_min( v4, v6 );
3433 			vecMin1 = vec_min( vecMin1, vecMin2 );
3434 			vecMin = vec_min( vecMin, vecMin1 );
3435 
3436 			vecMax1 = vec_max( v0, v2 );
3437 			vecMax2 = vec_max( v4, v6 );
3438 			vecMax1 = vec_max( vecMax1, vecMax2 );
3439 			vecMax = vec_max( vecMax, vecMax1 );
3440 		}
3441 
3442 		// now we have min/max vectors in X Y Z form, store out
3443 		v0 = vec_splat( vecMin, 0 );
3444 		v1 = vec_splat( vecMin, 1 );
3445 		v2 = vec_splat( vecMin, 2 );
3446 		v3 = vec_splat( vecMax, 0 );
3447 		v4 = vec_splat( vecMax, 1 );
3448 		v5 = vec_splat( vecMax, 2 );
3449 
3450 		vec_ste( v0, 0, &min[0] );
3451 		vec_ste( v1, 0, &min[1] );
3452 		vec_ste( v2, 0, &min[2] );
3453 		vec_ste( v3, 0, &max[0] );
3454 		vec_ste( v4, 0, &max[1] );
3455 		vec_ste( v5, 0, &max[2] );
3456 	}
3457 
3458 	// cleanup
3459 	for ( ; i < count; i++ ) {
3460 		v = src[i].xyz;
3461 
3462 		if ( v[0] < min[0] ) {
3463 			min[0] = v[0];
3464 		}
3465 		if ( v[0] > max[0] ) {
3466 			max[0] = v[0];
3467 		}
3468 
3469 		if ( v[1] < min[1] ) {
3470 			min[1] = v[1];
3471 		}
3472 		if ( v[1] > max[1] ) {
3473 			max[1] = v[1];
3474 		}
3475 
3476 		if ( v[2] > max[2] ) {
3477 			max[2] = v[2];
3478 		}
3479 
3480 		if ( v[2] < min[2] ) {
3481 			min[2] = v[2];
3482 		}
3483 	}
3484 }
3485 
3486 #endif /* DRAWVERT_PADDED */
3487 
3488 #ifndef DRAWVERT_PADDED
3489 /*
3490 ============
3491 idSIMD_AltiVec::MinMax
3492 ============
3493 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int * indexes,const int count)3494 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3495 	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3496 
3497 	idVec3 v;
3498 	int i = 0;
3499 
3500 	register vector float vecMin, vecMax;
3501 
3502 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3503 	register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3504 
3505 	if ( count >= 4 ) {
3506 
3507 		vecMin = (vector float)(FLT_MAX);
3508 		vecMax = (vector float)(FLT_MIN);
3509 
3510 		vector unsigned char vertPerm1;
3511 		vector unsigned char vertPerm2;
3512 		vector unsigned char vertPerm3;
3513 		vector unsigned char vertPerm4;
3514 
3515 		for ( ; i+3 < count; i += 4) {
3516 			const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3517 			const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3518 			const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3519 			const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3520 
3521 			vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
3522 			vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
3523 			vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
3524 			vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
3525 
3526 			v0 = vec_ld( 0, vertPtr );
3527 			v1 = vec_ld( 15, vertPtr );
3528 			v2 = vec_ld( 0, vertPtr2 );
3529 			v3 = vec_ld( 15, vertPtr2 );
3530 			v4 = vec_ld( 0, vertPtr3 );
3531 			v5 = vec_ld( 15, vertPtr3 );
3532 			v6 = vec_ld( 0, vertPtr4 );
3533 			v7 = vec_ld( 15, vertPtr4 );
3534 
3535 			v0 = vec_perm( v0, v1, vertPerm1 );
3536 			v2 = vec_perm( v2, v3, vertPerm2 );
3537 			v4 = vec_perm( v4, v5, vertPerm3 );
3538 			v6 = vec_perm( v6, v7, vertPerm4 );
3539 
3540 			vecMin1 = vec_min( v0, v2 );
3541 			vecMin2 = vec_min( v4, v6 );
3542 			vecMin1 = vec_min( vecMin1, vecMin2 );
3543 			vecMin = vec_min( vecMin, vecMin1 );
3544 
3545 			vecMax1 = vec_max( v0, v2 );
3546 			vecMax2 = vec_max( v4, v6 );
3547 			vecMax1 = vec_max( vecMax1, vecMax2 );
3548 			vecMax = vec_max( vecMax, vecMax1 );
3549 		}
3550 
3551 		// now we have min/max vectors in X Y Z form, store out
3552 		v0 = vec_splat( vecMin, 0 );
3553 		v1 = vec_splat( vecMin, 1 );
3554 		v2 = vec_splat( vecMin, 2 );
3555 		v3 = vec_splat( vecMax, 0 );
3556 		v4 = vec_splat( vecMax, 1 );
3557 		v5 = vec_splat( vecMax, 2 );
3558 
3559 		vec_ste( v0, 0, &min[0] );
3560 		vec_ste( v1, 0, &min[1] );
3561 		vec_ste( v2, 0, &min[2] );
3562 		vec_ste( v3, 0, &max[0] );
3563 		vec_ste( v4, 0, &max[1] );
3564 		vec_ste( v5, 0, &max[2] );
3565 	}
3566 
3567 	// cleanup
3568 	for ( ; i < count; i++ ) {
3569 		v = src[indexes[i]].xyz;
3570 
3571 		if ( v[0] < min[0] ) {
3572 			min[0] = v[0];
3573 		}
3574 		if ( v[0] > max[0] ) {
3575 			max[0] = v[0];
3576 		}
3577 
3578 		if ( v[1] < min[1] ) {
3579 			min[1] = v[1];
3580 		}
3581 		if ( v[1] > max[1] ) {
3582 			max[1] = v[1];
3583 		}
3584 
3585 		if ( v[2] > max[2] ) {
3586 			max[2] = v[2];
3587 		}
3588 
3589 		if ( v[2] < min[2] ) {
3590 			min[2] = v[2];
3591 		}
3592 	}
3593 }
3594 #else
3595 /*
3596 ============
3597 idSIMD_AltiVec::MinMax
3598 ============
3599 */
MinMax(idVec3 & min,idVec3 & max,const idDrawVert * src,const int * indexes,const int count)3600 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3601 	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3602 
3603 	idVec3 v;
3604 	int i = 0;
3605 
3606 	register vector float vecMin, vecMax;
3607 
3608 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3609 	register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3610 
3611 	if ( count >= 4 ) {
3612 
3613 		vecMin = (vector float)(FLT_MAX);
3614 		vecMax = (vector float)(FLT_MIN);
3615 
3616 		vector unsigned char vertPerm1;
3617 		vector unsigned char vertPerm2;
3618 		vector unsigned char vertPerm3;
3619 		vector unsigned char vertPerm4;
3620 
3621 		for ( ; i+3 < count; i += 4) {
3622 			const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3623 			const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3624 			const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3625 			const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3626 
3627 			v0 = vec_ld( 0, vertPtr );
3628 			v2 = vec_ld( 0, vertPtr2 );
3629 			v4 = vec_ld( 0, vertPtr3 );
3630 			v6 = vec_ld( 0, vertPtr4 );
3631 
3632 			vecMin1 = vec_min( v0, v2 );
3633 			vecMin2 = vec_min( v4, v6 );
3634 			vecMin1 = vec_min( vecMin1, vecMin2 );
3635 			vecMin = vec_min( vecMin, vecMin1 );
3636 
3637 			vecMax1 = vec_max( v0, v2 );
3638 			vecMax2 = vec_max( v4, v6 );
3639 			vecMax1 = vec_max( vecMax1, vecMax2 );
3640 			vecMax = vec_max( vecMax, vecMax1 );
3641 		}
3642 
3643 		// now we have min/max vectors in X Y Z form, store out
3644 		v0 = vec_splat( vecMin, 0 );
3645 		v1 = vec_splat( vecMin, 1 );
3646 		v2 = vec_splat( vecMin, 2 );
3647 		v3 = vec_splat( vecMax, 0 );
3648 		v4 = vec_splat( vecMax, 1 );
3649 		v5 = vec_splat( vecMax, 2 );
3650 
3651 		vec_ste( v0, 0, &min[0] );
3652 		vec_ste( v1, 0, &min[1] );
3653 		vec_ste( v2, 0, &min[2] );
3654 		vec_ste( v3, 0, &max[0] );
3655 		vec_ste( v4, 0, &max[1] );
3656 		vec_ste( v5, 0, &max[2] );
3657 	}
3658 
3659 	// cleanup
3660 	for ( ; i < count; i++ ) {
3661 		v = src[indexes[i]].xyz;
3662 
3663 		if ( v[0] < min[0] ) {
3664 			min[0] = v[0];
3665 		}
3666 		if ( v[0] > max[0] ) {
3667 			max[0] = v[0];
3668 		}
3669 
3670 		if ( v[1] < min[1] ) {
3671 			min[1] = v[1];
3672 		}
3673 		if ( v[1] > max[1] ) {
3674 			max[1] = v[1];
3675 		}
3676 
3677 		if ( v[2] > max[2] ) {
3678 			max[2] = v[2];
3679 		}
3680 
3681 		if ( v[2] < min[2] ) {
3682 			min[2] = v[2];
3683 		}
3684 	}
3685 }
3686 
3687 
3688 #endif /* DRAWVERT_PADDED */
3689 
3690 #endif /* ENABLE_MINMAX */
3691 
3692 #ifdef ENABLE_CLAMP
3693 
3694 /*
3695 ============
3696 idSIMD_AltiVec::Clamp
3697 ============
3698 */
Clamp(float * dst,const float * src,const float min,const float max,const int count)3699 void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
3700 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
3701 	register vector float v0, v1, v2, v3, v4, v5;
3702 	register vector unsigned char permVec;
3703 	register vector float v0_low, v0_hi, v1_low, v1_hi;
3704 	vector unsigned char oneVector = (vector unsigned char)(1);
3705 	register vector float minVec, maxVec;
3706 	int i = 0;
3707 
3708 	//handle unaligned at start
3709 	for ( ;  NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3710 		dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3711 	}
3712 
3713 	//splat min/max into a vector
3714 	minVec = loadSplatUnalignedScalar( &min );
3715 	maxVec = loadSplatUnalignedScalar( &max );
3716 
3717 	//calculate permute and do first load
3718 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3719 	v1_hi = vec_ld( 0, &src[i] );
3720 
3721 
3722 	//vectorize!
3723 	for ( ; i+7 < count; i += 8 ) {
3724 		//load source
3725 		v0_low = v1_hi;
3726 		v0_hi = vec_ld( 15, &src[i] );
3727 		v1_low = v0_hi;
3728 		v1_hi = vec_ld( 31, &src[i] );
3729 
3730 		v0 = vec_perm( v0_low, v0_hi, permVec );
3731 		v1 = vec_perm( v1_low, v1_hi, permVec );
3732 
3733 		//apply minimum
3734 		v2 = vec_max( v0, minVec );
3735 		v3 = vec_max( v1, minVec );
3736 
3737 		//apply maximum
3738 		v4 = vec_min( v2, maxVec );
3739 		v5 = vec_min( v3, maxVec );
3740 
3741 		ALIGNED_STORE2( &dst[i], v4, v5 );
3742 	}
3743 
3744 	//handle cleanup
3745 	for ( ; i < count ; i++ ) {
3746 		dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3747 	}
3748 }
3749 
3750 /*
3751 ============
3752 idSIMD_AltiVec::ClampMin
3753 ============
3754 */
ClampMin(float * dst,const float * src,const float min,const int count)3755 void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
3756 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
3757 	register vector float v0, v1, v2, v3;
3758 	register vector unsigned char permVec;
3759 	register vector float v0_low, v0_hi, v1_low, v1_hi;
3760 	register vector float constVec;
3761 	vector unsigned char oneVector = (vector unsigned char)(1);
3762 	int i = 0;
3763 
3764 	//handle unaligned at start
3765 	for ( ;  NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3766 		dst[i] = src[i] < min ? min : src[i];
3767 	}
3768 
3769 	//splat constant into a vector
3770 	constVec = loadSplatUnalignedScalar( &min );
3771 
3772 	//calculate permute and do first load
3773 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3774 	v1_hi = vec_ld( 0, &src[i] );
3775 
3776 	//vectorize!
3777 	for ( ; i+7 < count; i += 8 ) {
3778 		//load source
3779 		v0_low = v1_hi;
3780 		v0_hi = vec_ld( 15, &src[i] );
3781 		v1_low = v0_hi;
3782 		v1_hi = vec_ld( 31, &src[i] );
3783 
3784 		v0 = vec_perm( v0_low, v0_hi, permVec );
3785 		v1 = vec_perm( v1_low, v1_hi, permVec );
3786 
3787 		v2 = vec_max( v0, constVec );
3788 		v3 = vec_max( v1, constVec );
3789 
3790 		ALIGNED_STORE2( &dst[i], v2, v3 );
3791 	}
3792 
3793 	//handle cleanup
3794 	 for ( ; i < count ; i++ ) {
3795 		dst[i] = src[i] < min ? min : src[i];
3796 	}
3797  }
3798 
3799 /*
3800 ============
3801 idSIMD_AltiVec::ClampMax
3802 ============
3803 */
ClampMax(float * dst,const float * src,const float max,const int count)3804 void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
3805 //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
3806 	register vector float v0, v1, v2, v3;
3807 	register vector unsigned char permVec;
3808 	register vector float constVec;
3809 	register vector float v0_low, v0_hi, v1_low, v1_hi;
3810 	vector unsigned char oneVector = (vector unsigned char)(1);
3811 	int i = 0;
3812 
3813 	//handle unaligned at start
3814 	for ( ;  NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3815 		dst[i] = src[i] < max ? max : src[i];
3816 	}
3817 
3818 	//splat constant into a vector
3819 	constVec = loadSplatUnalignedScalar( &max );
3820 
3821 	//calculate permute and do first load
3822 	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3823 	v1_hi = vec_ld( 0, &src[i] );
3824 
3825 	//vectorize!
3826 	for ( ; i+7 < count; i += 8 ) {
3827 		//load source
3828 		v0_low = v1_hi;
3829 		v0_hi = vec_ld( 15, &src[i] );
3830 		v1_low = v0_hi;
3831 		v1_hi = vec_ld( 31, &src[i] );
3832 
3833 		v0 = vec_perm( v0_low, v0_hi, permVec );
3834 		v1 = vec_perm( v1_low, v1_hi, permVec );
3835 		v2 = vec_min( v0, constVec );
3836 		v3 = vec_min( v1, constVec );
3837 
3838 		ALIGNED_STORE2( &dst[i], v2, v3 );
3839 	}
3840 
3841 	//handle cleanup
3842 	for ( ; i < count ; i++ ) {
3843 		dst[i] = src[i] < max ? max : src[i];
3844 	}
3845 }
3846 
3847 #endif /* ENABLE_CLAMP */
3848 
3849 #ifdef ENABLE_16ROUTINES
3850 
3851 /*
3852 ============
3853 idSIMD_AltiVec::Zero16
3854 ============
3855 */
Zero16(float * dst,const int count)3856 void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
3857 	memset( dst, 0, count * sizeof( float ) );
3858 }
3859 
3860 /*
3861 ============
3862 idSIMD_AltiVec::Negate16
3863 
3864 	Assumptions:
3865 		dst is aligned
3866 ============
3867 */
Negate16(float * dst,const int count)3868 void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
3869 //#define OPER(X) ptr[(X)] ^= ( 1 << 31 )		// IEEE 32 bits float sign bit
3870 
3871 	// dst is aligned
3872 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
3873 
3874 	// round count up to next 4 if needbe
3875 	int count2 = ( count + 3 ) & ~3;
3876 
3877 	int i = 0;
3878 	vector float v0, v1, v2, v3;
3879 
3880 	//know its 16-byte aligned
3881 	for ( ; i + 7 < count2; i += 8 ) {
3882 		v0 = vec_ld( 0, &dst[i] );
3883 		v1 = vec_ld( 16, &dst[i] );
3884 
3885 		v2 = vec_sub( (vector float)(0), v0 );
3886 		v3 = vec_sub( (vector float)(0), v1 );
3887 
3888 		ALIGNED_STORE2( &dst[i], v2, v3 );
3889 	}
3890 
3891 	for ( ; i < count2; i += 4 ) {
3892 		v0 = vec_ld( 0, &dst[i] );
3893 		v1 = vec_sub( (vector float)(0), v0 );
3894 		vec_st( v1, 0, &dst[i] );
3895 	}
3896 }
3897 
3898 /*
3899 ============
3900 idSIMD_AltiVec::Copy16
3901 ============
3902 */
Copy16(float * dst,const float * src,const int count)3903 void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
3904 //#define OPER(X) dst[(X)] = src[(X)]
3905 	memcpy( dst, src, sizeof(float) * count );
3906 }
3907 
3908 /*
3909 ============
3910 idSIMD_AltiVec::Add16
3911 
3912 	Assumptions:
3913 		Assumes dst, src1, src2 all start at aligned address
3914 ============
3915 */
Add16(float * dst,const float * src1,const float * src2,const int count)3916 void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
3917 //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
3918 
3919 	// dst is aligned
3920 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
3921 	// src1 is aligned
3922 	assert( IS_16BYTE_ALIGNED( src1[0] ) );
3923 	// src2 is aligned
3924 	assert( IS_16BYTE_ALIGNED( src2[0] ) );
3925 
3926 	// round count up to next 4 if needbe
3927 	int count2 = ( count + 3 ) & ~3;
3928 
3929 	register vector float v0, v1, v2, v3, v4, v5;
3930 	int i = 0;
3931 
3932 	//know all data is 16-byte aligned, so vectorize!
3933 	for ( ; i+7 < count2; i += 8 ) {
3934 		//load sources
3935 		v0 = vec_ld( 0, &src1[i] );
3936 		v1 = vec_ld( 16, &src1[i] );
3937 		v2 = vec_ld( 0, &src2[i] );
3938 		v3 = vec_ld( 16, &src2[i] );
3939 		v4 = vec_add( v0, v2 );
3940 		v5 = vec_add( v1, v3 );
3941 
3942 		ALIGNED_STORE2( &dst[i], v4, v5 );
3943 	}
3944 
3945 	for ( ; i < count2; i += 4 ) {
3946 		v0 = vec_ld( 0, &src1[i] );
3947 		v1 = vec_ld( 0, &src2[i] );
3948 		v2 = vec_add( v0, v1 );
3949 		vec_st( v2, 0, &dst[i] );
3950 	}
3951 }
3952 
3953 /*
3954 ============
3955 idSIMD_AltiVec::Sub16
3956 
3957 	Assumptions:
3958 		Assumes that dst, src1, and src2 all start at aligned address
3959 ============
3960 */
Sub16(float * dst,const float * src1,const float * src2,const int count)3961 void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
3962 //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
3963 	// dst is aligned
3964 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
3965 	// src1 is aligned
3966 	assert( IS_16BYTE_ALIGNED( src1[0] ) );
3967 	// src2 is aligned
3968 	assert( IS_16BYTE_ALIGNED( src2[0] ) );
3969 
3970 	// round count up to next 4 if needbe
3971 	int count2 = ( count + 3 ) & ~3;
3972 
3973 	register vector float v0, v1, v2, v3, v4, v5;
3974 	int i = 0;
3975 
3976 	//know data is aligned, so vectorize!
3977 	for ( ; i+7 < count2; i += 8 ) {
3978 		//load sources
3979 		v0 = vec_ld( 0, &src1[i] );
3980 		v1 = vec_ld( 16, &src1[i] );
3981 		v2 = vec_ld( 0, &src2[i] );
3982 		v3 = vec_ld( 16, &src2[i] );
3983 		v4 = vec_sub( v0, v2 );
3984 		v5 = vec_sub( v1, v3 );
3985 
3986 		ALIGNED_STORE2( &dst[i], v4, v5 );
3987 	}
3988 
3989 	for ( ; i < count2; i += 4 ) {
3990 		v0 = vec_ld( 0, &src1[i] );
3991 		v1 = vec_ld( 0, &src2[i] );
3992 		v2 = vec_sub( v0, v1 );
3993 		vec_st( v2, 0, &dst[i] );
3994 	}
3995 }
3996 
3997 /*
3998 ============
3999 idSIMD_AltiVec::Mul16
4000 
4001 	Assumptions:
4002 		Assumes that dst and src1 start at aligned address
4003 ============
4004 */
Mul16(float * dst,const float * src1,const float constant,const int count)4005 void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
4006 //#define OPER(X) dst[(X)] = src1[(X)] * constant
4007 
4008 	// dst is aligned
4009 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
4010 	// src1 is aligned
4011 	assert( IS_16BYTE_ALIGNED( src1[0] ) );
4012 
4013 	// round count up to next 4 if needbe
4014 	int count2 = ( count + 3 ) & ~3;
4015 
4016 	register vector float v0, v1, v2, v3;
4017 	register vector float constVec;
4018 	register vector float zeroVector = (vector float)(0.0);
4019 	int i = 0;
4020 
4021 	//splat constant into a vector
4022 	constVec = loadSplatUnalignedScalar( &constant );
4023 
4024 	//know data is aligned, so vectorize!
4025 	for ( ; i+7 < count2; i += 8 ) {
4026 		//load source
4027 		v0 = vec_ld( 0, &src1[i] );
4028 		v1 = vec_ld( 16, &src1[i] );
4029 		v2 = vec_madd( constVec, v0, zeroVector );
4030 		v3 = vec_madd( constVec, v1, zeroVector );
4031 		ALIGNED_STORE2( &dst[i], v2, v3 );
4032 	}
4033 
4034 	for ( ; i < count2; i += 4 ) {
4035 		v0 = vec_ld( 0, &src1[i] );
4036 		v1 = vec_madd( constVec, v0, zeroVector );
4037 		vec_st( v1, 0, &dst[i] );
4038 	}
4039 }
4040 
4041 /*
4042 ============
4043 idSIMD_AltiVec::AddAssign16
4044 
4045 	Assumptions:
4046 		Assumes that dst and src start at aligned address
4047 ============
4048 */
AddAssign16(float * dst,const float * src,const int count)4049 void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
4050 //#define OPER(X) dst[(X)] += src[(X)]
4051 
4052 	// dst is aligned
4053 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
4054 	// src is aligned
4055 	assert( IS_16BYTE_ALIGNED( src[0] ) );
4056 
4057 	// round count up to next 4 if needbe
4058 	int count2 = ( count + 3 ) & ~3;
4059 
4060 	register vector float v0, v1, v2, v3, v4, v5;
4061 	int i = 0;
4062 
4063 	//vectorize!
4064 	for ( ; i+7 < count2; i += 8 ) {
4065 		v0 = vec_ld( 0, &src[i] );
4066 		v1 = vec_ld( 16, &src[i] );
4067 		v2 = vec_ld( 0, &dst[i] );
4068 		v3 = vec_ld( 16, &dst[i] );
4069 		v4 = vec_add( v0, v2 );
4070 		v5 = vec_add( v1, v3 );
4071 		ALIGNED_STORE2( &dst[i], v4, v5 );
4072 	}
4073 
4074 	for ( ; i < count2; i += 4 ) {
4075 		v0 = vec_ld( 0, &src[i] );
4076 		v1 = vec_ld( 0, &dst[i] );
4077 		v2 = vec_add( v0, v1 );
4078 		vec_st( v2, 0, &dst[i] );
4079 	}
4080 }
4081 
4082 /*
4083 ============
4084 idSIMD_AltiVec::SubAssign16
4085 
4086 	Assumptions:
4087 		Assumes that dst and src start at aligned address
4088 ============
4089 */
SubAssign16(float * dst,const float * src,const int count)4090 void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
4091 //#define OPER(X) dst[(X)] -= src[(X)]
4092 	register vector float v0, v1, v2, v3, v4, v5;
4093 	int i=0;
4094 
4095 	// dst is aligned
4096 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
4097 	// src is aligned
4098 	assert( IS_16BYTE_ALIGNED( src[0] ) );
4099 	// round count up to next 4 if needbe
4100 	int count2 = ( count + 3 ) & ~3;
4101 
4102 	//vectorize!
4103 	for ( ; i+7 < count2; i += 8 ) {
4104 		v0 = vec_ld( 0, &src[i] );
4105 		v1 = vec_ld( 16, &src[i] );
4106 		v2 = vec_ld( 0, &dst[i] );
4107 		v3 = vec_ld( 16, &dst[i] );
4108 		v4 = vec_sub( v2, v0 );
4109 		v5 = vec_sub( v3, v1 );
4110 		ALIGNED_STORE2( &dst[i], v4, v5 );
4111 	}
4112 
4113 	for ( ; i < count2; i += 4 ) {
4114 		v0 = vec_ld( 0, &src[i] );
4115 		v1 = vec_ld( 0, &dst[i] );
4116 		v2 = vec_sub( v1, v0 );
4117 		vec_st( v2, 0, &dst[i] );
4118 	}
4119 }
4120 
4121 /*
4122 ============
4123 idSIMD_AltiVec::MulAssign16
4124 
4125 	Assumptions:
4126 		Assumes that dst starts at aligned address and count is multiple of 4
4127 ============
4128 */
MulAssign16(float * dst,const float constant,const int count)4129 void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
4130 //#define OPER(X) dst[(X)] *= constant
4131 
4132 	// dst is aligned
4133 	assert( IS_16BYTE_ALIGNED( dst[0] ) );
4134 	// round count up to next 4 if needbe
4135 	int count2 = ( count + 3 ) & ~3;
4136 
4137 	register vector float v0, v1, v2, v3;
4138 	register vector float constVec;
4139 	int i = 0;
4140 	register vector float zeroVector = (vector float)(0.0);
4141 
4142 	//splat constant into a vector
4143 	constVec = loadSplatUnalignedScalar( &constant );
4144 
4145 	//vectorize!
4146 	for ( ; i+7 < count2; i += 8 ) {
4147 		v0 = vec_ld( 0, &dst[i] );
4148 		v1 = vec_ld( 16, &dst[i] );
4149 		v2 = vec_madd( v0, constVec, zeroVector );
4150 		v3 = vec_madd( v1, constVec, zeroVector );
4151 		ALIGNED_STORE2( &dst[i], v2, v3 );
4152 	}
4153 
4154 	for ( ; i < count2; i += 4 ) {
4155 		v0 = vec_ld( 0, &dst[i] );
4156 		v1 = vec_madd( v0, constVec, zeroVector );
4157 		vec_st( v1, 0, &dst[i] );
4158 	}
4159 }
4160 
4161 #endif /* ENABLE_16ROUTINES */
4162 
4163 #ifdef ENABLE_LOWER_TRIANGULAR
4164 
4165 /*
4166 ============
4167 idSIMD_AltiVec::MatX_LowerTriangularSolve
4168 
4169   solves x in L * x = b for the first n rows of L
4170   if skip > 0 the first skip elements of x are assumed to be valid already
4171   L has to be a lower triangular matrix with (implicit) ones on the diagonal
4172   x == b is allowed
4173 ============
4174 */
4175 
MatX_LowerTriangularSolve(const idMatX & L,float * x,const float * b,const int n,int skip)4176 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
4177 
4178 	int i, j;
4179 	const float *lptr;
4180 	const float *lptr2;
4181 	const float *lptr3;
4182 	const float *lptr4;
4183 	float sum;
4184 	float sum2;
4185 	float sum3;
4186 	float sum4;
4187 	float tempSum;
4188 	float tempSum2;
4189 	float tempSum3;
4190 	float tempSum4;
4191 	vector float vecSum1 = (vector float)(0.0);
4192 	vector float vecSum2 = (vector float)(0.0);
4193 	vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
4194 	vector float zeroVector = (vector float)(0.0);
4195 	vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
4196 
4197 	vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
4198 
4199 	// unrolled this loop a bit
4200 	for ( i = skip; i+3 < n; i+=4 ) {
4201 		sum = b[i];
4202 		sum2 = b[i+1];
4203 		sum3 = b[i+2];
4204 		sum4 = b[i+3];
4205 
4206 		vecSum1 = zeroVector;
4207 		vecSum2 = zeroVector;
4208 		vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
4209 		lptr = L[i];
4210 		lptr2 = L[i+1];
4211 		lptr3 = L[i+2];
4212 		lptr4 = L[i+3];
4213 
4214 		vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4215 		vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
4216 		vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
4217 		vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
4218 
4219 		for ( j = 0 ; j+7 < i; j+=8 ) {
4220 
4221 			v0 = vec_ld( 0, &x[j] );
4222 			v1 = vec_ld( 15, &x[j] );
4223 			vector float vecExtraX = vec_ld( 31, &x[j] );
4224 			v0 = vec_perm( v0, v1, vecPermX );
4225 			v1 = vec_perm( v1, vecExtraX, vecPermX );
4226 
4227 			v2 = vec_ld( 0, lptr + j );
4228 			v3 = vec_ld( 15, lptr + j );
4229 			vector float vecExtra1 = vec_ld( 31, lptr + j );
4230 			v2 = vec_perm( v2, v3, vecPermLptr1 );
4231 			v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
4232 
4233 			v4 = vec_ld( 0, lptr2 + j );
4234 			v5 = vec_ld( 15, lptr2 + j );
4235 			vector float vecExtra2 = vec_ld( 31, lptr2 + j );
4236 			v4 = vec_perm( v4, v5, vecPermLptr2 );
4237 			v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
4238 
4239 			v6 = vec_ld( 0, lptr3 + j );
4240 			v7 = vec_ld( 15, lptr3 + j );
4241 			vector float vecExtra3 = vec_ld( 31, lptr3 + j );
4242 			v6 = vec_perm( v6, v7, vecPermLptr3 );
4243 			v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
4244 
4245 			v8 = vec_ld( 0, lptr4 + j );
4246 			v9 = vec_ld( 15, lptr4 + j );
4247 			vector float vecExtra4 = vec_ld( 31, lptr4 + j );
4248 			v8 = vec_perm( v8, v9, vecPermLptr4 );
4249 			v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
4250 
4251 			vecSum1 = vec_madd( v2, v0, vecSum1 );
4252 			vecSum2 = vec_madd( v3, v1, vecSum2 );
4253 
4254 			vecSum3 = vec_madd( v4, v0, vecSum3 );
4255 			vecSum4 = vec_madd( v5, v1, vecSum4 );
4256 
4257 			vecSum5 = vec_madd( v6, v0, vecSum5 );
4258 			vecSum6 = vec_madd( v7, v1, vecSum6 );
4259 
4260 			vecSum7 = vec_madd( v8, v0, vecSum7 );
4261 			vecSum8 = vec_madd( v9, v1, vecSum8 );
4262 		}
4263 
4264 		// if we ran the unrolled code, we need to sum accross the vectors
4265 		// to find out how much to subtract from sum
4266 		if ( j > 0 ) {
4267 			vecSum1 = vec_add( vecSum1, vecSum2 );
4268 			vecSum3 = vec_add( vecSum3, vecSum4 );
4269 			vecSum5 = vec_add( vecSum5, vecSum6 );
4270 			vecSum7 = vec_add( vecSum7, vecSum8 );
4271 			//sum accross the vectors
4272 			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4273 			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4274 
4275 			vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
4276 			vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
4277 
4278 			vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
4279 			vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
4280 
4281 			vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
4282 			vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
4283 
4284 			//move the result to the FPU
4285 			vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4286 			vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
4287 			vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
4288 			vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
4289 
4290 			sum -= tempSum;
4291 			sum2 -= tempSum2;
4292 			sum3 -= tempSum3;
4293 			sum4 -= tempSum4;
4294 		}
4295 
4296 		//cleanup
4297 		for (  ; j < i; j++ ) {
4298 			sum -= lptr[j] * x[j];
4299 			sum2 -= lptr2[j] * x[j];
4300 			sum3 -= lptr3[j] * x[j];
4301 			sum4 -= lptr4[j] * x[j];
4302 		}
4303 
4304 		// store the 4 results at a time
4305 		sum2 -=  ( lptr2[i] * sum );
4306 		sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
4307 		sum4 = sum4 - ( lptr4[i+2] * sum3 ) -  ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
4308 
4309 		x[i] = sum;
4310 		x[i+1] = sum2;
4311 		x[i+2] = sum3;
4312 		x[i+3] = sum4;
4313 	}
4314 
4315 	// cleanup
4316 	for ( ; i < n; i++ ) {
4317 		sum = b[i];
4318 		vecSum1 = zeroVector;
4319 		vecSum2 = zeroVector;
4320 		lptr = L[i];
4321 		vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4322 
4323 		for ( j = 0 ; j+7 < i; j+=8 ) {
4324 
4325 			v0 = vec_ld( 0, &x[j] );
4326 			v2 = vec_ld( 15, &x[j] );
4327 			vector float vecExtraX = vec_ld( 31, &x[j] );
4328 			v0 = vec_perm( v0, v2, vecPermX );
4329 			v2 = vec_perm( v2, vecExtraX, vecPermX );
4330 
4331 			v1 = vec_ld( 0, lptr + j );
4332 			v3 = vec_ld( 15, lptr + j );
4333 			vector float vecExtra = vec_ld( 31, lptr + j );
4334 			v1 = vec_perm( v1, v3, vecPermLptr );
4335 			v3 = vec_perm( v3, vecExtra, vecPermLptr );
4336 
4337 			vecSum1 = vec_madd( v1, v0, vecSum1 );
4338 			vecSum2 = vec_madd( v3, v2, vecSum2 );
4339 		}
4340 
4341 		// if we ran the unrolled code, we need to sum accross the vectors
4342 		// to find out how much to subtract from sum
4343 		if ( j > 0 ) {
4344 			//sum accross the vectors
4345 			vecSum1 = vec_add( vecSum1, vecSum2 );
4346 			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4347 			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4348 
4349 			//move the result to the FPU
4350 			vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4351 			sum -= tempSum;
4352 		}
4353 
4354 		//cleanup
4355 		for (  ; j < i; j++ ) {
4356 			sum -= lptr[j] * x[j];
4357 		}
4358 		x[i] = sum;
4359 	}
4360 }
4361 
4362 /*
4363 ============
4364 idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
4365 
4366   solves x in L.Transpose() * x = b for the first n rows of L
4367   L has to be a lower triangular matrix with (implicit) ones on the diagonal
4368   x == b is allowed
4369 ============
4370 */
MatX_LowerTriangularSolveTranspose(const idMatX & L,float * x,const float * b,const int n)4371 void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
4372 
4373 	int nc;
4374 	const float *lptr;
4375 
4376 	lptr = L.ToFloatPtr();
4377 	nc = L.GetNumColumns();
4378 
4379 	float x0, x1, x2, x3, x4, x5, x6;
4380 	// unrolled cases for n < 8
4381 	if ( n < 8 ) {
4382 		switch( n ) {
4383 			// using local variables to avoid aliasing issues
4384 			case 0:
4385 				return;
4386 			case 1:
4387 				x[0] = b[0];
4388 				return;
4389 			case 2:
4390 				x1 = b[1];
4391 				x0 = b[0] - lptr[1*nc+0] * x1;
4392 
4393 				x[1] = x1;
4394 				x[0] = x0;
4395 				return;
4396 			case 3:
4397 				x2 = b[2];
4398 				x1 = b[1] - lptr[2*nc+1] * x2;
4399 				x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4400 
4401 				x[2] = x2;
4402 				x[1] = x1;
4403 				x[0] = x0;
4404 				return;
4405 			case 4:
4406 				x3 = b[3];
4407 				x2 = b[2] - lptr[3*nc+2] * x3;
4408 				x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4409 				x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4410 
4411 				x[3] = x3;
4412 				x[2] = x2;
4413 				x[1] = x1;
4414 				x[0] = x0;
4415 
4416 				return;
4417 			case 5:
4418 				x4 = b[4];
4419 				x3 = b[3] - lptr[4*nc+3] * x4;
4420 				x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4421 				x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4422 				x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4423 
4424 				x[4] = x4;
4425 				x[3] = x3;
4426 				x[2] = x2;
4427 				x[1] = x1;
4428 				x[0] = x0;
4429 				return;
4430 			case 6:
4431 				x5 = b[5];
4432 				x4 = b[4] - lptr[5*nc+4] * x5;
4433 				x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4434 				x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4435 				x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4436 				x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4437 
4438 				x[5] = x5;
4439 				x[4] = x4;
4440 				x[3] = x3;
4441 				x[2] = x2;
4442 				x[1] = x1;
4443 				x[0] = x0;
4444 
4445 				return;
4446 			case 7:
4447 				x6 = b[6];
4448 				x5 = b[5] - lptr[6*nc+5] * x6;
4449 				x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
4450 				x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4451 				x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4452 				x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4453 				x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4454 
4455 				x[6] = x6;
4456 				x[5] = x5;
4457 				x[4] = x4;
4458 				x[3] = x3;
4459 				x[2] = x2;
4460 				x[1] = x1;
4461 				x[0] = x0;
4462 				return;
4463 		}
4464 		return;
4465 	}
4466 
4467 	int i, j;
4468 	register float s0, s1, s2, s3;
4469 	float *xptr;
4470 
4471 	lptr = L.ToFloatPtr() + n * nc + n - 4;
4472 	xptr = x + n;
4473 
4474 	// process 4 rows at a time
4475 	for ( i = n; i >= 4; i -= 4 ) {
4476 		s0 = b[i-4];
4477 		s1 = b[i-3];
4478 		s2 = b[i-2];
4479 		s3 = b[i-1];
4480 		// process 4x4 blocks
4481 		for ( j = 0; j < n-i; j += 4 ) {
4482 			s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
4483 			s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
4484 			s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
4485 			s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
4486 			s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
4487 			s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
4488 			s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
4489 			s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
4490 			s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
4491 			s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
4492 			s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
4493 			s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
4494 			s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
4495 			s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
4496 			s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
4497 			s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
4498 		}
4499 		// process left over of the 4 rows
4500 		s0 -= lptr[0-1*nc] * s3;
4501 		s1 -= lptr[1-1*nc] * s3;
4502 		s2 -= lptr[2-1*nc] * s3;
4503 		s0 -= lptr[0-2*nc] * s2;
4504 		s1 -= lptr[1-2*nc] * s2;
4505 		s0 -= lptr[0-3*nc] * s1;
4506 		// store result
4507 		xptr[-4] = s0;
4508 		xptr[-3] = s1;
4509 		xptr[-2] = s2;
4510 		xptr[-1] = s3;
4511 		// update pointers for next four rows
4512 		lptr -= 4 + 4 * nc;
4513 		xptr -= 4;
4514 	}
4515 	// process left over rows
4516 	for ( i--; i >= 0; i-- ) {
4517 		s0 = b[i];
4518 		lptr = L[0] + i;
4519 		for ( j = i + 1; j < n; j++ ) {
4520 			s0 -= lptr[j*nc] * x[j];
4521 		}
4522 		x[i] = s0;
4523 	}
4524 }
4525 
4526 /*
4527 ============
4528 idSIMD_AltiVec::MatX_LDLTFactor
4529 ============
4530 */
MatX_LDLTFactor(idMatX & mat,idVecX & invDiag,const int n)4531 bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
4532 	int i, j, k, nc;
4533 	float *v, *diag, *mptr;
4534 	float s0, s1, s2, s3, sum, d;
4535 	float s0_2, s1_2, s2_2, s3_2, sum_2;
4536 	float *mptr2;
4537 
4538 	v = (float *) _alloca16( n * sizeof( float ) );
4539 	diag = (float *) _alloca16( n * sizeof( float ) );
4540 
4541 	nc = mat.GetNumColumns();
4542 
4543 	if ( n <= 0 ) {
4544 		return true;
4545 	}
4546 
4547 	mptr = mat[0];
4548 
4549 	sum = mptr[0];
4550 
4551 	if ( sum == 0.0f ) {
4552 		return false;
4553 	}
4554 
4555 	diag[0] = sum;
4556 	invDiag[0] = d = 1.0f / sum;
4557 
4558 	if ( n <= 1 ) {
4559 		return true;
4560 	}
4561 
4562 	mptr = mat[0];
4563 	for ( j = 1; j < n; j++ ) {
4564 		mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
4565 	}
4566 
4567 	mptr = mat[1];
4568 
4569 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4570 	sum = mptr[1] - s0;
4571 
4572 	if ( sum == 0.0f ) {
4573 		return false;
4574 	}
4575 
4576 	mat[1][1] = sum;
4577 	diag[1] = sum;
4578 	invDiag[1] = d = 1.0f / sum;
4579 
4580 	if ( n <= 2 ) {
4581 		return true;
4582 	}
4583 
4584 	mptr = mat[0];
4585 	for ( j = 2; j < n; j++ ) {
4586 		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
4587 	}
4588 
4589 	mptr = mat[2];
4590 
4591 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4592 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4593 	sum = mptr[2] - s0 - s1;
4594 
4595 	if ( sum == 0.0f ) {
4596 		return false;
4597 	}
4598 
4599 	mat[2][2] = sum;
4600 	diag[2] = sum;
4601 	invDiag[2] = d = 1.0f / sum;
4602 
4603 	if ( n <= 3 ) {
4604 		return true;
4605 	}
4606 
4607 	mptr = mat[0];
4608 	for ( j = 3; j < n; j++ ) {
4609 		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
4610 	}
4611 
4612 	mptr = mat[3];
4613 
4614 	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4615 	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4616 	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4617 	sum = mptr[3] - s0 - s1 - s2;
4618 
4619 	if ( sum == 0.0f ) {
4620 		return false;
4621 	}
4622 
4623 	mat[3][3] = sum;
4624 	diag[3] = sum;
4625 	invDiag[3] = d = 1.0f / sum;
4626 
4627 	if ( n <= 4 ) {
4628 		return true;
4629 	}
4630 
4631 	mptr = mat[0];
4632 	for ( j = 4; j < n; j++ ) {
4633 		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
4634 	}
4635 
4636 	for ( i = 4; i < n; i++ ) {
4637 
4638 		mptr = mat[i];
4639 
4640 		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4641 		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4642 		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4643 		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
4644 		for ( k = 4; k < i-3; k += 4 ) {
4645 			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
4646 			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4647 			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
4648 			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
4649 		}
4650 		switch( i - k ) {
4651 			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
4652 			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4653 			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
4654 		}
4655 		sum = s3;
4656 		sum += s2;
4657 		sum += s1;
4658 		sum += s0;
4659 		sum = mptr[i] - sum;
4660 
4661 		if ( sum == 0.0f ) {
4662 			return false;
4663 		}
4664 
4665 		mat[i][i] = sum;
4666 		diag[i] = sum;
4667 		invDiag[i] = d = 1.0f / sum;
4668 
4669 		if ( i + 1 >= n ) {
4670 			return true;
4671 		}
4672 
4673 		// unrolling madness!
4674 		mptr = mat[i+1];
4675 		mptr2 = mat[i+1] + nc;
4676 
4677 		for ( j = i+1; j+1 < n; j+=2 ) {
4678 			s0 = mptr[0] * v[0];
4679 			s1 = mptr[1] * v[1];
4680 			s2 = mptr[2] * v[2];
4681 			s3 = mptr[3] * v[3];
4682 
4683 			s0_2 = mptr2[0] * v[0];
4684 			s1_2 = mptr2[1] * v[1];
4685 			s2_2 = mptr2[2] * v[2];
4686 			s3_2 = mptr2[3] * v[3];
4687 
4688 			for ( k = 4; k < i-7; k += 8 ) {
4689 				s0 += mptr[k+0] * v[k+0];
4690 				s1 += mptr[k+1] * v[k+1];
4691 				s2 += mptr[k+2] * v[k+2];
4692 				s3 += mptr[k+3] * v[k+3];
4693 				s0 += mptr[k+4] * v[k+4];
4694 				s1 += mptr[k+5] * v[k+5];
4695 				s2 += mptr[k+6] * v[k+6];
4696 				s3 += mptr[k+7] * v[k+7];
4697 
4698 				s0_2 += mptr2[k+0] * v[k+0];
4699 				s1_2 += mptr2[k+1] * v[k+1];
4700 				s2_2 += mptr2[k+2] * v[k+2];
4701 				s3_2 += mptr2[k+3] * v[k+3];
4702 				s0_2 += mptr2[k+4] * v[k+4];
4703 				s1_2 += mptr2[k+5] * v[k+5];
4704 				s2_2 += mptr2[k+6] * v[k+6];
4705 				s3_2 += mptr2[k+7] * v[k+7];
4706 			}
4707 
4708 			switch( i - k ) {
4709 				case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
4710 				case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
4711 				case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
4712 				case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
4713 				case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
4714 				case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
4715 				case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
4716 			}
4717 			// disassociate these adds
4718 			s3 += s2;
4719 			s1 += s0;
4720 			sum = s1 + s3;
4721 
4722 			s3_2 += s2_2;
4723 			s1_2 += s0_2;
4724 			sum_2 = s1_2 + s3_2;
4725 
4726 			mptr[i] = ( mptr[i] - sum ) * d;
4727 			mptr2[i] = ( mptr2[i] - sum_2 ) * d;
4728 
4729 			mptr += nc*2;
4730 			mptr2 += nc*2;
4731 		}
4732 
4733 		// cleanup
4734 		for ( ; j < n; j++ ) {
4735 			s0 = mptr[0] * v[0];
4736 			s1 = mptr[1] * v[1];
4737 			s2 = mptr[2] * v[2];
4738 			s3 = mptr[3] * v[3];
4739 			for ( k = 4; k < i-7; k += 8 ) {
4740 				s0 += mptr[k+0] * v[k+0];
4741 				s1 += mptr[k+1] * v[k+1];
4742 				s2 += mptr[k+2] * v[k+2];
4743 				s3 += mptr[k+3] * v[k+3];
4744 				s0 += mptr[k+4] * v[k+4];
4745 				s1 += mptr[k+5] * v[k+5];
4746 				s2 += mptr[k+6] * v[k+6];
4747 				s3 += mptr[k+7] * v[k+7];
4748 			}
4749 			switch( i - k ) {
4750 				case 7: s0 += mptr[k+6] * v[k+6];
4751 				case 6: s1 += mptr[k+5] * v[k+5];
4752 				case 5: s2 += mptr[k+4] * v[k+4];
4753 				case 4: s3 += mptr[k+3] * v[k+3];
4754 				case 3: s0 += mptr[k+2] * v[k+2];
4755 				case 2: s1 += mptr[k+1] * v[k+1];
4756 				case 1: s2 += mptr[k+0] * v[k+0];
4757 			}
4758 			// disassociate these adds
4759 			s3 += s2;
4760 			s1 += s0;
4761 			sum = s1 + s3;
4762 			mptr[i] = ( mptr[i] - sum ) * d;
4763 			mptr += nc;
4764 		}
4765 	}
4766 	return true;
4767 }
4768 #endif /* ENABLE_LOWER_TRIANGULAR */
4769 
4770 
4771 #ifdef LIVE_VICARIOUSLY
4772 /*
4773 ============
4774 idSIMD_AltiVec::BlendJoints
4775 ============
4776 */
BlendJoints(idJointQuat * joints,const idJointQuat * blendJoints,const float lerp,const int * index,const int numJoints)4777 void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
4778 	int i;
4779 
4780 	// since lerp is a constant, we can special case the two cases if they're true
4781 	if ( lerp <= 0.0f ) {
4782 		// this sets joints back to joints. No sense in doing no work, so just return
4783 		return;
4784 	}
4785 
4786 	if ( lerp >= 1.0f ) {
4787 		// this copies each q from blendJoints to joints and copies each t from blendJoints to joints
4788 		memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
4789 		return;
4790 	}
4791 
4792 	vector float vecLerp = loadSplatUnalignedScalar( &lerp );
4793 	vector float zeroVector = (vector float)(0);
4794 
4795 	for ( i = 0; i+3 < numJoints; i+=4 ) {
4796 		int j = index[i];
4797 		int j2 = index[i+1];
4798 		int j3 = index[i+2];
4799 		int j4 = index[i+3];
4800 
4801 		// slerp
4802 		const float *jointPtr = joints[j].q.ToFloatPtr();
4803 		const float *blendPtr = blendJoints[j].q.ToFloatPtr();
4804 		const float *jointPtr2 = joints[j2].q.ToFloatPtr();
4805 		const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
4806 		const float *jointPtr3 = joints[j3].q.ToFloatPtr();
4807 		const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
4808 		const float *jointPtr4 = joints[j4].q.ToFloatPtr();
4809 		const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
4810 
4811 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
4812 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
4813 		vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
4814 		vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
4815 
4816 		vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
4817 		vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
4818 		vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
4819 		vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
4820 
4821 		vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
4822 		vector float v12, v13, v14, v15, v16;
4823 		vector float vecFromX, vecFromY, vecFromZ, vecFromW;
4824 		vector float vecToX, vecToY, vecToZ, vecToW;
4825 
4826 		// load up the the idJointQuats from joints
4827 		v0 = vec_ld( 0, jointPtr );
4828 		v1 = vec_ld( 15, jointPtr );
4829 		v2 = vec_perm( v0, v1, permVec );
4830 
4831 		v3 = vec_ld( 0, jointPtr2 );
4832 		v4 = vec_ld( 15, jointPtr2 );
4833 		v5 = vec_perm( v3, v4, permVec2 );
4834 
4835 		v6 = vec_ld( 0, jointPtr3 );
4836 		v7 = vec_ld( 15, jointPtr3 );
4837 		v8 = vec_perm( v6, v7, permVec3 );
4838 
4839 		v9 = vec_ld( 0, jointPtr4 );
4840 		v10 = vec_ld( 15, jointPtr4 );
4841 		v11 = vec_perm( v9, v10, permVec4 );
4842 
4843 		// planarizing, so put each x y z w into its own vector
4844 		v0 = vec_mergeh( v2, v8 );
4845 		v1 = vec_mergeh( v5, v11 );
4846 		v3 = vec_mergel( v2, v8 );
4847 		v4 = vec_mergel( v5, v11 );
4848 
4849 		vecFromX = vec_mergeh( v0, v1 );
4850 		vecFromY = vec_mergel( v0, v1 );
4851 		vecFromZ = vec_mergeh( v3, v4 );
4852 		vecFromW = vec_mergel( v3, v4 );
4853 
4854 		// load up idJointQuats from blendJoints
4855 		v5 = vec_ld( 0, blendPtr );
4856 		v6 = vec_ld( 15, blendPtr );
4857 		v7 = vec_perm( v5, v6, permVec5 );
4858 
4859 		v8 = vec_ld( 0, blendPtr2 );
4860 		v9 = vec_ld( 15, blendPtr2 );
4861 		v10 = vec_perm( v8, v9, permVec6 );
4862 
4863 		v11 = vec_ld( 0, blendPtr3 );
4864 		v12 = vec_ld( 15, blendPtr3 );
4865 		v13 = vec_perm( v11, v12, permVec7 );
4866 
4867 		v14 = vec_ld( 0, blendPtr4 );
4868 		v15 = vec_ld( 15, blendPtr4 );
4869 		v16 = vec_perm( v14, v15, permVec8 );
4870 
4871 		// put these into their own vectors too
4872 		v5 = vec_mergeh( v7, v13 );
4873 		v6 = vec_mergeh( v10, v16 );
4874 		v8 = vec_mergel( v7, v13 );
4875 		v9 = vec_mergel( v10, v16 );
4876 
4877 		vecToX = vec_mergeh( v5, v6 );
4878 		vecToY = vec_mergel( v5, v6 );
4879 		vecToZ = vec_mergeh( v8, v9 );
4880 		vecToW = vec_mergel( v8, v9 );
4881 
4882 		// calculate cosom
4883 		vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
4884 		vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
4885 		vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
4886 		vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
4887 
4888 		// if cosom is < 0, negate it and set temp to negated elements in to. otherwise, set temp to
4889 		// to
4890 		vector bool int vecCmp, vecCmp2;
4891 		vecCmp = vec_cmplt( vecCosom, zeroVector );
4892 
4893 		// negate if needed
4894 		vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
4895 		vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
4896 		vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
4897 		vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
4898 		vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
4899 
4900 		// check if we need to calculate scale
4901 		vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
4902 		vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
4903 		vector float vecScale1 = vec_splat( vecLerp, 0 );
4904 
4905 		vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
4906 		vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
4907 		vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
4908 
4909 		vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4910 		vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4911 
4912 		// see which ones we have to insert into our scale0 and scale1 vectors
4913 		vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
4914 		vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
4915 
4916 		// multiply each element by the scale
4917 		vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
4918 		vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
4919 		vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
4920 		vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
4921 
4922 		// multiply temp by scale and add to result
4923 		vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
4924 		vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
4925 		vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
4926 		vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
4927 
4928 		// do a transform again to get the results back to vectors we can store out
4929 		v5 = vec_mergeh( vecFromX, vecFromZ );
4930 		v6 = vec_mergeh( vecFromY, vecFromW );
4931 		v8 = vec_mergel( vecFromX, vecFromZ );
4932 		v9 = vec_mergel( vecFromY, vecFromW );
4933 
4934 		vecToX = vec_mergeh( v5, v6 );
4935 		vecToY = vec_mergel( v5, v6 );
4936 		vecToZ = vec_mergeh( v8, v9 );
4937 		vecToW = vec_mergel( v8, v9 );
4938 
4939 		vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
4940 		vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
4941 		vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
4942 		vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
4943 
4944 		// right rotate the input data
4945 		vecToX = vec_perm( vecToX, vecToX, storePerm1 );
4946 		vecToY = vec_perm( vecToY, vecToY, storePerm2 );
4947 		vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
4948 		vecToW = vec_perm( vecToW, vecToW, storePerm4 );
4949 
4950 		vec_ste( vecToX, 0, (float*) jointPtr );
4951 		vec_ste( vecToX, 4, (float*) jointPtr );
4952 		vec_ste( vecToX, 8, (float*) jointPtr );
4953 		vec_ste( vecToX, 12, (float*) jointPtr );
4954 
4955 		vec_ste( vecToY, 0, (float*) jointPtr2 );
4956 		vec_ste( vecToY, 4, (float*) jointPtr2 );
4957 		vec_ste( vecToY, 8, (float*) jointPtr2 );
4958 		vec_ste( vecToY, 12, (float*) jointPtr2 );
4959 
4960 		vec_ste( vecToZ, 0, (float*) jointPtr3 );
4961 		vec_ste( vecToZ, 4, (float*) jointPtr3 );
4962 		vec_ste( vecToZ, 8, (float*) jointPtr3 );
4963 		vec_ste( vecToZ, 12, (float*) jointPtr3 );
4964 
4965 		vec_ste( vecToW, 0, (float*) jointPtr4 );
4966 		vec_ste( vecToW, 4, (float*) jointPtr4 );
4967 		vec_ste( vecToW, 8, (float*) jointPtr4 );
4968 		vec_ste( vecToW, 12, (float*) jointPtr4 );
4969 
4970 		// lerp is  v1 + l * ( v2 - v1 );
4971 		// the idVec3 T is going to be 12 bytes after the Q, so we can do this without calling ToFloatPtr() again. since its
4972 		float *jointVecPtr = (float*)( jointPtr + 4 );
4973 		float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
4974 		float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
4975 		float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
4976 
4977 		v0 = vec_ld( 0, jointVecPtr );
4978 		v1 = vec_ld( 11, jointVecPtr );
4979 		vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
4980 
4981 		v2 = vec_ld( 0, jointVecPtr2 );
4982 		v3 = vec_ld( 11, jointVecPtr2  );
4983 		vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
4984 
4985 		v4 = vec_ld( 0, jointVecPtr3 );
4986 		v5 = vec_ld( 11, jointVecPtr3 );
4987 		vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
4988 
4989 		v6 = vec_ld( 0, jointVecPtr4  );
4990 		v7 = vec_ld( 11, jointVecPtr4  );
4991 		vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
4992 
4993 		vector float vecVecX, vecVecY, vecVecZ;
4994 		vecVecX = vecVecY = vecVecZ = zeroVector;
4995 
4996 		// planarize
4997 		v0 = vec_mergeh( vecLd1, vecLd3 );
4998 		v1 = vec_mergeh( vecLd2, vecLd4 );
4999 		v3 = vec_mergel( vecLd1, vecLd3 );
5000 		v4 = vec_mergel( vecLd2, vecLd4 );
5001 
5002 		vecVecX = vec_mergeh( v0, v1 );
5003 		vecVecY = vec_mergel( v0, v1 );
5004 		vecVecZ = vec_mergeh( v3, v4 );
5005 
5006 		// load blend joint idvec3's
5007 		float *blendVecPtr = (float*)( blendPtr + 4 );
5008 		float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
5009 		float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
5010 		float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
5011 
5012 		v0 = vec_ld( 0, blendVecPtr );
5013 		v1 = vec_ld( 11, blendVecPtr );
5014 		vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
5015 
5016 		v2 = vec_ld( 0, blendVecPtr2 );
5017 		v3 = vec_ld( 11, blendVecPtr2  );
5018 		vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
5019 
5020 		v4 = vec_ld( 0, blendVecPtr3 );
5021 		v5 = vec_ld( 11, blendVecPtr3 );
5022 		vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
5023 
5024 		v6 = vec_ld( 0, blendVecPtr4  );
5025 		v7 = vec_ld( 11, blendVecPtr4  );
5026 		vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
5027 
5028 		vector float vecBlendX, vecBlendY, vecBlendZ;
5029 		vecBlendX = vecBlendY = vecBlendZ = zeroVector;
5030 
5031 		// planarize
5032 		v0 = vec_mergeh( vecLd5, vecLd7 );
5033 		v1 = vec_mergeh( vecLd6, vecLd8 );
5034 		v3 = vec_mergel( vecLd5, vecLd7 );
5035 		v4 = vec_mergel( vecLd6, vecLd8 );
5036 
5037 		vecBlendX = vec_mergeh( v0, v1 );
5038 		vecBlendY = vec_mergel( v0, v1 );
5039 		vecBlendZ = vec_mergeh( v3, v4 );
5040 
5041 		// do subtraction
5042 		vecWork1 = vec_sub( vecBlendX, vecVecX );
5043 		vecWork2 = vec_sub( vecBlendY, vecVecY );
5044 		vecWork3 = vec_sub( vecBlendZ, vecVecZ );
5045 
5046 		// multiply by lerp and add to v1
5047 		vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
5048 		vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
5049 		vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
5050 
5051 		// put it back in original form
5052 		v0 = vec_mergeh( vecVecX, vecVecZ );
5053 		v1 = vec_mergeh( vecVecY, zeroVector );
5054 		v3 = vec_mergel( vecVecX, vecVecZ );
5055 		v4 = vec_mergel( vecVecY, zeroVector );
5056 
5057 		// generate vectors to store
5058 		vecWork1 = vec_mergeh( v0, v1 );
5059 		vecWork2 = vec_mergel( v0, v1 );
5060 		vecWork3 = vec_mergeh( v3, v4 );
5061 		vector float vecWork4 = vec_mergel( v3, v4 );
5062 
5063 		// store the T values
5064 		storePerm1 = vec_lvsr( 0, jointVecPtr );
5065 		storePerm2 = vec_lvsr( 0, jointVecPtr2 );
5066 		storePerm3 = vec_lvsr( 0, jointVecPtr3 );
5067 		storePerm4 = vec_lvsr( 0, jointVecPtr4 );
5068 
5069 		// right rotate the input data
5070 		vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
5071 		vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
5072 		vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
5073 		vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
5074 
5075 		vec_ste( vecWork1, 0, (float*) jointVecPtr );
5076 		vec_ste( vecWork1, 4, (float*) jointVecPtr );
5077 		vec_ste( vecWork1, 8, (float*) jointVecPtr );
5078 
5079 		vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
5080 		vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
5081 		vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
5082 
5083 		vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
5084 		vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
5085 		vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
5086 
5087 		vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
5088 		vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
5089 		vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
5090 	}
5091 
5092 	// cleanup
5093 	for ( ; i < numJoints; i++ ) {
5094 		int j = index[i];
5095 		joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
5096 		joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
5097 	}
5098 }
5099 
5100 /*
5101 ============
5102 idSIMD_AltiVec::ConvertJointQuatsToJointMats
5103 ============
5104 */
5105 
5106 // SSE doesn't vectorize this, and I don't think we should either. Its mainly just copying data, there's very little math involved and
5107 // it's not easily parallelizable
ConvertJointQuatsToJointMats(idJointMat * jointMats,const idJointQuat * jointQuats,const int numJoints)5108 void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
5109 
5110 	for ( int i = 0; i < numJoints; i++ ) {
5111 
5112 		const float *q = jointQuats[i].q.ToFloatPtr();
5113 		float *m = jointMats[i].ToFloatPtr();
5114 
5115 		m[0*4+3] = q[4];
5116 		m[1*4+3] = q[5];
5117 		m[2*4+3] = q[6];
5118 
5119 		float x2 = q[0] + q[0];
5120 		float y2 = q[1] + q[1];
5121 		float z2 = q[2] + q[2];
5122 
5123 		{
5124 			float xx = q[0] * x2;
5125 			float yy = q[1] * y2;
5126 			float zz = q[2] * z2;
5127 
5128 			m[0*4+0] = 1.0f - yy - zz;
5129 			m[1*4+1] = 1.0f - xx - zz;
5130 			m[2*4+2] = 1.0f - xx - yy;
5131 		}
5132 
5133 		{
5134 			float yz = q[1] * z2;
5135 			float wx = q[3] * x2;
5136 
5137 			m[2*4+1] = yz - wx;
5138 			m[1*4+2] = yz + wx;
5139 		}
5140 
5141 		{
5142 			float xy = q[0] * y2;
5143 			float wz = q[3] * z2;
5144 
5145 			m[1*4+0] = xy - wz;
5146 			m[0*4+1] = xy + wz;
5147 		}
5148 
5149 		{
5150 			float xz = q[0] * z2;
5151 			float wy = q[3] * y2;
5152 
5153 			m[0*4+2] = xz - wy;
5154 			m[2*4+0] = xz + wy;
5155 		}
5156 	}
5157 }
5158 
5159 /*
5160 ============
5161 idSIMD_AltiVec::ConvertJointMatsToJointQuats
5162 ============
5163 */
ConvertJointMatsToJointQuats(idJointQuat * jointQuats,const idJointMat * jointMats,const int numJoints)5164 void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
5165 
5166 	int index;
5167 
5168 	// Since we use very little of the data we have to pull in for the altivec version, we end up with
5169 	// a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
5170 	// of InvSqrt for the G5, and made it use that instead. With only this change, we get a little
5171 	// bigger than 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
5172 	// my function so everyone can benefit on G5.
5173 
5174 	for ( index = 0; index < numJoints; index++ ) {
5175 
5176 		idJointQuat	jq;
5177 		float		trace;
5178 		float		s;
5179 		float		t;
5180 		int	i;
5181 		int			j;
5182 		int			k;
5183 
5184 		static int	next[3] = { 1, 2, 0 };
5185 
5186 		float *mat = (float*)( jointMats[index].ToFloatPtr() );
5187 		trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
5188 
5189 		if ( trace > 0.0f ) {
5190 
5191 			t = trace + 1.0f;
5192 			//s = idMath::InvSqrt( t ) * 0.5f;
5193 			s = FastScalarInvSqrt( t ) * 0.5f;
5194 
5195 			jq.q[3] = s * t;
5196 			jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
5197 			jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
5198 			jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
5199 
5200 		} else {
5201 
5202 			i = 0;
5203 			if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
5204 				i = 1;
5205 			}
5206 			if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
5207 				i = 2;
5208 			}
5209 			j = next[i];
5210 			k = next[j];
5211 
5212 			t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
5213 			//s = idMath::InvSqrt( t ) * 0.5f;
5214 			s = FastScalarInvSqrt( t ) * 0.5f;
5215 
5216 			jq.q[i] = s * t;
5217 			jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
5218 			jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
5219 			jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
5220 		}
5221 
5222 		jq.t[0] = mat[0 * 4 + 3];
5223 		jq.t[1] = mat[1 * 4 + 3];
5224 		jq.t[2] = mat[2 * 4 + 3];
5225 		jointQuats[index] = jq;
5226 	}
5227 }
5228 
5229 /*
5230 ============
5231 idSIMD_AltiVec::TransformJoints
5232 ============
5233 */
TransformJoints(idJointMat * jointMats,const int * parents,const int firstJoint,const int lastJoint)5234 void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5235 	int i;
5236 #if 0
5237 	for( i = firstJoint; i <= lastJoint; i++ ) {
5238 		assert( parents[i] < i );
5239 		jointMats[i] *= jointMats[parents[i]];
5240 	}
5241 #else
5242 
5243 	// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
5244 	// on what the parents array looks like. This is true in the test code.
5245 	for ( i = firstJoint; i <= lastJoint; i++ ) {
5246 		assert( parents[i] < i );
5247 		float *jointPtr = jointMats[i].ToFloatPtr();
5248 		float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5249 
5250 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5251 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5252 		vector float v0, v1, v2, v3, v4, v5, v6, v7;
5253 
5254 		// we need to load up 12 float elements that make up the Mat
5255 		v0 = vec_ld( 0, jointPtr );
5256 		v1 = vec_ld( 15, jointPtr );
5257 		v2 = vec_ld( 31, jointPtr );
5258 		v3 = vec_ld( 47, jointPtr );
5259 
5260 		// load parents
5261 		v4 = vec_ld( 0, parentPtr );
5262 		v5 = vec_ld( 15, parentPtr );
5263 		v6 = vec_ld( 31, parentPtr );
5264 		v7 = vec_ld( 47, parentPtr );
5265 
5266 		// permute into vectors
5267 		vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5268 		vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5269 		vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5270 
5271 		vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5272 		vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5273 		vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5274 
5275 		vector float zero = (vector float)(0);
5276 		vector float C1, C2, C3;
5277 
5278 		// matrix multiply
5279 		C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
5280 		C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); //  m(4 to 7) * a(4)
5281 		C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
5282 
5283 		C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
5284 		C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
5285 		C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
5286 
5287 		C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
5288 		C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
5289 		C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5290 
5291 		// do the addition at the end
5292 		vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5293 		C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5294 		C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5295 		C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5296 
5297 		// store results
5298 		UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
5299 	}
5300 #endif
5301 }
5302 
5303 /*
5304 ============
5305 idSIMD_AltiVec::UntransformJoints
5306 ============
5307 */
UntransformJoints(idJointMat * jointMats,const int * parents,const int firstJoint,const int lastJoint)5308 void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5309 	int i;
5310 #if 0
5311 	for( i = lastJoint; i >= firstJoint; i-- ) {
5312 		assert( parents[i] < i );
5313 		jointMats[i] /= jointMats[parents[i]];
5314 	}
5315 #else
5316 	// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
5317 	// on what the parents array looks like. This is true in the test code.
5318 	for ( i = lastJoint; i >= firstJoint; i-- ) {
5319 		assert( parents[i] < i );
5320 		float *jointPtr = jointMats[i].ToFloatPtr();
5321 		float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5322 
5323 		vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5324 		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5325 		vector float v0, v1, v2, v3, v4, v5, v6, v7;
5326 
5327 		// we need to load up 12 float elements that make up the Mat
5328 		v0 = vec_ld( 0, jointPtr );
5329 		v1 = vec_ld( 15, jointPtr );
5330 		v2 = vec_ld( 31, jointPtr );
5331 		v3 = vec_ld( 47, jointPtr );
5332 
5333 		// load parents
5334 		v4 = vec_ld( 0, parentPtr );
5335 		v5 = vec_ld( 15, parentPtr );
5336 		v6 = vec_ld( 31, parentPtr );
5337 		v7 = vec_ld( 47, parentPtr );
5338 
5339 		// permute into vectors
5340 		vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5341 		vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5342 		vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5343 
5344 		vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5345 		vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5346 		vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5347 
5348 		vector float zero = (vector float)(0);
5349 		vector float C1, C2, C3;
5350 
5351 		// do subtraction at the beginning
5352 		vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5353 		vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5354 		vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5355 		vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5356 
5357 		// matrix multiply
5358 		C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
5359 		C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
5360 		C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
5361 
5362 		C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
5363 		C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
5364 		C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
5365 
5366 		C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
5367 		C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
5368 		C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5369 
5370 		// store results back
5371 		vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
5372 
5373 		// right rotate the input data
5374 		C1 = vec_perm( C1, C1, storePerm );
5375 		C2 = vec_perm( C2, C2, storePerm );
5376 		C3 = vec_perm( C3, C3, storePerm );
5377 
5378 		vec_ste( C1, 0, (float*) jointPtr );
5379 		vec_ste( C1, 4, (float*) jointPtr );
5380 		vec_ste( C1, 8, (float*) jointPtr );
5381 		vec_ste( C1, 12, (float*) jointPtr );
5382 
5383 		vec_ste( C2, 16, (float*) jointPtr );
5384 		vec_ste( C2, 20, (float*) jointPtr );
5385 		vec_ste( C2, 24, (float*) jointPtr );
5386 		vec_ste( C2, 28, (float*) jointPtr );
5387 
5388 		vec_ste( C3, 32, (float*) jointPtr );
5389 		vec_ste( C3, 36, (float*) jointPtr );
5390 		vec_ste( C3, 40, (float*) jointPtr );
5391 		vec_ste( C3, 44, (float*) jointPtr );
5392 	}
5393 
5394 #endif
5395 }
5396 
5397 /*
5398 ============
5399 idSIMD_AltiVec::TransformVerts
5400 ============
5401 */
5402 
5403 // Here we don't have much for the vector unit to do, and the gain we get from doing the math
5404 // in parallel is eaten by doing unaligned stores.
TransformVerts(idDrawVert * verts,const int numVerts,const idJointMat * joints,const idVec4 * weights,const int * index,int numWeights)5405 void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
5406 	int i, j;
5407 	const byte *jointsPtr = (byte *)joints;
5408 
5409 	for( j = i = 0; i < numVerts; i++ ) {
5410 		idVec3 v;
5411 
5412 		float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5413 		float *weightPtr = (float*) weights[j].ToFloatPtr();
5414 
5415 		v[0] = matPtrOrig[0] * weightPtr[0];
5416 		v[0] += matPtrOrig[1] * weightPtr[1];
5417 		v[0] += matPtrOrig[2] * weightPtr[2];
5418 		v[0] += matPtrOrig[3] * weightPtr[3];
5419 
5420 		v[1] = matPtrOrig[4] * weightPtr[0];
5421 		v[1] += matPtrOrig[5] * weightPtr[1];
5422 		v[1] += matPtrOrig[6] * weightPtr[2];
5423 		v[1] += matPtrOrig[7] * weightPtr[3];
5424 
5425 		v[2] = matPtrOrig[8] * weightPtr[0];
5426 		v[2] += matPtrOrig[9] * weightPtr[1];
5427 		v[2] += matPtrOrig[10] * weightPtr[2];
5428 		v[2] += matPtrOrig[11] * weightPtr[3];
5429 
5430 		while( index[j*2+1] == 0 ) {
5431 			j++;
5432 			float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5433 			weightPtr = (float*) weights[j].ToFloatPtr();
5434 
5435 			v[0] += matPtr[0] * weightPtr[0];
5436 			v[0] += matPtr[1] * weightPtr[1];
5437 			v[0] += matPtr[2] * weightPtr[2];
5438 			v[0] += matPtr[3] * weightPtr[3];
5439 
5440 			v[1] += matPtr[4] * weightPtr[0];
5441 			v[1] += matPtr[5] * weightPtr[1];
5442 			v[1] += matPtr[6] * weightPtr[2];
5443 			v[1] += matPtr[7] * weightPtr[3];
5444 
5445 			v[2] += matPtr[8] * weightPtr[0];
5446 			v[2] += matPtr[9] * weightPtr[1];
5447 			v[2] += matPtr[10] * weightPtr[2];
5448 			v[2] += matPtr[11] * weightPtr[3];
5449 		}
5450 		j++;
5451 
5452 		verts[i].xyz = v;
5453 	}
5454 }
5455 #endif /* LIVE_VICARIOUSLY */
5456 
5457 #ifdef ENABLE_CULL
5458 
5459 #ifndef DRAWVERT_PADDED
5460 /*
5461 ============
5462 idSIMD_AltiVec::TracePointCull
5463 ============
5464 */
TracePointCull(byte * cullBits,byte & totalOr,const float radius,const idPlane * planes,const idDrawVert * verts,const int numVerts)5465 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5466 
5467 	// idDrawVert size
5468 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5469 
5470 	byte tOr;
5471 	tOr = 0;
5472 
5473 	// pointers
5474 	const float *planePtr = planes[0].ToFloatPtr();
5475 
5476 	vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5477 	vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5478 	vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5479 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5480 	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5481 	vector unsigned char vecPerm;
5482 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
5483 	vector float zeroVector = (vector float)(0);
5484 	vector float vecRadius;
5485 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5486 	vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5487 	vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5488 	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5489 	vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5490 	vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5491 	vector bool int oneIntVector = (vector bool int)(1);
5492 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5493 	vector unsigned int vecTotals;
5494 	vector unsigned int tempIntSum;
5495 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5496 
5497 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5498 
5499 	// populate planes
5500 	v0 = vec_ld( 0, planePtr );
5501 	v1 = vec_ld( 15, planePtr );
5502 	vecPlane0 = vec_perm( v0, v1, vecPerm );
5503 
5504 	v2 = vec_ld( 0, planePtr + 4 );
5505 	v3 = vec_ld( 15, planePtr + 4 );
5506 	vecPlane1 = vec_perm( v2, v3, vecPerm );
5507 
5508 	v0 = vec_ld( 0, planePtr + 8 );
5509 	v1 = vec_ld( 15, planePtr + 8 );
5510 	vecPlane2 = vec_perm( v0, v1, vecPerm );
5511 
5512 	v2 = vec_ld( 0, planePtr + 12 );
5513 	v3 = vec_ld( 15, planePtr + 12 );
5514 	vecPlane3 = vec_perm( v2, v3, vecPerm );
5515 
5516 	// transpose
5517 	v0 = vec_mergeh( vecPlane0, vecPlane2 );
5518 	v1 = vec_mergeh( vecPlane1, vecPlane3 );
5519 	v2 = vec_mergel( vecPlane0, vecPlane2 );
5520 	v3 = vec_mergel( vecPlane1, vecPlane3 );
5521 
5522 	vecPlane0 = vec_mergeh( v0, v1 );
5523 	vecPlane1 = vec_mergel( v0, v1 );
5524 	vecPlane2 = vec_mergeh( v2, v3 );
5525 	vecPlane3 = vec_mergel( v2, v3 );
5526 
5527 	// load constants
5528 	vecRadius = loadSplatUnalignedScalar( &radius );
5529 
5530 	unsigned int cullBitVal[4];
5531 	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5532 	int i = 0;
5533 
5534 	// every fourth one will have the same alignment. Make sure we've got enough here
5535 	if ( i+3 < numVerts ) {
5536 		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5537 		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5538 		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5539 		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5540 	}
5541 
5542 
5543 	for ( ; i+3 < numVerts; i+=4 ) {
5544 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
5545 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5546 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5547 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5548 
5549 		v0 = vec_ld( 0, vertPtr );
5550 		v1 = vec_ld( 15, vertPtr );
5551 		v2 = vec_ld( 0, vertPtr2 );
5552 		v3 = vec_ld( 15, vertPtr2 );
5553 		v4 = vec_ld( 0, vertPtr3 );
5554 		v5 = vec_ld( 15, vertPtr3 );
5555 		v6 = vec_ld( 0, vertPtr4 );
5556 		v7 = vec_ld( 15, vertPtr4 );
5557 
5558 		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
5559 		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
5560 		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
5561 		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
5562 
5563 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5564 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5565 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5566 		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5567 
5568 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5569 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5570 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5571 		vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5572 
5573 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5574 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5575 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5576 		vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5577 
5578 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5579 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5580 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5581 		vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5582 
5583 		// vec1Sum1 now holds d0, d1, d2, d3. calculate the
5584 		// difference with +radius and -radius
5585 		vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5586 		vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5587 		vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5588 		vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5589 		vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5590 		vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5591 		vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5592 		vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5593 
5594 		// do compare
5595 		vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5596 		vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5597 		vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5598 		vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5599 		vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5600 		vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5601 		vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5602 		vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5603 
5604 		//and it with 1 so we multiply by 1 not 1111's
5605 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
5606 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
5607 		vecCmp3 = vec_and( vecCmp3, oneIntVector );
5608 		vecCmp4 = vec_and( vecCmp4, oneIntVector );
5609 		vecCmp5 = vec_and( vecCmp5, oneIntVector );
5610 		vecCmp6 = vec_and( vecCmp6, oneIntVector );
5611 		vecCmp7 = vec_and( vecCmp7, oneIntVector );
5612 		vecCmp8 = vec_and( vecCmp8, oneIntVector );
5613 
5614 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5615 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5616 		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5617 		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5618 		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5619 		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5620 		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5621 		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5622 
5623 		// OR (add) them all together
5624 		vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5625 		vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5626 		vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5627 		vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5628 
5629 		vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5630 		vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5631 		tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5632 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5633 		vecTotals = vec_mergeh( vecTotals, tempIntSum );
5634 		tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5635 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5636 		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5637 		tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5638 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5639 		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5640 
5641 		// store out results
5642 		vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5643 		tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5644 		vec_ste( tempSt, 0, &cullBitVal[0] );
5645 		vec_ste( tempSt, 4, &cullBitVal[0] );
5646 		vec_ste( tempSt, 8, &cullBitVal[0] );
5647 		vec_ste( tempSt, 12, &cullBitVal[0] );
5648 
5649 		tOr |= cullBitVal[0];
5650 		tOr |= cullBitVal[1];
5651 		tOr |= cullBitVal[2];
5652 		tOr |= cullBitVal[3];
5653 
5654 		cullBits[i] = cullBitVal[0];
5655 		cullBits[i+1] = cullBitVal[1];
5656 		cullBits[i+2] = cullBitVal[2];
5657 		cullBits[i+3] = cullBitVal[3];
5658 	}
5659 
5660 	// cleanup
5661 	for ( ; i < numVerts; i++ ) {
5662 		byte bits;
5663 		float d0, d1, d2, d3, t;
5664 		const idVec3 &v = verts[i].xyz;
5665 
5666 		d0 = planes[0].Distance( v );
5667 		d1 = planes[1].Distance( v );
5668 		d2 = planes[2].Distance( v );
5669 		d3 = planes[3].Distance( v );
5670 
5671 		t = d0 + radius;
5672 		bits  = FLOATSIGNBITSET( t ) << 0;
5673 		t = d1 + radius;
5674 		bits |= FLOATSIGNBITSET( t ) << 1;
5675 		t = d2 + radius;
5676 		bits |= FLOATSIGNBITSET( t ) << 2;
5677 		t = d3 + radius;
5678 		bits |= FLOATSIGNBITSET( t ) << 3;
5679 
5680 		t = d0 - radius;
5681 		bits |= FLOATSIGNBITSET( t ) << 4;
5682 		t = d1 - radius;
5683 		bits |= FLOATSIGNBITSET( t ) << 5;
5684 		t = d2 - radius;
5685 		bits |= FLOATSIGNBITSET( t ) << 6;
5686 		t = d3 - radius;
5687 		bits |= FLOATSIGNBITSET( t ) << 7;
5688 
5689 		bits ^= 0x0F;		// flip lower four bits
5690 
5691 		tOr |= bits;
5692 		cullBits[i] = bits;
5693 	}
5694 
5695 	totalOr = tOr;
5696 }
5697 #else
5698 
5699 /*
5700 ============
5701 idSIMD_AltiVec::TracePointCull
5702 ============
5703 */
TracePointCull(byte * cullBits,byte & totalOr,const float radius,const idPlane * planes,const idDrawVert * verts,const int numVerts)5704 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5705 
5706 	// idDrawVert size
5707 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5708 
5709 	byte tOr;
5710 	tOr = 0;
5711 
5712 	// pointers
5713 	const float *planePtr = planes[0].ToFloatPtr();
5714 
5715 	vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5716 	vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5717 	vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5718 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5719 	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5720 	vector unsigned char vecPerm;
5721 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
5722 	vector float zeroVector = (vector float)(0);
5723 	vector float vecRadius;
5724 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5725 	vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5726 	vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5727 	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5728 	vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5729 	vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5730 	vector bool int oneIntVector = (vector bool int)(1);
5731 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5732 	vector unsigned int vecTotals;
5733 	vector unsigned int tempIntSum;
5734 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5735 
5736 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5737 
5738 	// populate planes
5739 	v0 = vec_ld( 0, planePtr );
5740 	v1 = vec_ld( 15, planePtr );
5741 	vecPlane0 = vec_perm( v0, v1, vecPerm );
5742 
5743 	v2 = vec_ld( 0, planePtr + 4 );
5744 	v3 = vec_ld( 15, planePtr + 4 );
5745 	vecPlane1 = vec_perm( v2, v3, vecPerm );
5746 
5747 	v0 = vec_ld( 0, planePtr + 8 );
5748 	v1 = vec_ld( 15, planePtr + 8 );
5749 	vecPlane2 = vec_perm( v0, v1, vecPerm );
5750 
5751 	v2 = vec_ld( 0, planePtr + 12 );
5752 	v3 = vec_ld( 15, planePtr + 12 );
5753 	vecPlane3 = vec_perm( v2, v3, vecPerm );
5754 
5755 	// transpose
5756 	v0 = vec_mergeh( vecPlane0, vecPlane2 );
5757 	v1 = vec_mergeh( vecPlane1, vecPlane3 );
5758 	v2 = vec_mergel( vecPlane0, vecPlane2 );
5759 	v3 = vec_mergel( vecPlane1, vecPlane3 );
5760 
5761 	vecPlane0 = vec_mergeh( v0, v1 );
5762 	vecPlane1 = vec_mergel( v0, v1 );
5763 	vecPlane2 = vec_mergeh( v2, v3 );
5764 	vecPlane3 = vec_mergel( v2, v3 );
5765 
5766 	// load constants
5767 	vecRadius = loadSplatUnalignedScalar( &radius );
5768 
5769 	unsigned int cullBitVal[4];
5770 	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5771 	int i = 0;
5772 
5773 
5774 	for ( ; i+3 < numVerts; i+=4 ) {
5775 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
5776 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5777 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5778 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5779 
5780 		vecXYZ1 = vec_ld( 0, vertPtr );
5781 		vecXYZ2 = vec_ld( 0, vertPtr2 );
5782 		vecXYZ3 = vec_ld( 0, vertPtr3 );
5783 		vecXYZ4 = vec_ld( 0, vertPtr4 );
5784 
5785 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5786 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5787 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5788 		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5789 
5790 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5791 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5792 		vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5793 		vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5794 
5795 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5796 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5797 		vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5798 		vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5799 
5800 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5801 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5802 		vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5803 		vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5804 
5805 		// vec1Sum1 now holds d0, d1, d2, d3. calculate the
5806 		// difference with +radius and -radius
5807 		vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5808 		vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5809 		vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5810 		vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5811 		vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5812 		vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5813 		vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5814 		vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5815 
5816 		// do compare
5817 		vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5818 		vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5819 		vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5820 		vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5821 		vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5822 		vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5823 		vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5824 		vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5825 
5826 		//and it with 1 so we multiply by 1 not 1111's
5827 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
5828 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
5829 		vecCmp3 = vec_and( vecCmp3, oneIntVector );
5830 		vecCmp4 = vec_and( vecCmp4, oneIntVector );
5831 		vecCmp5 = vec_and( vecCmp5, oneIntVector );
5832 		vecCmp6 = vec_and( vecCmp6, oneIntVector );
5833 		vecCmp7 = vec_and( vecCmp7, oneIntVector );
5834 		vecCmp8 = vec_and( vecCmp8, oneIntVector );
5835 
5836 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5837 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5838 		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5839 		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5840 		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5841 		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5842 		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5843 		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5844 
5845 		// OR (add) them all together
5846 		vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5847 		vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5848 		vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5849 		vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5850 
5851 		vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5852 		vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5853 		tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5854 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5855 		vecTotals = vec_mergeh( vecTotals, tempIntSum );
5856 		tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5857 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5858 		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5859 		tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5860 		tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5861 		vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5862 
5863 		// store out results
5864 		vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5865 		tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5866 		vec_ste( tempSt, 0, &cullBitVal[0] );
5867 		vec_ste( tempSt, 4, &cullBitVal[0] );
5868 		vec_ste( tempSt, 8, &cullBitVal[0] );
5869 		vec_ste( tempSt, 12, &cullBitVal[0] );
5870 
5871 		tOr |= cullBitVal[0];
5872 		tOr |= cullBitVal[1];
5873 		tOr |= cullBitVal[2];
5874 		tOr |= cullBitVal[3];
5875 
5876 		cullBits[i] = cullBitVal[0];
5877 		cullBits[i+1] = cullBitVal[1];
5878 		cullBits[i+2] = cullBitVal[2];
5879 		cullBits[i+3] = cullBitVal[3];
5880 	}
5881 
5882 	// cleanup
5883 	for ( ; i < numVerts; i++ ) {
5884 		byte bits;
5885 		float d0, d1, d2, d3, t;
5886 		const idVec3 &v = verts[i].xyz;
5887 
5888 		d0 = planes[0].Distance( v );
5889 		d1 = planes[1].Distance( v );
5890 		d2 = planes[2].Distance( v );
5891 		d3 = planes[3].Distance( v );
5892 
5893 		t = d0 + radius;
5894 		bits  = FLOATSIGNBITSET( t ) << 0;
5895 		t = d1 + radius;
5896 		bits |= FLOATSIGNBITSET( t ) << 1;
5897 		t = d2 + radius;
5898 		bits |= FLOATSIGNBITSET( t ) << 2;
5899 		t = d3 + radius;
5900 		bits |= FLOATSIGNBITSET( t ) << 3;
5901 
5902 		t = d0 - radius;
5903 		bits |= FLOATSIGNBITSET( t ) << 4;
5904 		t = d1 - radius;
5905 		bits |= FLOATSIGNBITSET( t ) << 5;
5906 		t = d2 - radius;
5907 		bits |= FLOATSIGNBITSET( t ) << 6;
5908 		t = d3 - radius;
5909 		bits |= FLOATSIGNBITSET( t ) << 7;
5910 
5911 		bits ^= 0x0F;		// flip lower four bits
5912 
5913 		tOr |= bits;
5914 		cullBits[i] = bits;
5915 	}
5916 
5917 	totalOr = tOr;
5918 }
5919 
5920 #endif /* DRAWVERT_PADDED */
5921 
5922 #ifndef DRAWVERT_PADDED
5923 /*
5924 ============
5925 idSIMD_AltiVec::DecalPointCull
5926 ============
5927 */
DecalPointCull(byte * cullBits,const idPlane * planes,const idDrawVert * verts,const int numVerts)5928 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5929 
5930 	// idDrawVert size
5931 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5932 
5933 	int i;
5934 	const float *planePtr = planes[0].ToFloatPtr();
5935 
5936 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
5937 	vector float zeroVector = (vector float)(0.0);
5938 	vector unsigned char vecPerm;
5939 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
5940 
5941 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5942 
5943 	// populate planes
5944 	v0 = vec_ld( 0, planePtr );
5945 	v1 = vec_ld( 15, planePtr );
5946 	vecPlane0 = vec_perm( v0, v1, vecPerm );
5947 
5948 	v2 = vec_ld( 0, planePtr + 4 );
5949 	v3 = vec_ld( 15, planePtr + 4 );
5950 	vecPlane1 = vec_perm( v2, v3, vecPerm );
5951 
5952 	v0 = vec_ld( 0, planePtr + 8 );
5953 	v1 = vec_ld( 15, planePtr + 8 );
5954 	vecPlane2 = vec_perm( v0, v1, vecPerm );
5955 
5956 	v2 = vec_ld( 0, planePtr + 12 );
5957 	v3 = vec_ld( 15, planePtr + 12 );
5958 	vecPlane3 = vec_perm( v2, v3, vecPerm );
5959 
5960 	v0 = vec_ld( 0, planePtr + 16 );
5961 	v1 = vec_ld( 15, planePtr + 16 );
5962 	vecPlane4 = vec_perm( v0, v1, vecPerm );
5963 
5964 	v2 = vec_ld( 0, planePtr + 20 );
5965 	v3 = vec_ld( 15, planePtr + 20 );
5966 	vecPlane5 = vec_perm( v2, v3, vecPerm );
5967 
5968 	// transpose
5969 	v0 = vec_mergeh( vecPlane0, vecPlane2 );
5970 	v1 = vec_mergeh( vecPlane1, vecPlane3 );
5971 	v2 = vec_mergel( vecPlane0, vecPlane2 );
5972 	v3 = vec_mergel( vecPlane1, vecPlane3 );
5973 
5974 	vecPlane0 = vec_mergeh( v0, v1 );
5975 	vecPlane1 = vec_mergel( v0, v1 );
5976 	vecPlane2 = vec_mergeh( v2, v3 );
5977 	vecPlane3 = vec_mergel( v2, v3 );
5978 
5979 	v0 = vec_mergeh( vecPlane4, zeroVector );
5980 	v1 = vec_mergeh( vecPlane5, zeroVector );
5981 	v2 = vec_mergel( vecPlane4, zeroVector );
5982 	v3 = vec_mergel( vecPlane5, zeroVector );
5983 
5984 	vecPlane4 = vec_mergeh( v0, v1 );
5985 	vecPlane5 = vec_mergel( v0, v1 );
5986 	vecPlane6 = vec_mergeh( v2, v3 );
5987 	vecPlane7 = vec_mergel( v2, v3 );
5988 
5989 
5990 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5991 	vector bool int oneIntVector = (vector bool int)(1);
5992 	vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
5993 	vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
5994 	vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
5995 
5996 	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5997 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
5998 	vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5999 	vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6000 	vector unsigned int vecR1, vecR2, vecR3, vecR4;
6001 	vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6002 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6003 	unsigned int vBits[4];
6004 	vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6005 
6006 	i = 0;
6007 	// every fourth one will have the same alignment. Make sure we've got enough here
6008 	if ( i+3 < numVerts ) {
6009 		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6010 		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6011 		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6012 		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6013 	}
6014 
6015 
6016 	for ( ; i+3 < numVerts; i+=4 ) {
6017 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
6018 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6019 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6020 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6021 
6022 		v0 = vec_ld( 0, vertPtr );
6023 		v1 = vec_ld( 15, vertPtr );
6024 		v2 = vec_ld( 0, vertPtr2 );
6025 		v3 = vec_ld( 15, vertPtr2 );
6026 		v4 = vec_ld( 0, vertPtr3 );
6027 		v5 = vec_ld( 15, vertPtr3 );
6028 		v6 = vec_ld( 0, vertPtr4 );
6029 		v7 = vec_ld( 15, vertPtr4 );
6030 
6031 		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6032 		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6033 		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6034 		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6035 
6036 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6037 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6038 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6039 		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6040 
6041 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6042 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6043 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6044 		vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6045 
6046 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6047 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6048 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6049 		vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6050 
6051 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6052 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6053 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6054 		vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6055 
6056 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6057 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6058 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6059 		vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6060 
6061 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6062 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6063 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6064 		vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6065 
6066 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6067 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6068 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6069 		vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6070 
6071 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6072 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6073 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6074 		vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6075 
6076 		vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6077 		vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6078 		vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6079 		vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6080 		vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6081 		vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6082 		vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6083 		vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6084 
6085 		//and it with 1 so we multiply by 1 not 1111's
6086 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
6087 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
6088 		vecCmp3 = vec_and( vecCmp3, oneIntVector );
6089 		vecCmp4 = vec_and( vecCmp4, oneIntVector );
6090 		vecCmp5 = vec_and( vecCmp5, oneIntVector );
6091 		vecCmp6 = vec_and( vecCmp6, oneIntVector );
6092 		vecCmp7 = vec_and( vecCmp7, oneIntVector );
6093 		vecCmp8 = vec_and( vecCmp8, oneIntVector );
6094 
6095 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6096 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6097 		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6098 		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6099 		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6100 		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6101 		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6102 		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6103 
6104 		//OR them all together (this is the same as adding them, since they're all only 1 bit set)
6105 		vecR1 = (vector unsigned int)(0); //zeroIntVector;
6106 		vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6107 		vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6108 		vecR1 = vec_add(vecR1, vecBitShifted2 );
6109 		vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6110 
6111 		vecR2 = (vector unsigned int)(0); //zeroIntVector;
6112 		vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6113 		vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6114 		vecR2 = vec_add(vecR2, vecBitShifted4 );
6115 		vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6116 
6117 		vecR3 = (vector unsigned int)(0); //zeroIntVector;
6118 		vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6119 		vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6120 		vecR3 = vec_add(vecR3, vecBitShifted6 );
6121 		vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6122 
6123 		vecR4 = (vector unsigned int)(0); //zeroIntVector;
6124 		vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6125 		vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6126 		vecR4 = vec_add(vecR4, vecBitShifted8 );
6127 		vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6128 
6129 		// take the first element from each vector and put them into vecR1
6130 		vecR1 = vec_mergeh( vecR1, vecR2 );
6131 		vecR3 = vec_mergeh( vecR3, vecR4 );
6132 		vecR1 = vec_perm( vecR1, vecR3, permHalves );
6133 
6134 		// XOR with 0x3F to flip lower 6 bits
6135 		vecR1 = vec_xor( vecR1, vecFlipBits );
6136 
6137 		// store out results. don't have 16 at a time so let's just
6138 		// do this and avoid alignment concerns
6139 		vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6140 		vec_ste( vecR1, 0, &vBits[0] );
6141 		vec_ste( vecR1, 4, &vBits[0] );
6142 		vec_ste( vecR1, 8, &vBits[0] );
6143 		vec_ste( vecR1, 12, &vBits[0] );
6144 
6145 		cullBits[i] = vBits[0];
6146 		cullBits[i+1] = vBits[1];
6147 		cullBits[i+2] = vBits[2];
6148 		cullBits[i+3] = vBits[3];
6149 	}
6150 
6151 	for ( ; i < numVerts; i++ ) {
6152 		byte bits;
6153 		float d0, d1, d2, d3, d4, d5;
6154 		const idVec3 &v = verts[i].xyz;
6155 
6156 		d0 = planes[0].Distance( v );
6157 		d1 = planes[1].Distance( v );
6158 		d2 = planes[2].Distance( v );
6159 		d3 = planes[3].Distance( v );
6160 		d4 = planes[4].Distance( v );
6161 		d5 = planes[5].Distance( v );
6162 
6163 		// they check if the sign bit is set by casting as long and shifting right 31 places.
6164 		bits  = FLOATSIGNBITSET( d0 ) << 0;
6165 		bits |= FLOATSIGNBITSET( d1 ) << 1;
6166 		bits |= FLOATSIGNBITSET( d2 ) << 2;
6167 		bits |= FLOATSIGNBITSET( d3 ) << 3;
6168 		bits |= FLOATSIGNBITSET( d4 ) << 4;
6169 		bits |= FLOATSIGNBITSET( d5 ) << 5;
6170 
6171 		cullBits[i] = bits ^ 0x3F;		// flip lower 6 bits
6172 	}
6173 }
6174 
6175 #else
6176 
6177 /*
6178 ============
6179 idSIMD_AltiVec::DecalPointCull
6180 ============
6181 */
DecalPointCull(byte * cullBits,const idPlane * planes,const idDrawVert * verts,const int numVerts)6182 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6183 
6184 	// idDrawVert size
6185 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6186 
6187 	int i;
6188 	const float *planePtr = planes[0].ToFloatPtr();
6189 
6190 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
6191 	vector float zeroVector = (vector float)(0.0);
6192 	vector unsigned char vecPerm;
6193 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
6194 
6195 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6196 
6197 	// populate planes
6198 	v0 = vec_ld( 0, planePtr );
6199 	v1 = vec_ld( 15, planePtr );
6200 	vecPlane0 = vec_perm( v0, v1, vecPerm );
6201 
6202 	v2 = vec_ld( 0, planePtr + 4 );
6203 	v3 = vec_ld( 15, planePtr + 4 );
6204 	vecPlane1 = vec_perm( v2, v3, vecPerm );
6205 
6206 	v0 = vec_ld( 0, planePtr + 8 );
6207 	v1 = vec_ld( 15, planePtr + 8 );
6208 	vecPlane2 = vec_perm( v0, v1, vecPerm );
6209 
6210 	v2 = vec_ld( 0, planePtr + 12 );
6211 	v3 = vec_ld( 15, planePtr + 12 );
6212 	vecPlane3 = vec_perm( v2, v3, vecPerm );
6213 
6214 	v0 = vec_ld( 0, planePtr + 16 );
6215 	v1 = vec_ld( 15, planePtr + 16 );
6216 	vecPlane4 = vec_perm( v0, v1, vecPerm );
6217 
6218 	v2 = vec_ld( 0, planePtr + 20 );
6219 	v3 = vec_ld( 15, planePtr + 20 );
6220 	vecPlane5 = vec_perm( v2, v3, vecPerm );
6221 
6222 	// transpose
6223 	v0 = vec_mergeh( vecPlane0, vecPlane2 );
6224 	v1 = vec_mergeh( vecPlane1, vecPlane3 );
6225 	v2 = vec_mergel( vecPlane0, vecPlane2 );
6226 	v3 = vec_mergel( vecPlane1, vecPlane3 );
6227 
6228 	vecPlane0 = vec_mergeh( v0, v1 );
6229 	vecPlane1 = vec_mergel( v0, v1 );
6230 	vecPlane2 = vec_mergeh( v2, v3 );
6231 	vecPlane3 = vec_mergel( v2, v3 );
6232 
6233 	v0 = vec_mergeh( vecPlane4, zeroVector );
6234 	v1 = vec_mergeh( vecPlane5, zeroVector );
6235 	v2 = vec_mergel( vecPlane4, zeroVector );
6236 	v3 = vec_mergel( vecPlane5, zeroVector );
6237 
6238 	vecPlane4 = vec_mergeh( v0, v1 );
6239 	vecPlane5 = vec_mergel( v0, v1 );
6240 	vecPlane6 = vec_mergeh( v2, v3 );
6241 	vecPlane7 = vec_mergel( v2, v3 );
6242 
6243 
6244 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6245 	vector bool int oneIntVector = (vector bool int)(1);
6246 	vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
6247 	vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
6248 	vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
6249 
6250 	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
6251 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
6252 	vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6253 	vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6254 	vector unsigned int vecR1, vecR2, vecR3, vecR4;
6255 	vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6256 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6257 	unsigned int vBits[4];
6258 	vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6259 
6260 	i = 0;
6261 
6262 	for ( ; i+3 < numVerts; i+=4 ) {
6263 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
6264 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6265 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6266 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6267 
6268 		v0 = vec_ld( 0, vertPtr );
6269 		v2 = vec_ld( 0, vertPtr2 );
6270 		v4 = vec_ld( 0, vertPtr3 );
6271 		v6 = vec_ld( 0, vertPtr4 );
6272 
6273 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6274 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6275 		vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6276 		vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6277 
6278 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6279 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6280 		vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6281 		vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6282 
6283 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6284 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6285 		vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6286 		vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6287 
6288 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6289 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6290 		vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6291 		vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6292 
6293 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6294 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6295 		vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6296 		vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6297 
6298 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6299 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6300 		vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6301 		vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6302 
6303 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6304 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6305 		vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6306 		vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6307 
6308 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6309 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6310 		vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6311 		vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6312 
6313 		vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6314 		vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6315 		vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6316 		vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6317 		vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6318 		vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6319 		vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6320 		vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6321 
6322 		//and it with 1 so we multiply by 1 not 1111's
6323 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
6324 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
6325 		vecCmp3 = vec_and( vecCmp3, oneIntVector );
6326 		vecCmp4 = vec_and( vecCmp4, oneIntVector );
6327 		vecCmp5 = vec_and( vecCmp5, oneIntVector );
6328 		vecCmp6 = vec_and( vecCmp6, oneIntVector );
6329 		vecCmp7 = vec_and( vecCmp7, oneIntVector );
6330 		vecCmp8 = vec_and( vecCmp8, oneIntVector );
6331 
6332 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6333 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6334 		vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6335 		vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6336 		vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6337 		vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6338 		vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6339 		vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6340 
6341 		//OR them all together (this is the same as adding them, since they're all only 1 bit set)
6342 		vecR1 = (vector unsigned int)(0); //zeroIntVector;
6343 		vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6344 		vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6345 		vecR1 = vec_add(vecR1, vecBitShifted2 );
6346 		vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6347 
6348 		vecR2 = (vector unsigned int)(0); //zeroIntVector;
6349 		vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6350 		vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6351 		vecR2 = vec_add(vecR2, vecBitShifted4 );
6352 		vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6353 
6354 		vecR3 = (vector unsigned int)(0); //zeroIntVector;
6355 		vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6356 		vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6357 		vecR3 = vec_add(vecR3, vecBitShifted6 );
6358 		vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6359 
6360 		vecR4 = (vector unsigned int)(0); //zeroIntVector;
6361 		vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6362 		vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6363 		vecR4 = vec_add(vecR4, vecBitShifted8 );
6364 		vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6365 
6366 		// take the first element from each vector and put them into vecR1
6367 		vecR1 = vec_mergeh( vecR1, vecR2 );
6368 		vecR3 = vec_mergeh( vecR3, vecR4 );
6369 		vecR1 = vec_perm( vecR1, vecR3, permHalves );
6370 
6371 		// XOR with 0x3F to flip lower 6 bits
6372 		vecR1 = vec_xor( vecR1, vecFlipBits );
6373 
6374 		// store out results. don't have 16 at a time so let's just
6375 		// do this and avoid alignment concerns
6376 		vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6377 		vec_ste( vecR1, 0, &vBits[0] );
6378 		vec_ste( vecR1, 4, &vBits[0] );
6379 		vec_ste( vecR1, 8, &vBits[0] );
6380 		vec_ste( vecR1, 12, &vBits[0] );
6381 
6382 		cullBits[i] = vBits[0];
6383 		cullBits[i+1] = vBits[1];
6384 		cullBits[i+2] = vBits[2];
6385 		cullBits[i+3] = vBits[3];
6386 	}
6387 
6388 	for ( ; i < numVerts; i++ ) {
6389 		byte bits;
6390 		float d0, d1, d2, d3, d4, d5;
6391 		const idVec3 &v = verts[i].xyz;
6392 
6393 		d0 = planes[0].Distance( v );
6394 		d1 = planes[1].Distance( v );
6395 		d2 = planes[2].Distance( v );
6396 		d3 = planes[3].Distance( v );
6397 		d4 = planes[4].Distance( v );
6398 		d5 = planes[5].Distance( v );
6399 
6400 		// they check if the sign bit is set by casting as long and shifting right 31 places.
6401 		bits  = FLOATSIGNBITSET( d0 ) << 0;
6402 		bits |= FLOATSIGNBITSET( d1 ) << 1;
6403 		bits |= FLOATSIGNBITSET( d2 ) << 2;
6404 		bits |= FLOATSIGNBITSET( d3 ) << 3;
6405 		bits |= FLOATSIGNBITSET( d4 ) << 4;
6406 		bits |= FLOATSIGNBITSET( d5 ) << 5;
6407 
6408 		cullBits[i] = bits ^ 0x3F;		// flip lower 6 bits
6409 	}
6410 }
6411 
6412 
6413 #endif /*DRAWVERT_PADDED */
6414 
6415 #ifndef DRAWVERT_PADDED
6416 /*
6417 ============
6418 idSIMD_AltiVec::OverlayPointCull
6419 ============
6420 */
OverlayPointCull(byte * cullBits,idVec2 * texCoords,const idPlane * planes,const idDrawVert * verts,const int numVerts)6421 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6422 
6423 	// idDrawVert size
6424 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6425 
6426 	int i;
6427 
6428 	float p0x, p0y, p0z, p0d;
6429 	float p1x, p1y, p1z, p1d;
6430 
6431 	const float *planePtr = planes[0].ToFloatPtr();
6432 	const float *vertPtr = verts[0].xyz.ToFloatPtr();
6433 
6434 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6435 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
6436 	vector unsigned char vecPerm;
6437 	vector float zeroVector = (vector float)(0);
6438 
6439 	p0x = *(planePtr + 0);
6440 	p0y = *(planePtr + 1);
6441 	p0z = *(planePtr + 2);
6442 	p0d = *(planePtr + 3);
6443 	p1x = *(planePtr + 4);
6444 	p1y = *(planePtr + 5);
6445 	p1z = *(planePtr + 6);
6446 	p1d = *(planePtr + 7);
6447 
6448 	// populate the planes
6449 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6450 	v0 = vec_ld( 0, planePtr );
6451 	v1 = vec_ld( 15, planePtr );
6452 	vecPlane0 = vec_perm( v0, v1, vecPerm );
6453 
6454 	v2 = vec_ld( 31, planePtr );
6455 	vecPlane1 = vec_perm( v1, v2, vecPerm );
6456 
6457 	// transpose
6458 	v0 = vec_mergeh( vecPlane0, vecPlane0 );
6459 	v1 = vec_mergeh( vecPlane1, vecPlane1 );
6460 	v2 = vec_mergel( vecPlane0, vecPlane0 );
6461 	v3 = vec_mergel( vecPlane1, vecPlane1);
6462 
6463 	vecPlane0 = vec_mergeh( v0, v1 );
6464 	vecPlane1 = vec_mergel( v0, v1 );
6465 	vecPlane2 = vec_mergeh( v2, v3 );
6466 	vecPlane3 = vec_mergel( v2, v3 );
6467 
6468 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6469 	vector float oneVector = (vector float)(1);
6470 
6471 	vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6472 
6473 	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6474 	vector float negTwoVector = (vector float)(-2);
6475 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6476 	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6477 	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6478 	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6479 	vector bool int oneIntVector = (vector bool int)(1);
6480 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6481 	unsigned int cullBitVal[4];
6482 	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6483 
6484 	i = 0;
6485 	// every fourth one will have the same alignment. Make sure we've got enough here
6486 	if ( i+3 < numVerts ) {
6487 		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6488 		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6489 		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6490 		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6491 	}
6492 
6493 
6494 	for ( ; i+3 < numVerts; i+=4 ) {
6495 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
6496 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6497 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6498 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6499 
6500 		v0 = vec_ld( 0, vertPtr );
6501 		v1 = vec_ld( 15, vertPtr );
6502 		v2 = vec_ld( 0, vertPtr2 );
6503 		v3 = vec_ld( 15, vertPtr2 );
6504 		v4 = vec_ld( 0, vertPtr3 );
6505 		v5 = vec_ld( 15, vertPtr3 );
6506 		v6 = vec_ld( 0, vertPtr4 );
6507 		v7 = vec_ld( 15, vertPtr4 );
6508 
6509 		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6510 		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6511 		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6512 		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6513 
6514 		// like a splat, but only doing halves
6515 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6516 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6517 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6518 		vecSum1 = vec_add( vecSum1, vecPlane3 );
6519 
6520 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6521 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6522 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6523 		vecSum2 = vec_add( vecSum2, vecPlane3 );
6524 
6525 		// store out results
6526 		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6527 
6528 		// bit manipulation
6529 		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6530 		vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6531 
6532 		//and it with 1 so we multiply by 1 not 1111's
6533 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
6534 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
6535 
6536 		 // store out and write to cullBits
6537 		// finally, a use for algebra! 1-x = x + 1 - 2x
6538 		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6539 		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6540 		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6541 		vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6542 
6543 		// do the same comparisons for the inverted d0/d1
6544 		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6545 		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6546 
6547 		//and it with 1 so we multiply by 1 not 1111's
6548 		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6549 		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6550 
6551 		// shift them as needed
6552 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6553 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6554 		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6555 		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6556 
6557 		// OR them all together. since only 1 bit is set for each value, thats
6558 		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
6559 		vector unsigned int vecResult;
6560 		vector unsigned int vecResult2;
6561 		vector unsigned int vecResult3;
6562 		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6563 
6564 		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6565 
6566 		// vecResult now holds the values without the inverses yet, so add those
6567 		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6568 		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6569 		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6570 		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6571 
6572 		vecResult = vec_add( vecResult, vecResult2 );
6573 
6574 		//store out results
6575 		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6576 		vec_ste( vecResult, 0, &cullBitVal[0] );
6577 		vec_ste( vecResult, 4, &cullBitVal[0] );
6578 		vec_ste( vecResult, 8, &cullBitVal[0] );
6579 		vec_ste( vecResult, 12, &cullBitVal[0] );
6580 
6581 		cullBits[i] = cullBitVal[0];
6582 		cullBits[i+1] = cullBitVal[1];
6583 		cullBits[i+2] = cullBitVal[2];
6584 		cullBits[i+3] = cullBitVal[3];
6585 	}
6586 
6587 	// cleanup
6588 	for ( ; i < numVerts; i++ ) {
6589 		byte bits;
6590 		float d0, d1;
6591 		float vx, vy, vz;
6592 
6593 		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6594 		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6595 		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6596 
6597 		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6598 		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6599 		texCoords[i][0] = d0;
6600 		texCoords[i][1] = d1;
6601 
6602 		bits = ( d0 >= 0 ) ? 0 : 1;
6603 		d0 = 1.0f - d0;
6604 		bits |= ( d1 >= 0 ) ? 0 : 1*2;
6605 		d1 = 1.0f - d1;
6606 
6607 		bits |= ( d0 >= 0 ) ? 0: 1*4;
6608 		bits |= ( d1 >= 0 ) ? 0: 1*8;
6609 
6610 		cullBits[i] = bits;
6611 	}
6612 }
6613 #else
6614 
6615 /*
6616 ============
6617 idSIMD_AltiVec::OverlayPointCull
6618 ============
6619 */
OverlayPointCull(byte * cullBits,idVec2 * texCoords,const idPlane * planes,const idDrawVert * verts,const int numVerts)6620 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6621 
6622 	// idDrawVert size
6623 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6624 
6625 	int i;
6626 
6627 	float p0x, p0y, p0z, p0d;
6628 	float p1x, p1y, p1z, p1d;
6629 
6630 	const float *planePtr = planes[0].ToFloatPtr();
6631 	const float *vertPtr = verts[0].xyz.ToFloatPtr();
6632 
6633 	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6634 	vector float v0, v1, v2, v3, v4, v5, v6, v7;
6635 	vector unsigned char vecPerm;
6636 	vector float zeroVector = (vector float)(0);
6637 
6638 	p0x = *(planePtr + 0);
6639 	p0y = *(planePtr + 1);
6640 	p0z = *(planePtr + 2);
6641 	p0d = *(planePtr + 3);
6642 	p1x = *(planePtr + 4);
6643 	p1y = *(planePtr + 5);
6644 	p1z = *(planePtr + 6);
6645 	p1d = *(planePtr + 7);
6646 
6647 	// populate the planes
6648 	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6649 	v0 = vec_ld( 0, planePtr );
6650 	v1 = vec_ld( 15, planePtr );
6651 	vecPlane0 = vec_perm( v0, v1, vecPerm );
6652 
6653 	v2 = vec_ld( 31, planePtr );
6654 	vecPlane1 = vec_perm( v1, v2, vecPerm );
6655 
6656 	// transpose
6657 	v0 = vec_mergeh( vecPlane0, vecPlane0 );
6658 	v1 = vec_mergeh( vecPlane1, vecPlane1 );
6659 	v2 = vec_mergel( vecPlane0, vecPlane0 );
6660 	v3 = vec_mergel( vecPlane1, vecPlane1);
6661 
6662 	vecPlane0 = vec_mergeh( v0, v1 );
6663 	vecPlane1 = vec_mergel( v0, v1 );
6664 	vecPlane2 = vec_mergeh( v2, v3 );
6665 	vecPlane3 = vec_mergel( v2, v3 );
6666 
6667 	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6668 	vector float oneVector = (vector float)(1);
6669 
6670 	vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6671 
6672 	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6673 	vector float negTwoVector = (vector float)(-2);
6674 	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6675 	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6676 	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6677 	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6678 	vector bool int oneIntVector = (vector bool int)(1);
6679 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6680 	unsigned int cullBitVal[4];
6681 	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6682 
6683 	i = 0;
6684 
6685 	for ( ; i+3 < numVerts; i+=4 ) {
6686 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
6687 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6688 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6689 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6690 
6691 		vecXYZ1 = vec_ld( 0, vertPtr );
6692 		vecXYZ2 = vec_ld( 0, vertPtr2 );
6693 		vecXYZ3 = vec_ld( 0, vertPtr3 );
6694 		vecXYZ4 = vec_ld( 0, vertPtr4 );
6695 
6696 		// like a splat, but only doing halves
6697 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6698 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6699 		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6700 		vecSum1 = vec_add( vecSum1, vecPlane3 );
6701 
6702 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6703 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6704 		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6705 		vecSum2 = vec_add( vecSum2, vecPlane3 );
6706 
6707 		// store out results
6708 		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6709 
6710 		// bit manipulation
6711 		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6712 		vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6713 
6714 		//and it with 1 so we multiply by 1 not 1111's
6715 		vecCmp1 = vec_and( vecCmp1, oneIntVector );
6716 		vecCmp2 = vec_and( vecCmp2, oneIntVector );
6717 
6718 		 // store out and write to cullBits
6719 		// finally, a use for algebra! 1-x = x + 1 - 2x
6720 		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6721 		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6722 		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6723 		vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6724 
6725 		// do the same comparisons for the inverted d0/d1
6726 		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6727 		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6728 
6729 		//and it with 1 so we multiply by 1 not 1111's
6730 		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6731 		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6732 
6733 		// shift them as needed
6734 		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6735 		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6736 		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6737 		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6738 
6739 		// OR them all together. since only 1 bit is set for each value, thats
6740 		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
6741 		vector unsigned int vecResult;
6742 		vector unsigned int vecResult2;
6743 		vector unsigned int vecResult3;
6744 		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6745 
6746 		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6747 
6748 		// vecResult now holds the values without the inverses yet, so add those
6749 		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6750 		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6751 		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6752 		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6753 
6754 		vecResult = vec_add( vecResult, vecResult2 );
6755 
6756 		//store out results
6757 		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6758 		vec_ste( vecResult, 0, &cullBitVal[0] );
6759 		vec_ste( vecResult, 4, &cullBitVal[0] );
6760 		vec_ste( vecResult, 8, &cullBitVal[0] );
6761 		vec_ste( vecResult, 12, &cullBitVal[0] );
6762 
6763 		cullBits[i] = cullBitVal[0];
6764 		cullBits[i+1] = cullBitVal[1];
6765 		cullBits[i+2] = cullBitVal[2];
6766 		cullBits[i+3] = cullBitVal[3];
6767 	}
6768 
6769 	// cleanup
6770 	for ( ; i < numVerts; i++ ) {
6771 		byte bits;
6772 		float d0, d1;
6773 		float vx, vy, vz;
6774 
6775 		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6776 		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6777 		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6778 
6779 		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6780 		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6781 		texCoords[i][0] = d0;
6782 		texCoords[i][1] = d1;
6783 
6784 		bits = ( d0 >= 0 ) ? 0 : 1;
6785 		d0 = 1.0f - d0;
6786 		bits |= ( d1 >= 0 ) ? 0 : 1*2;
6787 		d1 = 1.0f - d1;
6788 
6789 		bits |= ( d0 >= 0 ) ? 0: 1*4;
6790 		bits |= ( d1 >= 0 ) ? 0: 1*8;
6791 
6792 		cullBits[i] = bits;
6793 	}
6794 }
6795 
6796 
6797 #endif /* DRAWVERT_PADDED */
6798 
6799 #endif /* ENABLE_CULL */
6800 
6801 #ifdef ENABLE_DERIVE
6802 /*
6803 ============
6804 idSIMD_AltiVec::DeriveTriPlanes
6805 
6806 	Derives a plane equation for each triangle.
6807 ============
6808 */
DeriveTriPlanes(idPlane * planes,const idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)6809 void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
6810 
6811 	// idDrawVert size
6812 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6813 	// idPlane size
6814 	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
6815 	int i;
6816 
6817 	vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
6818 	vector float vecVertA, vecVertB, vecVertC;
6819 	vector float vecVertA2, vecVertB2, vecVertC2;
6820 	vector float vecVertA3, vecVertB3, vecVertC3;
6821 	vector float vecVertA4, vecVertB4, vecVertC4;
6822 
6823 	vector float vecN, vecN2, vecN3, vecN4;
6824 	vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
6825 	vector unsigned char vecPerm1 =  (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
6826 	vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
6827 	vector float vecF;
6828 	vector float vecF1, vecF2, vecF3, vecF4;
6829 	vector float zeroVector = (vector float)(0);
6830 	vector float vecNegOne = (vector float)(-1);
6831 	vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
6832 
6833 	vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
6834 	vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
6835 	vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
6836 
6837 	vector unsigned char oneVector = (vector unsigned char)(1);
6838 	vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
6839 	vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
6840 
6841 	const float *xyzPtr = verts[0].xyz.ToFloatPtr();
6842 	float *planePtr = planes[0].ToFloatPtr();
6843 
6844 	int j;
6845 	for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
6846 
6847 #ifndef DRAWVERT_PADDED
6848 		// calculate permute vectors to load as needed. these are all
6849 		// triangle indexes and are usaully pretty close together but
6850 		// not guaranteed to be in any particular order
6851 		vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
6852 		vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
6853 		vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
6854 		vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
6855 		vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
6856 		vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
6857 		vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
6858 		vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
6859 		vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
6860 		vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
6861 		vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
6862 		vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
6863 #endif
6864 
6865 #ifndef DRAWVERT_PADDED
6866 		// load first A B C
6867 		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6868 		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6869 		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6870 		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6871 		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6872 		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6873 
6874 		vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
6875 		vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
6876 		vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
6877 
6878 		// set the last element to 0
6879 		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6880 		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6881 		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6882 
6883 		// load second A B C
6884 		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6885 		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6886 		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6887 		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6888 		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6889 		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6890 
6891 		vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
6892 		vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
6893 		vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
6894 
6895 		// set the last element to 0
6896 		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6897 		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6898 		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6899 
6900 		// load third A B C
6901 		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6902 		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6903 		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6904 		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6905 		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6906 		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6907 
6908 		vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
6909 		vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
6910 		vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
6911 
6912 		// set the last element to 0
6913 		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6914 		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6915 		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6916 
6917 		// load the fourth A B C
6918 		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6919 		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6920 		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6921 		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6922 		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6923 		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6924 
6925 		vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
6926 		vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
6927 		vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
6928 
6929 		// set the last element to 0
6930 		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6931 		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6932 		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6933 #else
6934 		// load first A B C
6935 		vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6936 		vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6937 		vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6938 
6939 		// set the last element to 0
6940 		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6941 		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6942 		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6943 
6944 		// load second A B C
6945 		vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6946 		vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6947 		vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6948 
6949 		// set the last element to 0
6950 		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6951 		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6952 		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6953 
6954 		// load third A B C
6955 		vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6956 		vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6957 		vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6958 
6959 		// set the last element to 0
6960 		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6961 		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6962 		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6963 
6964 		// load the fourth A B C
6965 		vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6966 		vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6967 		vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6968 
6969 		// set the last element to 0
6970 		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6971 		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6972 		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6973 #endif
6974 		// calculate d0 and d1 for each
6975 		vecD0 = vec_sub( vecVertB, vecVertA );
6976 		vecD1 = vec_sub( vecVertC, vecVertA );
6977 
6978 		vecD2 = vec_sub( vecVertB2, vecVertA2 );
6979 		vecD3 = vec_sub( vecVertC2, vecVertA2 );
6980 
6981 		vecD4 = vec_sub( vecVertB3, vecVertA3 );
6982 		vecD5 = vec_sub( vecVertC3, vecVertA3 );
6983 
6984 		vecD6 = vec_sub( vecVertB4, vecVertA4 );
6985 		vecD7 = vec_sub( vecVertC4, vecVertA4 );
6986 
6987 		vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
6988 		vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
6989 		vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
6990 		vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
6991 		vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
6992 		vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
6993 		vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
6994 		vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
6995 
6996 		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
6997 		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
6998 		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
6999 		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7000 
7001 		vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
7002 		vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
7003 		vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
7004 		vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
7005 		vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
7006 		vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
7007 		vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
7008 		vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
7009 
7010 		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7011 		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7012 		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7013 		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7014 
7015 		vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7016 		vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7017 		vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7018 		vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
7019 
7020 		// transpose vecNs
7021 		vector float v0, v1, v2, v3;
7022 		v0 = vec_mergeh( vecN, vecN3 );
7023 		v1 = vec_mergeh( vecN2, vecN4 );
7024 		v2 = vec_mergel( vecN, vecN3 );
7025 		v3 = vec_mergel( vecN2, vecN4 );
7026 
7027 		vecN = vec_mergeh( v0, v1 );
7028 		vecN2 = vec_mergel( v0, v1 );
7029 		vecN3 = vec_mergeh( v2, v3 );
7030 		vecN4 = vec_mergel( v2, v3 );
7031 
7032 		vecF = vec_madd( vecN, vecN, zeroVector );
7033 		vecF = vec_madd( vecN2, vecN2, vecF );
7034 		vecF = vec_madd( vecN3, vecN3, vecF );
7035 
7036 		vecF = ReciprocalSquareRoot( vecF );
7037 
7038 		vecF1 = vec_madd( vecF, vecN, zeroVector );
7039 		vecF2 = vec_madd( vecF, vecN2, zeroVector );
7040 		vecF3 = vec_madd( vecF, vecN3, zeroVector );
7041 		vecF4 = vec_madd( vecF, vecN4, zeroVector );
7042 
7043 		vector float v8, v9, v10, v11;
7044 		v8 = vecF1;
7045 		v9 = vecF2;
7046 		v10 = vecF3;
7047 		v11 = vecF4;
7048 
7049 		// transpose vecVerts
7050 		v0 = vec_mergeh( vecVertA, vecVertA3 );
7051 		v1 = vec_mergeh( vecVertA2, vecVertA4 );
7052 		v2 = vec_mergel( vecVertA, vecVertA3 );
7053 		v3 = vec_mergel( vecVertA2, vecVertA4 );
7054 
7055 		vecVertA = vec_mergeh( v0, v1 );
7056 		vecVertA2 = vec_mergel( v0, v1 );
7057 		vecVertA3 = vec_mergeh( v2, v3 );
7058 		vecVertA4 = vec_mergel( v2, v3 );
7059 
7060 		vector float vecTotals;
7061 		vecTotals = vec_madd( vecVertA, v8, zeroVector );
7062 		vecTotals = vec_madd( vecVertA2, v9, vecTotals );
7063 		vecTotals = vec_madd( vecVertA3, v10, vecTotals );
7064 		vecTotals = vec_madd( vecVertA4, v11, vecTotals );
7065 		vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
7066 
7067 		// transpose vecFs
7068 		v0 = vec_mergeh( vecF1, vecF3 );
7069 		v1 = vec_mergeh( vecF2, vecF );
7070 		v2 = vec_mergel( vecF1, vecF3 );
7071 		v3 = vec_mergel( vecF2, vecF );
7072 
7073 		vecF1 = vec_mergeh( v0, v1 );
7074 		vecF2 = vec_mergel( v0, v1 );
7075 		vecF3 = vec_mergeh( v2, v3 );
7076 		vecF4 = vec_mergel( v2, v3 );
7077 
7078 		// store results
7079 		UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
7080 	}
7081 
7082 	// cleanup
7083 	for ( ; i < numIndexes; i += 3, j++ ) {
7084 		const idDrawVert *a, *b, *c;
7085 		float d0[3], d1[3], f;
7086 		idVec3 n;
7087 
7088 		a = verts + indexes[i + 0];
7089 		b = verts + indexes[i + 1];
7090 		c = verts + indexes[i + 2];
7091 
7092 		d0[0] = b->xyz[0] - a->xyz[0];
7093 		d0[1] = b->xyz[1] - a->xyz[1];
7094 		d0[2] = b->xyz[2] - a->xyz[2];
7095 
7096 		d1[0] = c->xyz[0] - a->xyz[0];
7097 		d1[1] = c->xyz[1] - a->xyz[1];
7098 		d1[2] = c->xyz[2] - a->xyz[2];
7099 
7100 		n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7101 		n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7102 		n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7103 
7104 		f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7105 		//idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7106 
7107 		n.x *= f;
7108 		n.y *= f;
7109 		n.z *= f;
7110 
7111 		planes[j].SetNormal( n );
7112 		planes[j].FitThroughPoint( a->xyz );
7113 	}
7114 }
7115 
7116 /*
7117 ============
7118 idSIMD_AltiVec::DeriveTangents
7119 
7120 	Derives the normal and orthogonal tangent vectors for the triangle vertices.
7121 	For each vertex the normal and tangent vectors are derived from all triangles
7122 	using the vertex which results in smooth tangents across the mesh.
7123 	In the process the triangle planes are calculated as well.
7124 
7125 ============
7126 */
DeriveTangents(idPlane * planes,idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)7127 void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
7128 	int i;
7129 
7130 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
7131 	memset( used, 0, numVerts * sizeof( used[0] ) );
7132 
7133 	idPlane *planesPtr = planes;
7134 	for ( i = 0; i < numIndexes; i += 3 ) {
7135 		idDrawVert *a, *b, *c;
7136 	//	unsigned long signBit;
7137 		float d0[5], d1[5], area;
7138 		idVec3 n, t0, t1;
7139 		float f1, f2, f3;
7140 
7141 		int v0 = indexes[i + 0];
7142 		int v1 = indexes[i + 1];
7143 		int v2 = indexes[i + 2];
7144 
7145 		a = verts + v0;
7146 		b = verts + v1;
7147 		c = verts + v2;
7148 
7149 		d0[0] = b->xyz[0] - a->xyz[0];
7150 		d0[1] = b->xyz[1] - a->xyz[1];
7151 		d0[2] = b->xyz[2] - a->xyz[2];
7152 		d0[3] = b->st[0] - a->st[0];
7153 		d0[4] = b->st[1] - a->st[1];
7154 
7155 		d1[0] = c->xyz[0] - a->xyz[0];
7156 		d1[1] = c->xyz[1] - a->xyz[1];
7157 		d1[2] = c->xyz[2] - a->xyz[2];
7158 		d1[3] = c->st[0] - a->st[0];
7159 		d1[4] = c->st[1] - a->st[1];
7160 
7161 		// normal
7162 		n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7163 		n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7164 		n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7165 
7166 		f1 =  n.x * n.x + n.y * n.y + n.z * n.z;
7167 
7168 		// area sign bit
7169 		area = d0[3] * d1[4] - d0[4] * d1[3];
7170 
7171 		// first tangent
7172 		t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
7173 		t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
7174 		t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
7175 
7176 		f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
7177 
7178 		// second tangent
7179 		t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
7180 		t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
7181 		t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
7182 
7183 		f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
7184 
7185 		// Behold! The power of the pipeline
7186 		FastScalarInvSqrt_x3( &f1, &f2, &f3 );
7187 #ifdef PPC_INTRINSICS
7188 		f2 = __fsel( area, f2, -f2 );
7189 		f3 = __fsel( area, f3, -f3 );
7190 #else
7191 		f2 = ( area < 0.0f ) ? -f2 : f2;
7192 		f3 = ( area < 0.0f ) ? -f3 : f3;
7193 #endif
7194 		t0.x *= f2;
7195 		t0.y *= f2;
7196 		t0.z *= f2;
7197 
7198 		n.x *= f1;
7199 		n.y *= f1;
7200 		n.z *= f1;
7201 
7202 		planesPtr->SetNormal( n );
7203 		planesPtr->FitThroughPoint( a->xyz );
7204 		planesPtr++;
7205 
7206 		t1.x *= f3;
7207 		t1.y *= f3;
7208 		t1.z *= f3;
7209 
7210 		if ( used[v0] ) {
7211 			a->normal += n;
7212 			a->tangents[0] += t0;
7213 			a->tangents[1] += t1;
7214 		} else {
7215 			a->normal = n;
7216 			a->tangents[0] = t0;
7217 			a->tangents[1] = t1;
7218 			used[v0] = true;
7219 		}
7220 
7221 		if ( used[v1] ) {
7222 			b->normal += n;
7223 			b->tangents[0] += t0;
7224 			b->tangents[1] += t1;
7225 		} else {
7226 			b->normal = n;
7227 			b->tangents[0] = t0;
7228 			b->tangents[1] = t1;
7229 			used[v1] = true;
7230 		}
7231 
7232 		if ( used[v2] ) {
7233 			c->normal += n;
7234 			c->tangents[0] += t0;
7235 			c->tangents[1] += t1;
7236 		} else {
7237 			c->normal = n;
7238 			c->tangents[0] = t0;
7239 			c->tangents[1] = t1;
7240 			used[v2] = true;
7241 		}
7242 	}
7243 }
7244 
7245 
7246 #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
7247 
7248 /*
7249 ============
7250 idSIMD_AltiVec::DeriveUnsmoothedTangents
7251 
7252 	Derives the normal and orthogonal tangent vectors for the triangle vertices.
7253 	For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7254 ============
7255 */
7256 #define DERIVE_UNSMOOTHED_BITANGENT
DeriveUnsmoothedTangents(idDrawVert * verts,const dominantTri_s * dominantTris,const int numVerts)7257 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7258 
7259 	int i;
7260 	// idDrawVert size
7261 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7262 	// drawverts aligned
7263 	assert( IS_16BYTE_ALIGNED( verts[0] ) );
7264 
7265 	vector float vecVertA, vecVertB, vecVertC;
7266 	vector float vecVertA2, vecVertB2, vecVertC2;
7267 	vector float vecVertA3, vecVertB3, vecVertC3;
7268 	vector float vecVertA4, vecVertB4, vecVertC4;
7269 
7270 	vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
7271 	vector float vecS0, vecS1, vecS2;
7272 	vector float vecS0_2, vecS1_2, vecS2_2;
7273 	vector float vecS0_3, vecS1_3, vecS2_3;
7274 	vector float vecS0_4, vecS1_4, vecS2_4;
7275 
7276 	vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
7277 	vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
7278 	vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
7279 	vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
7280 	vector float vecN, vecN2, vecN3, vecN4;
7281 
7282 	vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
7283 	vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
7284 	vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
7285 	vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
7286 	vector float zeroVector = (vector float)(0);
7287 
7288 	vector float vecNegOne = (vector float)(-1.0);
7289 
7290 	vector float vecStore1, vecStore2, vecStore3;
7291 	vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7292 	vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7293 	vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
7294 	vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
7295 	vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7296 	vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
7297 	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7298 
7299 	vector float vecLd1, vecLd2, vecLd3;
7300 	vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
7301 
7302 	float *normalPtr = verts[0].normal.ToFloatPtr();
7303 	float *xyzPtr = verts[0].xyz.ToFloatPtr();
7304 
7305 	vector float vecFirstHalf, vecSecondHalf;
7306 	vector float vecFirstHalf2, vecSecondHalf2;
7307 	vector float vecFirstHalf3, vecSecondHalf3;
7308 	vector float vecFirstHalf4, vecSecondHalf4;
7309 
7310 	for ( i = 0; i+3 < numVerts; i+=4 ) {
7311 		int bOffset1, bOffset2, bOffset3, bOffset4;
7312 		int cOffset1, cOffset2, cOffset3, cOffset4;
7313 
7314 		bOffset1 = dominantTris[i].v2;
7315 		cOffset1 = dominantTris[i].v3;
7316 		bOffset2 = dominantTris[i+1].v2;
7317 		cOffset2 = dominantTris[i+1].v3;
7318 		bOffset3 = dominantTris[i+2].v2;
7319 		cOffset3 = dominantTris[i+2].v3;
7320 		bOffset4 = dominantTris[i+3].v2;
7321 		cOffset4 = dominantTris[i+3].v3;
7322 
7323 		vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
7324 		v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
7325 		v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
7326 		vecVertA = vec_perm( v0, v1, vecPerm0 );
7327 
7328 		vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
7329 		v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7330 		v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7331 		vecVertB = vec_perm( v2, v3, vecPerm1 );
7332 
7333 		vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7334 		v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7335 		v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7336 		vecVertC = vec_perm( v4, v5, vecPerm2 );
7337 
7338 		// put remainder into v2
7339 		v1 = vec_perm( v1, v1, vecPerm0 );
7340 		v3 = vec_perm( v3, v3, vecPerm1 );
7341 		v5 = vec_perm( v5, v5, vecPerm2 );
7342 
7343 		v1 = vec_mergeh( v1, v5 );
7344 		v2 = vec_mergeh( v3, zeroVector );
7345 		v2 = vec_mergeh( v1, v2 );
7346 		v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7347 
7348 		// load second one
7349 		vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7350 		v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7351 		v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7352 		vecVertA2 = vec_perm( v0, v1, vecPerm0 );
7353 
7354 		vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
7355 		v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7356 		v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7357 		vecVertB2 = vec_perm( v3, v4, vecPerm3 );
7358 
7359 		vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7360 		v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7361 		v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7362 		vecVertC2 = vec_perm( v5, v6, vecPerm4 );
7363 
7364 		// put remainder into v3
7365 		v1 = vec_perm( v1, v1, vecPerm0 );
7366 		v4 = vec_perm( v4, v4, vecPerm3 );
7367 		v5 = vec_perm( v6, v6, vecPerm4 );
7368 
7369 		v1 = vec_mergeh( v1, v5 );
7370 		v3 = vec_mergeh( v4, zeroVector );
7371 		v3 = vec_mergeh( v1, v3 );
7372 		v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7373 
7374 		// load third one
7375 		vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7376 		v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7377 		v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7378 		vecVertA3 = vec_perm( v0, v1, vecPerm0 );
7379 
7380 		vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
7381 		v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7382 		v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7383 		vecVertB3 = vec_perm( v4, v5, vecPerm1 );
7384 
7385 		vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7386 		v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7387 		v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7388 		vecVertC3 = vec_perm( v6, v7, vecPerm2 );
7389 
7390 		// put remainder into v4
7391 		v1 = vec_perm( v1, v1, vecPerm0 );
7392 		v5 = vec_perm( v5, v5, vecPerm1 );
7393 		v7 = vec_perm( v7, v7, vecPerm2 );
7394 
7395 		v1 = vec_mergeh( v1, v7 );
7396 		v4 = vec_mergeh( v5, zeroVector );
7397 		v4 = vec_mergeh( v1, v4 );
7398 		v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7399 
7400 		// load fourth one
7401 		vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7402 		v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7403 		v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7404 		vecVertA4 = vec_perm( v0, v1, vecPerm0 );
7405 
7406 		vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
7407 		v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7408 		v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7409 		vecVertB4 = vec_perm( v5, v6, vecPerm3 );
7410 
7411 		vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7412 		v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7413 		v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7414 		vecVertC4 = vec_perm( v7, v8, vecPerm4 );
7415 
7416 		// put remainder into v5
7417 		v1 = vec_perm( v1, v1, vecPerm0 );
7418 		v6 = vec_perm( v6, v6, vecPerm3 );
7419 		v8 = vec_perm( v8, v8, vecPerm4 );
7420 
7421 		v1 = vec_mergeh( v1, v8 );
7422 		v5 = vec_mergeh( v6, zeroVector );
7423 		v5 = vec_mergeh( v1, v5 );
7424 		v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7425 
7426 		// remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
7427 
7428 		//vecD1 now holds d0, d1, d2, d3
7429 		vecD1 = vec_sub( vecVertB, vecVertA );
7430 		vecD4 = vec_sub( vecVertB2, vecVertA2 );
7431 		vecD7 = vec_sub( vecVertB3, vecVertA3 );
7432 		vecD10 = vec_sub( vecVertB4, vecVertA4 );
7433 
7434 		// vecD2 how holds d5, d6, d7, d8
7435 		vecD2 = vec_sub( vecVertC, vecVertA );
7436 		vecD5 = vec_sub( vecVertC2, vecVertA2 );
7437 		vecD8 = vec_sub( vecVertC3, vecVertA3 );
7438 		vecD11 = vec_sub( vecVertC4, vecVertA4 );
7439 
7440 		// vecD3 now holds d4, crap, d9, crap
7441 		vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
7442 		vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
7443 		vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
7444 		vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
7445 
7446 		// get permute vectors for loading from dt
7447 		vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
7448 		vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
7449 		vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
7450 		vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
7451 
7452 		// load S values from dominantTris
7453 		v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
7454 		v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
7455 		v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
7456 		v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
7457 		v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
7458 		v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
7459 		v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
7460 		v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
7461 
7462 		v0 = vec_perm( v0, v1, vecPerm1 );
7463 		v2 = vec_perm( v2, v3, vecPerm2 );
7464 		v4 = vec_perm( v4, v5, vecPerm3 );
7465 		v6 = vec_perm( v6, v7, vecPerm4 );
7466 
7467 		vecS0 = vec_splat( v0, 0 );
7468 		vecS1 = vec_splat( v0, 1 );
7469 		vecS2 = vec_splat( v0, 2 );
7470 
7471 		vecS0_2 = vec_splat( v2, 0);
7472 		vecS1_2 = vec_splat( v2, 1 );
7473 		vecS2_2 = vec_splat( v2, 2 );
7474 
7475 		vecS0_3 = vec_splat( v4, 0 );
7476 		vecS1_3 = vec_splat( v4, 1 );
7477 		vecS2_3 = vec_splat( v4, 2 );
7478 
7479 		vecS0_4 = vec_splat( v6, 0 );
7480 		vecS1_4 = vec_splat( v6, 1 );
7481 		vecS2_4 = vec_splat( v6, 2 );
7482 
7483 		// do calculation
7484 		vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
7485 		vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
7486 		vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
7487 		vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
7488 		vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
7489 		vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
7490 		vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
7491 		vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
7492 
7493 		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7494 		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7495 		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7496 		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7497 
7498 		vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
7499 		vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
7500 		vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
7501 		vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
7502 		vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
7503 		vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
7504 		vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
7505 		vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
7506 
7507 		vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7508 		vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7509 		vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7510 		vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7511 
7512 
7513 		// calculate N values
7514 		vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
7515 		vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
7516 		vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
7517 		vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
7518 
7519 		// calculate both halves of the calculation for t
7520 		vecWork1 = vecD1;
7521 		vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
7522 		vecWork3 = vecD4;
7523 		vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
7524 		vecWork5 = vecD7;
7525 		vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
7526 		vecWork7 = vecD10;
7527 		vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
7528 
7529 		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7530 		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7531 		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7532 		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7533 
7534 		vecWork1 = vecD2;
7535 		vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
7536 		vecWork3 = vecD5;
7537 		vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
7538 		vecWork5 = vecD8;
7539 		vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
7540 		vecWork7 = vecD11;
7541 		vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
7542 
7543 		vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7544 		vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7545 		vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7546 		vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7547 
7548 		// calculate T values
7549 		vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
7550 		vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
7551 		vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
7552 		vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
7553 
7554 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7555 		vecWork1 = vecD1;
7556 		vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
7557 		vecWork3 = vecD4;
7558 		vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
7559 		vecWork5 = vecD7;
7560 		vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
7561 		vecWork7 = vecD10;
7562 		vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
7563 
7564 		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7565 		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7566 		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7567 		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7568 
7569 		vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
7570 		vecWork2 = vecD2;
7571 		vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
7572 		vecWork4 = vecD5;
7573 		vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
7574 		vecWork6 = vecD8;
7575 		vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
7576 		vecWork8 = vecD11;
7577 
7578 		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7579 		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7580 		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7581 		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7582 
7583 #else
7584 		vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
7585 		vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
7586 		vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
7587 		vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
7588 		vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
7589 		vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
7590 		vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
7591 		vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
7592 
7593 		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7594 		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7595 		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7596 		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7597 
7598 		vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
7599 		vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
7600 		vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
7601 		vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
7602 		vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
7603 		vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
7604 		vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
7605 		vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
7606 
7607 		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7608 		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7609 		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7610 		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7611 #endif
7612 		// finish the calculation
7613 		vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7614 		vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7615 		vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7616 		vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
7617 
7618 		vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
7619 		vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
7620 		vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
7621 		vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
7622 
7623 		// Store results
7624 
7625 		// read values that we need to preserve
7626 		vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
7627 		vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
7628 
7629 		//generate vectors to store
7630 		vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
7631 		vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
7632 		vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
7633 
7634 		// store out results
7635 		ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7636 
7637 		// read values that we need to preserve
7638 		vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
7639 
7640 		// generate vectors to store
7641 		vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
7642 		vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
7643 		vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
7644 
7645 		// instead of doing permute, shift it where it needs to be and use vec_ste
7646 		// store out vectors
7647 		ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7648 
7649 		// read values that we need to preserve
7650 		vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
7651 
7652 		// generate vectors to store
7653 		vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
7654 		vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
7655 		vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
7656 
7657 		// store out vectors
7658 		ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7659 
7660 		// read values that we need to preserve
7661 		vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7662 		vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7663 
7664 		// generate vectors to store
7665 		vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
7666 		vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
7667 		vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
7668 
7669 		// store out vectors
7670 		ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7671 	}
7672 
7673 	// cleanup
7674 	for ( ; i < numVerts; i++ ) {
7675 		idDrawVert *a, *b, *c;
7676 		float d0, d1, d2, d3, d4;
7677 		float d5, d6, d7, d8, d9;
7678 		float s0, s1, s2;
7679 		float n0, n1, n2;
7680 		float t0, t1, t2;
7681 		float t3, t4, t5;
7682 
7683 		const dominantTri_s &dt = dominantTris[i];
7684 
7685 		a = verts + i;
7686 		b = verts + dt.v2;
7687 		c = verts + dt.v3;
7688 
7689 		d0 = b->xyz[0] - a->xyz[0];
7690 		d1 = b->xyz[1] - a->xyz[1];
7691 		d2 = b->xyz[2] - a->xyz[2];
7692 		d3 = b->st[0] - a->st[0];
7693 
7694 		d4 = b->st[1] - a->st[1];
7695 
7696 		d5 = c->xyz[0] - a->xyz[0];
7697 		d6 = c->xyz[1] - a->xyz[1];
7698 		d7 = c->xyz[2] - a->xyz[2];
7699 		d8 = c->st[0] - a->st[0];
7700 
7701 		d9 = c->st[1] - a->st[1];
7702 
7703 		s0 = dt.normalizationScale[0];
7704 		s1 = dt.normalizationScale[1];
7705 		s2 = dt.normalizationScale[2];
7706 
7707 		n0 = s2 * ( d6 * d2 - d7 * d1 );
7708 		n1 = s2 * ( d7 * d0 - d5 * d2 );
7709 		n2 = s2 * ( d5 * d1 - d6 * d0 );
7710 
7711 		t0 = s0 * ( d0 * d9 - d4 * d5 );
7712 		t1 = s0 * ( d1 * d9 - d4 * d6 );
7713 		t2 = s0 * ( d2 * d9 - d4 * d7 );
7714 
7715 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7716 		t3 = s1 * ( d3 * d5 - d0 * d8 );
7717 		t4 = s1 * ( d3 * d6 - d1 * d8 );
7718 		t5 = s1 * ( d3 * d7 - d2 * d8 );
7719 #else
7720 		t3 = s1 * ( n2 * t1 - n1 * t2 );
7721 		t4 = s1 * ( n0 * t2 - n2 * t0 );
7722 		t5 = s1 * ( n1 * t0 - n0 * t1 );
7723 #endif
7724 
7725 		a->normal[0] = n0;
7726 		a->normal[1] = n1;
7727 		a->normal[2] = n2;
7728 
7729 		a->tangents[0][0] = t0;
7730 		a->tangents[0][1] = t1;
7731 		a->tangents[0][2] = t2;
7732 
7733 		a->tangents[1][0] = t3;
7734 		a->tangents[1][1] = t4;
7735 		a->tangents[1][2] = t5;
7736 	}
7737 }
7738 
7739 #else
7740 /*
7741 ============
7742 idSIMD_AltiVec::DeriveUnsmoothedTangents
7743 
7744 	Derives the normal and orthogonal tangent vectors for the triangle vertices.
7745 	For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7746 ============
7747 */
7748 #define DERIVE_UNSMOOTHED_BITANGENT
7749 
DeriveUnsmoothedTangents(idDrawVert * verts,const dominantTri_s * dominantTris,const int numVerts)7750 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7751 	int i;
7752 
7753 	for ( i = 0; i < numVerts; i++ ) {
7754 		idDrawVert *a, *b, *c;
7755 		float d0, d1, d2, d3, d4;
7756 		float d5, d6, d7, d8, d9;
7757 		float s0, s1, s2;
7758 		float n0, n1, n2;
7759 		float t0, t1, t2;
7760 		float t3, t4, t5;
7761 
7762 		const dominantTri_s &dt = dominantTris[i];
7763 
7764 		a = verts + i;
7765 		b = verts + dt.v2;
7766 		c = verts + dt.v3;
7767 
7768 		d0 = b->xyz[0] - a->xyz[0];
7769 		d1 = b->xyz[1] - a->xyz[1];
7770 		d2 = b->xyz[2] - a->xyz[2];
7771 		d3 = b->st[0] - a->st[0];
7772 
7773 		d4 = b->st[1] - a->st[1];
7774 
7775 		d5 = c->xyz[0] - a->xyz[0];
7776 		d6 = c->xyz[1] - a->xyz[1];
7777 		d7 = c->xyz[2] - a->xyz[2];
7778 		d8 = c->st[0] - a->st[0];
7779 
7780 		d9 = c->st[1] - a->st[1];
7781 
7782 		s0 = dt.normalizationScale[0];
7783 		s1 = dt.normalizationScale[1];
7784 		s2 = dt.normalizationScale[2];
7785 
7786 		n0 = s2 * ( d6 * d2 - d7 * d1 );
7787 		n1 = s2 * ( d7 * d0 - d5 * d2 );
7788 		n2 = s2 * ( d5 * d1 - d6 * d0 );
7789 
7790 		t0 = s0 * ( d0 * d9 - d4 * d5 );
7791 		t1 = s0 * ( d1 * d9 - d4 * d6 );
7792 		t2 = s0 * ( d2 * d9 - d4 * d7 );
7793 
7794 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7795 		t3 = s1 * ( d3 * d5 - d0 * d8 );
7796 		t4 = s1 * ( d3 * d6 - d1 * d8 );
7797 		t5 = s1 * ( d3 * d7 - d2 * d8 );
7798 #else
7799 		t3 = s1 * ( n2 * t1 - n1 * t2 );
7800 		t4 = s1 * ( n0 * t2 - n2 * t0 );
7801 		t5 = s1 * ( n1 * t0 - n0 * t1 );
7802 #endif
7803 
7804 		a->normal[0] = n0;
7805 		a->normal[1] = n1;
7806 		a->normal[2] = n2;
7807 
7808 		a->tangents[0][0] = t0;
7809 		a->tangents[0][1] = t1;
7810 		a->tangents[0][2] = t2;
7811 
7812 		a->tangents[1][0] = t3;
7813 		a->tangents[1][1] = t4;
7814 		a->tangents[1][2] = t5;
7815 	}
7816 
7817 }
7818 #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
7819 
7820 /*
7821 ============
7822 idSIMD_AltiVec::NormalizeTangents
7823 
7824 	Normalizes each vertex normal and projects and normalizes the
7825 	tangent vectors onto the plane orthogonal to the vertex normal.
7826 ============
7827 */
NormalizeTangents(idDrawVert * verts,const int numVerts)7828 void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
7829 
7830 	// idDrawVert size
7831 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7832 
7833 	float *addr = verts[0].normal.ToFloatPtr();
7834 	float *tAddr = verts[0].tangents[0].ToFloatPtr();
7835 
7836 	// v0 through v3 maintain originally loaded values so we don't take
7837 	// as much hit for unaligned stores
7838 	vector float v0, v1, v2, v3;
7839 	// v5 through v8 are the "working" values of the vectors
7840 	vector float v5, v6, v7, v8;
7841 	// working values
7842 	vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
7843 	vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
7844 	vector float vecF, vecF2;
7845 	vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
7846 
7847 	register vector float zeroVector = (vector float)(0.0);
7848 
7849 	vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7850 	vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7851 	vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
7852 	vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
7853 	vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
7854 
7855 	vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
7856 	vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
7857 
7858 	vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
7859 	vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
7860 	vector unsigned char storeT41, storeT42;
7861 
7862 	int i = 0;
7863 
7864 	if ( i+3 < numVerts ) {
7865 		// for loading normal from idDrawVert
7866 		vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
7867 		vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7868 		vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7869 		vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7870 
7871 		// for loading tangents from idDrawVert
7872 		vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7873 		vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7874 		vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7875 		vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7876 		vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7877 		vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7878 		vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7879 		vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7880 
7881 		// generate permute vectors to store normals
7882 		storePerm0 = vec_lvsr( 0, addr );
7883 		storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
7884 		storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
7885 		storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
7886 
7887 		// generate permute vectors to store tangents
7888 		storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7889 		storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7890 
7891 		storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7892 		storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7893 
7894 		storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7895 		storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7896 
7897 		storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7898 		storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7899 	}
7900 
7901 	for ( ; i+3 < numVerts; i+=4 ) {
7902 
7903 		// load normals
7904 		vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
7905 		vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
7906 		v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
7907 
7908 		vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
7909 		vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
7910 		v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
7911 
7912 		vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7913 		vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7914 		v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
7915 
7916 		vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
7917 		vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
7918 		v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
7919 
7920 		// zero out the last element of each useless vector
7921 		v0 = vec_perm( v0, zeroVector, vecPermLast );
7922 		v1 = vec_perm( v1, zeroVector, vecPermLast );
7923 		v2 = vec_perm( v2, zeroVector, vecPermLast );
7924 		v3 = vec_perm( v3, zeroVector, vecPermLast );
7925 
7926 		// got 4 vectors in v0 through v3, sum them each accross
7927 		// and put into one vector
7928 		vecTemp = vec_madd( v0, v0, zeroVector );
7929 
7930 		vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
7931 		vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
7932 		// element 0 of vecSum now has sum of v0
7933 
7934 		vecTemp2 = vec_madd( v1, v1, zeroVector );
7935 		tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
7936 		tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7937 		// put this into vecSum
7938 		vecSum = vec_mergeh( vecSum, tempSum );
7939 
7940 		vecTemp3 = vec_madd( v2, v2, zeroVector );
7941 		tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
7942 		tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7943 		// put this into vecSum
7944 		vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
7945 
7946 		vecTemp4 = vec_madd( v3, v3, zeroVector );
7947 		tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
7948 		tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7949 		// put this into vecSum
7950 		vecSum = vec_perm( vecSum, tempSum, vecPermLast );
7951 
7952 		// take reciprocal square roots of these
7953 		vecF = ReciprocalSquareRoot( vecSum );
7954 
7955 		// multiply each vector by f
7956 		v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
7957 		v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
7958 		v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
7959 		v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
7960 
7961 		// load tangents as unaligned
7962 		vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
7963 		vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
7964 		vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
7965 
7966 		vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7967 		vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7968 		vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7969 
7970 		vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7971 		vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7972 		vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7973 
7974 		vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7975 		vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7976 		vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7977 
7978 		vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
7979 		vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
7980 		vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
7981 		vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
7982 		vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
7983 		vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
7984 		vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
7985 		vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
7986 
7987 		//zero out last element of tangents
7988 		vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
7989 		vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
7990 		vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
7991 		vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
7992 		vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
7993 		vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
7994 		vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
7995 		vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
7996 
7997 		// all tangents[0]
7998 		tempSum = zeroVector;
7999 		tempSum = vec_madd( vec1T0, v5, tempSum );
8000 		//sum accross tempSum
8001 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8002 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8003 		//  put tempSum splatted accross vecTSum1
8004 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8005 		vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8006 
8007 		//vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8008 		vec1T0 = vec_sub( vec1T0, vecTSum1 );
8009 
8010 		tempSum = zeroVector;
8011 		tempSum = vec_madd( vec2T0, v6, tempSum );
8012 
8013 		//sum accross tempSum
8014 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8015 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8016 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8017 		vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8018 		vec2T0 = vec_sub( vec2T0, vecTSum1 );
8019 
8020 		tempSum = zeroVector;
8021 		tempSum = vec_madd( vec3T0, v7, tempSum );
8022 
8023 		//sum accross tempSum
8024 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8025 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8026 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8027 		vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8028 		vec3T0 = vec_sub( vec3T0, vecTSum1 );
8029 
8030 		tempSum = zeroVector;
8031 		tempSum = vec_madd( vec4T0, v8, tempSum );
8032 
8033 		//sum accross tempSum
8034 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8035 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8036 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8037 		vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8038 		vec4T0 = vec_sub( vec4T0, vecTSum1 );
8039 
8040 		// all tangents[1]
8041 		tempSum = zeroVector;
8042 		tempSum = vec_madd( vec1T1, v5, tempSum );
8043 
8044 		//sum accross tempSum
8045 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8046 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8047 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8048 		vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8049 
8050 		//vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8051 		vec1T1 = vec_sub( vec1T1, vecTSum1 );
8052 
8053 		tempSum = zeroVector;
8054 		tempSum = vec_madd( vec2T1, v6, tempSum );
8055 
8056 		//sum accross tempSum
8057 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8058 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8059 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8060 		vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8061 		vec2T1 = vec_sub( vec2T1, vecTSum1 );
8062 
8063 		tempSum = zeroVector;
8064 		tempSum = vec_madd( vec3T1, v7, tempSum );
8065 
8066 		//sum accross tempSum
8067 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8068 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8069 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8070 		vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8071 		vec3T1 = vec_sub( vec3T1, vecTSum1 );
8072 
8073 		tempSum = zeroVector;
8074 		tempSum = vec_madd( vec4T1, v8, tempSum );
8075 
8076 		//sum accross tempSum
8077 		vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8078 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8079 		vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8080 		vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8081 		vec4T1 = vec_sub( vec4T1, vecTSum1 );
8082 
8083 
8084 		// sum accross vectors and put into one vector
8085 		vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
8086 		vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8087 		vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8088 
8089 		// element 0 of vecSum now has sum of v0
8090 		vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
8091 		tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8092 		tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8093 		// put this into vecSum
8094 		vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
8095 		vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
8096 		tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8097 		tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8098 		// put this into vecSum
8099 		vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
8100 		vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
8101 		tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8102 		tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8103 		// put this into vecSum
8104 		vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
8105 
8106 		vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
8107 		vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8108 		vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
8109 		// element 0 of vecSum now has sum of v0
8110 		vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
8111 		tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8112 		tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8113 		// put this into vecSum
8114 		vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
8115 		vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
8116 		tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8117 		tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8118 		// put this into vecSum
8119 		vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
8120 		vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
8121 		tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8122 		tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8123 		// put this into vecSum
8124 		vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
8125 
8126 		// tangents[0]
8127 		vecF = ReciprocalSquareRoot( vecTSum1 );
8128 		// tangents[1]
8129 		vecF2 = ReciprocalSquareRoot( vecTSum2 );
8130 
8131 		// multiply each tangent vector by f
8132 
8133 		vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
8134 		vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
8135 		vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
8136 		vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
8137 
8138 		vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
8139 		vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
8140 		vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
8141 		vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
8142 
8143 		// rotate input data
8144 		v5 = vec_perm( v5, v5, storePerm0 );
8145 		v6 = vec_perm( v6, v6, storePerm1 );
8146 		v7 = vec_perm( v7, v7, storePerm2 );
8147 		v8 = vec_perm( v8, v8, storePerm3 );
8148 
8149 		vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8150 		vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8151 		vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8152 
8153 		vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8154 		vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8155 		vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8156 
8157 		vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8158 		vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8159 		vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8160 
8161 		vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8162 		vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8163 		vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8164 
8165 		// store tangents[0] and tangents[1]
8166 		vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
8167 		vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
8168 
8169 		vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8170 		vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8171 		vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8172 		vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8173 		vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8174 		vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8175 
8176 		// store second tangents[0] and tangents[1]
8177 		vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
8178 		vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
8179 
8180 		vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8181 		vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8182 		vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8183 		vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8184 		vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8185 		vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8186 
8187 		// store third tangents[0] and tangents[1]
8188 		vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
8189 		vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
8190 
8191 		vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8192 		vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8193 		vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8194 		vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8195 		vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8196 		vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8197 
8198 		// store fourth tangents[0] and tangents[1]
8199 		vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
8200 		vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
8201 
8202 		vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8203 		vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8204 		vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8205 		vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8206 		vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8207 		vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8208 	}
8209 
8210 	// cleanup
8211 	for ( ; i < numVerts; i++ ) {
8212 		idVec3 &v = verts[i].normal;
8213 		float f;
8214 
8215 		//f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8216 		f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8217 		v.x *= f; v.y *= f; v.z *= f;
8218 
8219 		for ( int j = 0; j < 2; j++ ) {
8220 			idVec3 &t = verts[i].tangents[j];
8221 
8222 			t -= ( t * v ) * v;
8223 		//	f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8224 			f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8225 			t.x *= f; t.y *= f; t.z *= f;
8226 		}
8227 	}
8228 }
8229 #endif /* ENABLE_DERIVE */
8230 
8231 #ifdef ENABLE_CREATE
8232 
8233 /*
8234 ============
8235 idSIMD_AltiVec::CreateTextureSpaceLightVectors
8236 
8237 	Calculates light vectors in texture space for the given triangle vertices.
8238 	For each vertex the direction towards the light origin is projected onto texture space.
8239 	The light vectors are only calculated for the vertices referenced by the indexes.
8240 ============
8241 */
8242 
CreateTextureSpaceLightVectors(idVec3 * lightVectors,const idVec3 & lightOrigin,const idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)8243 void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8244 
8245 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8246 	memset( used, 0, numVerts * sizeof( used[0] ) );
8247 
8248 	int i;
8249 	for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8250 		used[indexes[i]] = true;
8251 		used[indexes[i+1]] = true;
8252 		used[indexes[i+2]] = true;
8253 		used[indexes[i+3]] = true;
8254 		used[indexes[i+4]] = true;
8255 		used[indexes[i+5]] = true;
8256 		used[indexes[i+6]] = true;
8257 		used[indexes[i+7]] = true;
8258 	}
8259 
8260 	for ( ; i < numIndexes; i++ ) {
8261 		used[indexes[i]] = true;
8262 	}
8263 
8264 	for ( i = 0; i+1 < numVerts; i+=2 ) {
8265 
8266 		const idDrawVert *v = &verts[i];
8267 		const idDrawVert *v2 = &verts[i+1];
8268 
8269 		float x, y, z;
8270 		float x2, y2, z2;
8271 		idVec3 lightDir, lightDir2;
8272 
8273 		lightDir[0] = lightOrigin[0] - v->xyz[0];
8274 		lightDir[1] = lightOrigin[1] - v->xyz[1];
8275 		lightDir[2] = lightOrigin[2] - v->xyz[2];
8276 
8277 		lightDir2[0] = lightOrigin[0] - v2->xyz[0];
8278 		lightDir2[1] = lightOrigin[1] - v2->xyz[1];
8279 		lightDir2[2] = lightOrigin[2] - v2->xyz[2];
8280 
8281 		x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8282 		y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8283 		z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8284 
8285 		x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
8286 		y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
8287 		z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
8288 
8289 		if ( used[i] ) {
8290 			lightVectors[i][0] = x;
8291 			lightVectors[i][1] = y;
8292 			lightVectors[i][2] = z;
8293 		}
8294 
8295 		if ( used[i+1] ) {
8296 			lightVectors[i+1][0] = x2;
8297 			lightVectors[i+1][1] = y2;
8298 			lightVectors[i+1][2] = z2;
8299 		}
8300 	}
8301 
8302 	// cleanup
8303 	for ( ; i < numVerts; i++ ) {
8304 		if ( !used[i] ) {
8305 			continue;
8306 		}
8307 
8308 		const idDrawVert *v = &verts[i];
8309 		idVec3 lightDir;
8310 
8311 		lightDir[0] = lightOrigin[0] - v->xyz[0];
8312 		lightDir[1] = lightOrigin[1] - v->xyz[1];
8313 		lightDir[2] = lightOrigin[2] - v->xyz[2];
8314 
8315 		lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8316 		lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8317 		lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8318 	}
8319 }
8320 
8321 #if 1
8322 /*
8323 ============
8324 idSIMD_AltiVec::CreateSpecularTextureCoords
8325 
8326 	Calculates specular texture coordinates for the given triangle vertices.
8327 	For each vertex the normalized direction towards the light origin is added to the
8328 	normalized direction towards the view origin and the result is projected onto texture space.
8329 	The texture coordinates are only calculated for the vertices referenced by the indexes.
8330 ============
8331 */
CreateSpecularTextureCoords(idVec4 * texCoords,const idVec3 & lightOrigin,const idVec3 & viewOrigin,const idDrawVert * verts,const int numVerts,const int * indexes,const int numIndexes)8332 void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8333 
8334 	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8335 	memset( used, 0, numVerts * sizeof( used[0] ) );
8336 
8337 	int i;
8338 	for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8339 		used[indexes[i]] = true;
8340 		used[indexes[i+1]] = true;
8341 		used[indexes[i+2]] = true;
8342 		used[indexes[i+3]] = true;
8343 		used[indexes[i+4]] = true;
8344 		used[indexes[i+5]] = true;
8345 		used[indexes[i+6]] = true;
8346 		used[indexes[i+7]] = true;
8347 	}
8348 
8349 	for ( ; i < numIndexes; i++ ) {
8350 		used[indexes[i]] = true;
8351 	}
8352 
8353 	// load lightOrigin and viewOrigin into vectors
8354 	const float *lightOriginPtr = lightOrigin.ToFloatPtr();
8355 	const float *viewOriginPtr = viewOrigin.ToFloatPtr();
8356 	vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
8357 	vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
8358 	vector float v0 = vec_ld( 0, lightOriginPtr );
8359 	vector float v1 = vec_ld( 15, lightOriginPtr );
8360 	vector float v2 = vec_ld( 0, viewOriginPtr );
8361 	vector float v3 = vec_ld( 15, viewOriginPtr );
8362 	vector float vecLightOrigin = vec_perm( v0, v1, permVec );
8363 	vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
8364 	const vector float zeroVector = (vector float)(0);
8365 	int index;
8366 
8367 	for ( index = 0; index+1 < numVerts; index+=2 ) {
8368 		const float *vertPtr = verts[index].xyz.ToFloatPtr();
8369 		const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
8370 
8371 		permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8372 		permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
8373 
8374 		v0 = vec_ld( 0, vertPtr );
8375 		v1 = vec_ld( 15, vertPtr );
8376 		vector float v2 = vec_ld( 31, vertPtr );
8377 		vector float v3 = vec_ld( 47, vertPtr );
8378 		vector float v4 = vec_ld( 63, vertPtr );
8379 
8380 		vector float v5 = vec_ld( 0, vertPtr2 );
8381 		vector float v6 = vec_ld( 15, vertPtr2 );
8382 		vector float v7 = vec_ld( 31, vertPtr2 );
8383 		vector float v8 = vec_ld( 47, vertPtr2 );
8384 		vector float v9 = vec_ld( 63, vertPtr2 );
8385 
8386 		// figure out what values go where
8387 		vector float vecXYZ = vec_perm( v0, v1, permVec );
8388 		vector float vecNormal = vec_perm( v1, v2, permVec );
8389 		vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8390 		const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8391 		permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8392 		const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8393 
8394 		vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
8395 		vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
8396 		vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
8397 		const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
8398 		permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
8399 		const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
8400 
8401 		// calculate lightDir
8402 		vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8403 		vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8404 
8405 		vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
8406 		vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
8407 
8408 		// calculate distance
8409 		vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8410 		vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8411 
8412 		vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
8413 		vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
8414 
8415 		// sum accross first 3 elements of vector
8416 		vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8417 		vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8418 		vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8419 		vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8420 
8421 		vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
8422 		vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
8423 		vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
8424 		vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
8425 
8426 		// splat sum accross the whole vector
8427 		vecTempLight = vec_splat( vecTempLight, 0 );
8428 		vecTempView = vec_splat( vecTempView, 0 );
8429 
8430 		vecTempLight2 = vec_splat( vecTempLight2, 0 );
8431 		vecTempView2 = vec_splat( vecTempView2, 0 );
8432 
8433 		vecTempLight = ReciprocalSquareRoot( vecTempLight );
8434 		vecTempView = ReciprocalSquareRoot( vecTempView );
8435 
8436 		vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
8437 		vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
8438 
8439 		// modify light and view vectors based on ilength
8440 		vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8441 		vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8442 
8443 		vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
8444 		vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
8445 
8446 		// calculate what to store in each texture coord
8447 		vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8448 		vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8449 		vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8450 
8451 		vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
8452 		vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
8453 		vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
8454 
8455 		// sum accross first 3 elements of vector
8456 		vector float tempSum3;
8457 		tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8458 		vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8459 		tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8460 		vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8461 		tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8462 		vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8463 
8464 		tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
8465 		vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
8466 		tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
8467 		vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
8468 		vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
8469 		vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
8470 
8471 		vecTC0 = vec_splat( vecTC0, 0 );
8472 		vecTC1 = vec_splat( vecTC1, 0 );
8473 		vecTC2 = vec_splat( vecTC2, 0 );
8474 
8475 		vecTC3 = vec_splat( vecTC3, 0 );
8476 		vecTC4 = vec_splat( vecTC4, 0 );
8477 		vecTC5 = vec_splat( vecTC5, 0 );
8478 
8479 		if ( used[index] ) {
8480 			// store out results
8481 			vec_ste( vecTC0, 0, &texCoords[index][0] );
8482 			vec_ste( vecTC1, 0, &texCoords[index][1] );
8483 			vec_ste( vecTC2, 0, &texCoords[index][2] );
8484 			vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8485 		}
8486 
8487 		if ( used[index+1] ) {
8488 			vec_ste( vecTC3, 0, &texCoords[index+1][0] );
8489 			vec_ste( vecTC4, 0, &texCoords[index+1][1] );
8490 			vec_ste( vecTC5, 0, &texCoords[index+1][2] );
8491 			vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
8492 		}
8493 	}
8494 
8495 	// cleanup
8496 	for ( ; index < numVerts; index++ ) {
8497 		if ( !used[index] ) {
8498 			continue;
8499 		}
8500 
8501 		const float *vertPtr = verts[index].xyz.ToFloatPtr();
8502 
8503 		permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8504 
8505 		v0 = vec_ld( 0, vertPtr );
8506 		v1 = vec_ld( 15, vertPtr );
8507 		vector float v2 = vec_ld( 31, vertPtr );
8508 		vector float v3 = vec_ld( 47, vertPtr );
8509 		vector float v4 = vec_ld( 63, vertPtr );
8510 
8511 		// figure out what values go where
8512 		vector float vecXYZ = vec_perm( v0, v1, permVec );
8513 		vector float vecNormal = vec_perm( v1, v2, permVec );
8514 		vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8515 		const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8516 		permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8517 		const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8518 
8519 		// calculate lightDir
8520 		vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8521 		vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8522 
8523 		// calculate distance
8524 		vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8525 		vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8526 
8527 		// sum accross first 3 elements of vector
8528 		vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8529 		vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8530 		vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8531 		vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8532 
8533 		// splat sum accross the whole vector
8534 		vecTempLight = vec_splat( vecTempLight, 0 );
8535 		vecTempView = vec_splat( vecTempView, 0 );
8536 
8537 		vecTempLight = ReciprocalSquareRoot( vecTempLight );
8538 		vecTempView = ReciprocalSquareRoot( vecTempView );
8539 
8540 		// modify light and view vectors based on ilength
8541 		vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8542 		vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8543 
8544 		// calculate what to store in each texture coord
8545 		vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8546 		vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8547 		vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8548 
8549 		// sum accross first 3 elements of vector
8550 		vector float tempSum3;
8551 		tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8552 		vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8553 		tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8554 		vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8555 		tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8556 		vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8557 
8558 		vecTC0 = vec_splat( vecTC0, 0 );
8559 		vecTC1 = vec_splat( vecTC1, 0 );
8560 		vecTC2 = vec_splat( vecTC2, 0 );
8561 
8562 		// store out results
8563 		vec_ste( vecTC0, 0, &texCoords[index][0] );
8564 		vec_ste( vecTC1, 0, &texCoords[index][1] );
8565 		vec_ste( vecTC2, 0, &texCoords[index][2] );
8566 		vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8567 
8568 	}
8569 }
8570 #endif  /* 0 for disable spec coord */
8571 
8572 #if 1
8573 
8574 #ifdef VERTEXCACHE_ALIGNED
8575 /*
8576 ============
8577 idSIMD_AltiVec::CreateShadowCache
8578 ============
8579 */
CreateShadowCache(idVec4 * vertexCache,int * vertRemap,const idVec3 & lightOrigin,const idDrawVert * verts,const int numVerts)8580 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8581 	int outVerts = 0;
8582 	int i = 0;
8583 
8584 	assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8585 
8586 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8587 	register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8588 	register vector float zeroVector = (vector float)(0.0);
8589 	register vector float oneVector = (vector float)(1);
8590 	register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8591 
8592 	const float *lPtr = lightOrigin.ToFloatPtr();
8593 	const float *vPtr;
8594 	const float *vPtr2;
8595 	const float *vPtr3;
8596 	const float *vPtr4;
8597 
8598 	// put values into a vector
8599 	vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8600 	v0 = vec_ld( 0, lPtr );
8601 	v1 = vec_ld( 15, lPtr );
8602 	v0 = vec_perm( v0, v1, vecPerm );
8603 	v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8604 
8605 	//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8606 	for ( ; i+3 < numVerts; i+= 4 ) {
8607 		if ( ! vertRemap[i] ) {
8608 			vPtr = verts[i].xyz.ToFloatPtr();
8609 
8610 #ifndef DRAWVERT_PADDED
8611 			vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8612 			v2 = vec_ld( 0, vPtr );
8613 			v3 = vec_ld( 15, vPtr );
8614 			v7 = vec_perm( v2, v3, vecPerm2 );
8615 #else
8616 			v7 = vec_ld( 0, vPtr );
8617 #endif
8618 			v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8619 			v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8620 			v1 = vec_sub( v2, v0 );
8621 
8622 			vec_st( v3, 0, &vertexCache[outVerts][0] );
8623 			vec_st( v1, 0, &vertexCache[outVerts+1][0] );
8624 
8625 			vertRemap[i] = outVerts;
8626 			outVerts += 2;
8627 		}
8628 
8629 		if ( ! vertRemap[i+1] ) {
8630 			vPtr2 = verts[i+1].xyz.ToFloatPtr();
8631 
8632 #ifndef DRAWVERT_PADDED
8633 			vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8634 			v4 = vec_ld( 0, vPtr2 );
8635 			v5 = vec_ld( 15, vPtr2 );
8636 			v6 = vec_perm( v4, v5, vecPerm3 );
8637 #else
8638 			v6 = vec_ld( 0, vPtr2 );
8639 #endif
8640 			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8641 			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8642 			v6 = vec_sub( v4, v0 );
8643 
8644 			vec_st( v5, 0, &vertexCache[outVerts][0] );
8645 			vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8646 
8647 			vertRemap[i+1] = outVerts;
8648 			outVerts += 2;
8649 		}
8650 
8651 		if ( ! vertRemap[i+2] ) {
8652 			vPtr3 = verts[i+2].xyz.ToFloatPtr();
8653 
8654 #ifndef DRAWVERT_PADDED
8655 			vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8656 			v1 = vec_ld( 0, vPtr3 );
8657 			v2 = vec_ld( 15, vPtr3 );
8658 			v3 = vec_perm( v1, v2, vecPerm4 );
8659 #else
8660 			v3 = vec_ld( 0, vPtr3 );
8661 #endif
8662 			v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8663 			v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8664 			v3 = vec_sub( v1, v0 );
8665 
8666 			vec_st( v2, 0, &vertexCache[outVerts][0] );
8667 			vec_st( v3, 0, &vertexCache[outVerts+1][0] );
8668 
8669 			vertRemap[i+2] = outVerts;
8670 			outVerts += 2;
8671 		}
8672 
8673 		if ( ! vertRemap[i+3] ) {
8674 			vPtr4 = verts[i+3].xyz.ToFloatPtr();
8675 #ifndef DRAWVERT_PADDED
8676 			vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8677 			v4 = vec_ld( 0, vPtr4 );
8678 			v5 = vec_ld( 16, vPtr4 );
8679 			v6 = vec_perm( v4, v5, vecPerm5 );
8680 #else
8681 			v6 = vec_ld( 0, vPtr4 );
8682 #endif
8683 			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8684 			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8685 			v6 = vec_sub( v4, v0 );
8686 
8687 			vec_st( v5, 0, &vertexCache[outVerts][0] );
8688 			vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8689 
8690 			vertRemap[i+3] = outVerts;
8691 			outVerts += 2;
8692 		}
8693 	}
8694 
8695 	// cleanup
8696 	for (; i < numVerts; i++ ) {
8697 		if ( vertRemap[i] ) {
8698 			continue;
8699 		}
8700 		const float *v = verts[i].xyz.ToFloatPtr();
8701 		vertexCache[outVerts+0][0] = v[0];
8702 		vertexCache[outVerts+0][1] = v[1];
8703 		vertexCache[outVerts+0][2] = v[2];
8704 		vertexCache[outVerts+0][3] = 1.0f;
8705 
8706 		// R_SetupProjection() builds the projection matrix with a slight crunch
8707 		// for depth, which keeps this w=0 division from rasterizing right at the
8708 		// wrap around point and causing depth fighting with the rear caps
8709 		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8710 		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8711 		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8712 		vertexCache[outVerts+1][3] = 0.0f;
8713 		vertRemap[i] = outVerts;
8714 		outVerts += 2;
8715 	}
8716 	return outVerts;
8717 }
8718 
8719 #else
8720 
8721 /*
8722 ============
8723 idSIMD_AltiVec::CreateShadowCache
8724 ============
8725 */
CreateShadowCache(idVec4 * vertexCache,int * vertRemap,const idVec3 & lightOrigin,const idDrawVert * verts,const int numVerts)8726 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8727 	int outVerts = 0;
8728 	int i = 0;
8729 
8730 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8731 	register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8732 	register vector float zeroVector = (vector float)(0.0);
8733 	register vector float oneVector = (vector float)(1);
8734 	register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8735 
8736 	const float *lPtr = lightOrigin.ToFloatPtr();
8737 	const float *vPtr;
8738 	const float *vPtr2;
8739 	const float *vPtr3;
8740 	const float *vPtr4;
8741 
8742 	// put values into a vector
8743 	vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8744 	v0 = vec_ld( 0, lPtr );
8745 	v1 = vec_ld( 15, lPtr );
8746 	v0 = vec_perm( v0, v1, vecPerm );
8747 	v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8748 
8749 	//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8750 	for ( ; i+3 < numVerts; i+= 4 ) {
8751 		if ( ! vertRemap[i] ) {
8752 			vPtr = verts[i].xyz.ToFloatPtr();
8753 #ifndef DRAWVERT_PADDED
8754 			vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8755 			v2 = vec_ld( 0, vPtr );
8756 			v3 = vec_ld( 15, vPtr );
8757 			v7 = vec_perm( v2, v3, vecPerm2 );
8758 #else
8759 			v7 = vec_ld( 0, vPtr );
8760 #endif
8761 			v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8762 			v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8763 			v1 = vec_sub( v2, v0 );
8764 
8765 			// store results
8766 			UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
8767 
8768 			vertRemap[i] = outVerts;
8769 			outVerts += 2;
8770 		}
8771 
8772 		if ( ! vertRemap[i+1] ) {
8773 			vPtr2 = verts[i+1].xyz.ToFloatPtr();
8774 #ifndef DRAWVERT_PADDED
8775 			vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8776 			v4 = vec_ld( 0, vPtr2 );
8777 			v5 = vec_ld( 15, vPtr2 );
8778 			v6 = vec_perm( v4, v5, vecPerm3 );
8779 #else
8780 			v6 = vec_ld( 0, vPtr2 );
8781 #endif
8782 			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8783 			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8784 			v6 = vec_sub( v4, v0 );
8785 
8786 			// store results
8787 			UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8788 
8789 			vertRemap[i+1] = outVerts;
8790 			outVerts += 2;
8791 		}
8792 
8793 		if ( ! vertRemap[i+2] ) {
8794 			vPtr3 = verts[i+2].xyz.ToFloatPtr();
8795 #ifndef DRAWVERT_PADDED
8796 			vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8797 			v1 = vec_ld( 0, vPtr3 );
8798 			v2 = vec_ld( 15, vPtr3 );
8799 			v3 = vec_perm( v1, v2, vecPerm4 );
8800 #else
8801 			v3 = vec_ld( 0, vPtr3 );
8802 #endif
8803 			v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8804 			v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8805 			v3 = vec_sub( v1, v0 );
8806 
8807 			// store results
8808 			UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
8809 
8810 			vertRemap[i+2] = outVerts;
8811 			outVerts += 2;
8812 		}
8813 		if ( ! vertRemap[i+3] ) {
8814 			vPtr4 = verts[i+3].xyz.ToFloatPtr();
8815 #ifndef DRAWVERT_PADDED
8816 			vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8817 			v4 = vec_ld( 0, vPtr4 );
8818 			v5 = vec_ld( 16, vPtr4 );
8819 			v6 = vec_perm( v4, v5, vecPerm5 );
8820 #else
8821 			v6 = vec_ld( 0, vPtr4 );
8822 #endif
8823 
8824 			v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8825 			v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8826 			v6 = vec_sub( v4, v0 );
8827 
8828 			// store results
8829 			UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8830 
8831 
8832 			vertRemap[i+3] = outVerts;
8833 			outVerts += 2;
8834 		}
8835 	}
8836 
8837 	// cleanup
8838 	for (; i < numVerts; i++ ) {
8839 		if ( vertRemap[i] ) {
8840 			continue;
8841 		}
8842 		const float *v = verts[i].xyz.ToFloatPtr();
8843 		vertexCache[outVerts+0][0] = v[0];
8844 		vertexCache[outVerts+0][1] = v[1];
8845 		vertexCache[outVerts+0][2] = v[2];
8846 		vertexCache[outVerts+0][3] = 1.0f;
8847 
8848 		// R_SetupProjection() builds the projection matrix with a slight crunch
8849 		// for depth, which keeps this w=0 division from rasterizing right at the
8850 		// wrap around point and causing depth fighting with the rear caps
8851 		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8852 		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8853 		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8854 		vertexCache[outVerts+1][3] = 0.0f;
8855 		vertRemap[i] = outVerts;
8856 		outVerts += 2;
8857 	}
8858 	return outVerts;
8859 }
8860 #endif /* VERTEXCACHE_ALIGNED */
8861 
8862 #endif /* 0 to disable shadow cache */
8863 
8864 #if 1
8865 
8866 #ifdef VERTEXCACHE_ALIGNED
8867 /*
8868 ============
8869 idSIMD_AltiVec::CreateVertexProgramShadowCache
8870 ============
8871 */
CreateVertexProgramShadowCache(idVec4 * vertexCache,const idDrawVert * verts,const int numVerts)8872 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8873 
8874 	// vertexCache aligned
8875 	assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8876 	// idDrawVert size
8877 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8878 	// idVec4 size
8879 	assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8880 
8881 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8882 	register vector float zeroVector = (vector float)(0.0);
8883 	register vector float oneVector = (vector float)(1);
8884 	register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8885 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8886 	int i = 0;
8887 
8888 #ifndef DRAWVERT_PADDED
8889 	// every fourth one will have the same alignment. Make sure we've got enough here
8890 	if ( i+3 < numVerts ) {
8891 		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8892 		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8893 		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8894 		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8895 	}
8896 #endif
8897 
8898 	for ( ; i+3 < numVerts; i+=4 ) {
8899 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
8900 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8901 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8902 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8903 
8904 #ifndef DRAWVERT_PADDED
8905 		v0 = vec_ld( 0, vertPtr );
8906 		v1 = vec_ld( 15, vertPtr );
8907 		v2 = vec_ld( 0, vertPtr2 );
8908 		v3 = vec_ld( 15, vertPtr2 );
8909 		v4 = vec_ld( 0, vertPtr3 );
8910 		v5 = vec_ld( 15, vertPtr3 );
8911 		v6 = vec_ld( 0, vertPtr4 );
8912 		v7 = vec_ld( 15, vertPtr4 );
8913 
8914 		v0 = vec_perm( v0, v1, vertPerm1 );
8915 		v1 = vec_perm( v2, v3, vertPerm2 );
8916 		v2 = vec_perm( v4, v5, vertPerm3 );
8917 		v3 = vec_perm( v6, v7, vertPerm4 );
8918 #else
8919 		v0 = vec_ld( 0, vertPtr );
8920 		v1 = vec_ld( 0, vertPtr2 );
8921 		v2 = vec_ld( 0, vertPtr3 );
8922 		v3 = vec_ld( 0, vertPtr4 );
8923 #endif
8924 
8925 		v0 = vec_perm( v0, oneVector, vecPermThreeOne );
8926 		v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
8927 
8928 		v1 = vec_perm( v1, oneVector, vecPermThreeOne );
8929 		v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
8930 
8931 		v2 = vec_perm( v2, oneVector, vecPermThreeOne );
8932 		v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
8933 
8934 		v3 = vec_perm( v3, oneVector, vecPermThreeOne );
8935 		v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
8936 
8937 		// store results
8938 		ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
8939 		ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
8940 
8941 	}
8942 
8943 	// cleanup
8944 	for ( ; i < numVerts; i++ ) {
8945 		const float *v = verts[i].xyz.ToFloatPtr();
8946 		vertexCache[i*2+0][0] = v[0];
8947 		vertexCache[i*2+1][0] = v[0];
8948 		vertexCache[i*2+0][1] = v[1];
8949 		vertexCache[i*2+1][1] = v[1];
8950 		vertexCache[i*2+0][2] = v[2];
8951 		vertexCache[i*2+1][2] = v[2];
8952 		vertexCache[i*2+0][3] = 1.0f;
8953 		vertexCache[i*2+1][3] = 0.0f;
8954 	}
8955 	return numVerts * 2;
8956 }
8957 
8958 #else
8959 /*
8960 ============
8961 idSIMD_AltiVec::CreateVertexProgramShadowCache
8962 ============
8963 */
CreateVertexProgramShadowCache(idVec4 * vertexCache,const idDrawVert * verts,const int numVerts)8964 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8965 
8966 	// idDrawVert size
8967 	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8968 	// idVec4 size
8969 	assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8970 
8971 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8972 	register vector float zeroVector = (vector float)(0.0);
8973 	register vector float oneVector = (vector float)(1);
8974 	register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8975 	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8976 	int i = 0;
8977 
8978 #ifndef DRAWVERT_PADDED
8979 	// every fourth one will have the same alignment. Make sure we've got enough here
8980 	if ( i+3 < numVerts ) {
8981 		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8982 		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8983 		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8984 		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8985 	}
8986 #endif
8987 
8988 	for ( ; i+3 < numVerts; i+=4 ) {
8989 		const float *vertPtr = verts[i].xyz.ToFloatPtr();
8990 		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8991 		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8992 		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8993 
8994 #ifndef DRAWVERT_PADDED
8995 		v0 = vec_ld( 0, vertPtr );
8996 		v1 = vec_ld( 15, vertPtr );
8997 		v2 = vec_ld( 0, vertPtr2 );
8998 		v3 = vec_ld( 15, vertPtr2 );
8999 		v4 = vec_ld( 0, vertPtr3 );
9000 		v5 = vec_ld( 15, vertPtr3 );
9001 		v6 = vec_ld( 0, vertPtr4 );
9002 		v7 = vec_ld( 15, vertPtr4 );
9003 
9004 		v0 = vec_perm( v0, v1, vertPerm1 );
9005 		v1 = vec_perm( v2, v3, vertPerm2 );
9006 		v2 = vec_perm( v4, v5, vertPerm3 );
9007 		v3 = vec_perm( v6, v7, vertPerm4 );
9008 #else
9009 		v0 = vec_ld( 0, vertPtr );
9010 		v1 = vec_ld( 0, vertPtr2 );
9011 		v2 = vec_ld( 0, vertPtr3 );
9012 		v3 = vec_ld( 0, vertPtr4 );
9013 #endif
9014 
9015 		v0 = vec_perm( v0, oneVector, vecPermThreeOne );
9016 		v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
9017 
9018 		v1 = vec_perm( v1, oneVector, vecPermThreeOne );
9019 		v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
9020 
9021 		v2 = vec_perm( v2, oneVector, vecPermThreeOne );
9022 		v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
9023 
9024 		v3 = vec_perm( v3, oneVector, vecPermThreeOne );
9025 		v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
9026 
9027 		// store results as unaligned
9028 		vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
9029 		vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9030 		vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
9031 		vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
9032 
9033 		// right rotate input data
9034 		v0 = vec_perm( v0, v0, storePerm );
9035 		v4 = vec_perm( v4, v4, storePerm );
9036 		v1 = vec_perm( v1, v1, storePerm );
9037 		v5 = vec_perm( v5, v5, storePerm );
9038 		v2 = vec_perm( v2, v2, storePerm );
9039 		v6 = vec_perm( v6, v6, storePerm );
9040 		v3 = vec_perm( v3, v3, storePerm );
9041 		v7 = vec_perm( v7, v7, storePerm );
9042 
9043 		vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
9044 		vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
9045 		vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
9046 		vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
9047 		vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
9048 		vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
9049 		vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
9050 		vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
9051 		vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
9052 	}
9053 
9054 	// cleanup
9055 	for ( ; i < numVerts; i++ ) {
9056 		const float *v = verts[i].xyz.ToFloatPtr();
9057 		vertexCache[i*2+0][0] = v[0];
9058 		vertexCache[i*2+1][0] = v[0];
9059 		vertexCache[i*2+0][1] = v[1];
9060 		vertexCache[i*2+1][1] = v[1];
9061 		vertexCache[i*2+0][2] = v[2];
9062 		vertexCache[i*2+1][2] = v[2];
9063 		vertexCache[i*2+0][3] = 1.0f;
9064 		vertexCache[i*2+1][3] = 0.0f;
9065 	}
9066 	return numVerts * 2;
9067 }
9068 
9069 #endif /* VERTEXCACHE_ALIGNED */
9070 
9071 #endif /* 0 to kill VP shader cache */
9072 
9073 #endif /* ENABLE_CREATE */
9074 
9075 #ifdef ENABLE_SOUND_ROUTINES
9076 
9077 #ifdef SOUND_DEST_ALIGNED
9078 /*
9079 ============
9080 idSIMD_AltiVec::UpSamplePCMTo44kHz
9081 
9082   Duplicate samples for 44kHz output.
9083 
9084 	Assumptions:
9085 		Assumes that dest starts at aligned address
9086 ============
9087 */
UpSamplePCMTo44kHz(float * dest,const short * src,const int numSamples,const int kHz,const int numChannels)9088 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9089 
9090 	// dest is aligned
9091 	assert( IS_16BYTE_ALIGNED( dest[0] ) );
9092 
9093 	vector signed short vs0, vs1;
9094 	register vector signed int vi0, vi1;
9095 	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9096 	// permute vectors
9097 	register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9098 	register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9099 
9100 	register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9101 	register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9102 
9103 	// If this can be assumed true, we can eliminate another conditional that checks to see if we can
9104 	// load up a vector before the loop
9105 	assert( numSamples >= 12 );
9106 
9107 	if ( kHz == 11025 ) {
9108 		if ( numChannels == 1 ) {
9109 			// 8 at a time
9110 			int i = 0;
9111 
9112 			vector signed short vsOld = vec_ld( 0, &src[i] );
9113 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9114 
9115 			for ( ; i+7 < numSamples; i+= 8 ) {
9116 				// load src
9117 				vs1 = vec_ld( 15, &src[i] );
9118 				vs0 = vec_perm( vsOld, vs1, permVec );
9119 				vsOld = vs1;
9120 
9121 				// unpack shorts to ints
9122 				vi0 = vec_unpackh( vs0 );
9123 				vi1 = vec_unpackl( vs0 );
9124 				// convert ints to floats
9125 				v0 = vec_ctf( vi0, 0 );
9126 				v1 = vec_ctf( vi1, 0 );
9127 				// permute into vectors in the order to store
9128 
9129 				v2 = vec_splat( v0, 0 );
9130 				v3 = vec_splat( v0, 1 );
9131 				v4 = vec_splat( v0, 2 );
9132 				v5 = vec_splat( v0, 3 );
9133 				v6 = vec_splat( v1, 0 );
9134 				v7 = vec_splat( v1, 1 );
9135 				v8 = vec_splat( v1, 2 );
9136 				v9 = vec_splat( v1, 3 );
9137 
9138 				// store results
9139 				ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9140 			}
9141 			// cleanup
9142 			for (; i < numSamples; i++ ) {
9143 				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9144 			}
9145 		} else {
9146 			int i = 0;
9147 
9148 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9149 			vector signed short vsOld = vec_ld( 0, &src[0] );
9150 
9151 			for ( ; i+7 < numSamples; i += 8 ) {
9152 				// load src
9153 				vs1 = vec_ld( 15, &src[i] );
9154 				vs0 = vec_perm( vsOld, vs1, permVec );
9155 				vsOld = vs1;
9156 
9157 				// unpack shorts to ints
9158 				vi0 = vec_unpackh( vs0 );
9159 				vi1 = vec_unpackl( vs0 );
9160 				// convert ints to floats
9161 				v0 = vec_ctf( vi0, 0 );
9162 				v1 = vec_ctf( vi1, 0 );
9163 				// put into vectors in order to store
9164 				v2 = vec_perm( v0, v0, vecFirstHalf );
9165 				v3 = v2;
9166 				v4 = vec_perm( v0, v0, vecSecondHalf );
9167 				v5 = v4;
9168 				v6 = vec_perm( v1, v1, vecFirstHalf );
9169 				v7 = v6;
9170 				v8 = vec_perm (v1, v1, vecSecondHalf );
9171 				v9 = v8;
9172 
9173 				// store results
9174 				ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9175 			}
9176 
9177 			for ( ; i < numSamples; i += 2 ) {
9178 				dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9179 				dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9180 			}
9181 		}
9182 	} else if ( kHz == 22050 ) {
9183 		if ( numChannels == 1 ) {
9184 			int i;
9185 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9186 			vector signed short vsOld = vec_ld( 0, &src[0] );
9187 
9188 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9189 				// load src
9190 				vs1 = vec_ld( 0, &src[i] );
9191 				vs0 = vec_perm( vsOld, vs1, permVec );
9192 				vsOld = vs1;
9193 
9194 				// unpack shorts to ints
9195 				vi0 = vec_unpackh( vs0 );
9196 				vi1 = vec_unpackl( vs0 );
9197 				// convert ints to floats
9198 				v0 = vec_ctf( vi0, 0 );
9199 				v1 = vec_ctf( vi1, 0 );
9200 				// put into vectors in order to store
9201 				v2 = vec_perm( v0, v0, vecBottom );
9202 				v3 = vec_perm( v0, v0, vecTop );
9203 				v4 = vec_perm( v1, v1, vecBottom );
9204 				v5 = vec_perm (v1, v1, vecTop );
9205 
9206 				// store results
9207 				ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9208 			}
9209 			// cleanup
9210 			for ( ; i < numSamples; i++ ) {
9211 				dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9212 			}
9213 		} else {
9214 			int i;
9215 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9216 			vector signed short vsOld = vec_ld( 0, &src[0] );
9217 
9218 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9219 				// load src
9220 				vs1 = vec_ld( 15, &src[i] );
9221 				vs0 = vec_perm( vsOld, vs1, permVec );
9222 				vsOld = vs1;
9223 
9224 				// unpack shorts to ints
9225 				vi0 = vec_unpackh( vs0 );
9226 				vi1 = vec_unpackl( vs0 );
9227 				// convert ints to floats
9228 				v0 = vec_ctf( vi0, 0 );
9229 				v1 = vec_ctf( vi1, 0 );
9230 				// put into vectors in order to store
9231 				v2 = vec_perm( v0, v0, vecFirstHalf );
9232 				v3 = vec_perm( v0, v0, vecSecondHalf );
9233 				v4 = vec_perm( v1, v1, vecFirstHalf );
9234 				v5 = vec_perm (v1, v1, vecSecondHalf );
9235 
9236 				// store results
9237 				ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9238 			}
9239 			// cleanup
9240 			for ( ; i < numSamples; i += 2 ) {
9241 				dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9242 				dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9243 			}
9244 		}
9245 	} else if ( kHz == 44100 ) {
9246 		int i;
9247 		vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9248 		vector signed short vsOld = vec_ld( 0, &src[0] );
9249 
9250 		for ( i = 0; i+7 < numSamples; i += 8 ) {
9251 			vs1 = vec_ld( 15, &src[i] );
9252 			vs0 = vec_perm( vsOld,  vs1, permVec );
9253 			vsOld = vs1;
9254 
9255 			//unpack shorts to ints
9256 			vi0 = vec_unpackh( vs0 );
9257 			vi1 = vec_unpackl( vs0 );
9258 
9259 			//convert ints to floats
9260 			v0 = vec_ctf( vi0, 0 );
9261 			v1 = vec_ctf( vi1, 0 );
9262 
9263 			//store results
9264 			ALIGNED_STORE2( &dest[i], v0, v1 );
9265 		}
9266 		//	cleanup
9267 		for ( ; i < numSamples; i++ ) {
9268 			dest[i] = (float) src[i];
9269 		}
9270 	} else {
9271 		assert( 0 );
9272 	}
9273 }
9274 
9275 #else
9276 
9277 /*
9278 ============
9279 idSIMD_AltiVec::UpSamplePCMTo44kHz
9280 
9281   Duplicate samples for 44kHz output.
9282 
9283 	Assumptions:
9284 		No assumptions
9285 ============
9286 */
UpSamplePCMTo44kHz(float * dest,const short * src,const int numSamples,const int kHz,const int numChannels)9287 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9288 
9289 	vector signed short vs0, vs1;
9290 	register vector signed int vi0, vi1;
9291 	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9292 	// permute vectors
9293 	register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9294 	register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9295 
9296 	register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9297 	register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9298 
9299 	// calculate perm vector and masks for stores
9300 	vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9301 	// original values of dest
9302 	vector float vecDest = vec_ld( 0, &dest[0] );
9303 	vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9304 
9305 	if ( kHz == 11025 ) {
9306 		if ( numChannels == 1 ) {
9307 			// 8 at a time
9308 			int i = 0;
9309 
9310 			vector signed short vsOld = vec_ld( 0, &src[i] );
9311 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9312 
9313 			for ( ; i+7 < numSamples; i+= 8 ) {
9314 				// load src
9315 				vs1 = vec_ld( 15, &src[i] );
9316 				vs0 = vec_perm( vsOld, vs1, permVec );
9317 				vsOld = vs1;
9318 				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9319 
9320 				// unpack shorts to ints
9321 				vi0 = vec_unpackh( vs0 );
9322 				vi1 = vec_unpackl( vs0 );
9323 				// convert ints to floats
9324 				v0 = vec_ctf( vi0, 0 );
9325 				v1 = vec_ctf( vi1, 0 );
9326 				// permute into vectors in the order to store
9327 
9328 				v2 = vec_splat( v0, 0 );
9329 				v3 = vec_splat( v0, 1 );
9330 				v4 = vec_splat( v0, 2 );
9331 				v5 = vec_splat( v0, 3 );
9332 				v6 = vec_splat( v1, 0 );
9333 				v7 = vec_splat( v1, 1 );
9334 				v8 = vec_splat( v1, 2 );
9335 				v9 = vec_splat( v1, 3 );
9336 
9337 				v2 = vec_perm( v2, v2, storePerm );
9338 				v3 = vec_perm( v3, v3, storePerm );
9339 				v4 = vec_perm( v4, v4, storePerm );
9340 				v5 = vec_perm( v5, v5, storePerm );
9341 				v6 = vec_perm( v6, v6, storePerm );
9342 				v7 = vec_perm( v7, v7, storePerm );
9343 				v8 = vec_perm( v8, v8, storePerm );
9344 				v9 = vec_perm( v9, v9, storePerm );
9345 
9346 				// store results
9347 				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9348 				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9349 				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9350 				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9351 				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9352 				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9353 				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9354 				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9355 				vecDest = vec_sel( v9, vecDestEnd, mask );
9356 				vec_st( vecDest, 127, &dest[i*4] );
9357 			}
9358 			// cleanup
9359 			for (; i < numSamples; i++ ) {
9360 				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9361 			}
9362 		} else {
9363 			int i = 0;
9364 
9365 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9366 			vector signed short vsOld = vec_ld( 0, &src[0] );
9367 
9368 			for ( ; i+7 < numSamples; i += 8 ) {
9369 				// load src
9370 				vs1 = vec_ld( 15, &src[i] );
9371 				vs0 = vec_perm( vsOld, vs1, permVec );
9372 				vsOld = vs1;
9373 				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9374 
9375 				// unpack shorts to ints
9376 				vi0 = vec_unpackh( vs0 );
9377 				vi1 = vec_unpackl( vs0 );
9378 				// convert ints to floats
9379 				v0 = vec_ctf( vi0, 0 );
9380 				v1 = vec_ctf( vi1, 0 );
9381 				// put into vectors in order to store
9382 				v2 = vec_perm( v0, v0, vecFirstHalf );
9383 				v3 = v2;
9384 				v4 = vec_perm( v0, v0, vecSecondHalf );
9385 				v5 = v4;
9386 				v6 = vec_perm( v1, v1, vecFirstHalf );
9387 				v7 = v6;
9388 				v8 = vec_perm (v1, v1, vecSecondHalf );
9389 				v9 = v8;
9390 
9391 				v2 = vec_perm( v2, v2, storePerm );
9392 				v3 = vec_perm( v3, v3, storePerm );
9393 				v4 = vec_perm( v4, v4, storePerm );
9394 				v5 = vec_perm( v5, v5, storePerm );
9395 				v6 = vec_perm( v6, v6, storePerm );
9396 				v7 = vec_perm( v7, v7, storePerm );
9397 				v8 = vec_perm( v8, v8, storePerm );
9398 				v9 = vec_perm( v9, v9, storePerm );
9399 
9400 				// store results
9401 				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9402 				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9403 				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9404 				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9405 				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9406 				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9407 				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9408 				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9409 				vecDest = vec_sel( v9, vecDestEnd, mask );
9410 				vec_st( vecDest, 127, &dest[i*4] );
9411 			}
9412 
9413 			for ( ; i < numSamples; i += 2 ) {
9414 				dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9415 				dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9416 			}
9417 		}
9418 	} else if ( kHz == 22050 ) {
9419 		if ( numChannels == 1 ) {
9420 			int i;
9421 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9422 			vector signed short vsOld = vec_ld( 0, &src[0] );
9423 
9424 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9425 				// load src
9426 				vs1 = vec_ld( 0, &src[i] );
9427 				vs0 = vec_perm( vsOld, vs1, permVec );
9428 				vsOld = vs1;
9429 				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9430 
9431 				// unpack shorts to ints
9432 				vi0 = vec_unpackh( vs0 );
9433 				vi1 = vec_unpackl( vs0 );
9434 				// convert ints to floats
9435 				v0 = vec_ctf( vi0, 0 );
9436 				v1 = vec_ctf( vi1, 0 );
9437 				// put into vectors in order to store
9438 				v2 = vec_perm( v0, v0, vecBottom );
9439 				v3 = vec_perm( v0, v0, vecTop );
9440 				v4 = vec_perm( v1, v1, vecBottom );
9441 				v5 = vec_perm (v1, v1, vecTop );
9442 
9443 				v2 = vec_perm( v2, v2, storePerm );
9444 				v3 = vec_perm( v3, v3, storePerm );
9445 				v4 = vec_perm( v4, v4, storePerm );
9446 				v5 = vec_perm( v5, v5, storePerm );
9447 
9448 				// store results
9449 				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9450 				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9451 				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9452 				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9453 				vecDest = vec_sel( v5, vecDestEnd, mask );
9454 				vec_st( vecDest, 63, &dest[i*2] );
9455 
9456 			}
9457 			// cleanup
9458 			for ( ; i < numSamples; i++ ) {
9459 				dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9460 			}
9461 		} else {
9462 			int i;
9463 			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9464 			vector signed short vsOld = vec_ld( 0, &src[0] );
9465 
9466 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9467 				// load src
9468 				vs1 = vec_ld( 15, &src[i] );
9469 				vs0 = vec_perm( vsOld, vs1, permVec );
9470 				vsOld = vs1;
9471 				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9472 
9473 				// unpack shorts to ints
9474 				vi0 = vec_unpackh( vs0 );
9475 				vi1 = vec_unpackl( vs0 );
9476 				// convert ints to floats
9477 				v0 = vec_ctf( vi0, 0 );
9478 				v1 = vec_ctf( vi1, 0 );
9479 				// put into vectors in order to store
9480 				v2 = vec_perm( v0, v0, vecFirstHalf );
9481 				v3 = vec_perm( v0, v0, vecSecondHalf );
9482 				v4 = vec_perm( v1, v1, vecFirstHalf );
9483 				v5 = vec_perm (v1, v1, vecSecondHalf );
9484 
9485 				v2 = vec_perm( v2, v2, storePerm );
9486 				v3 = vec_perm( v3, v3, storePerm );
9487 				v4 = vec_perm( v4, v4, storePerm );
9488 				v5 = vec_perm( v5, v5, storePerm );
9489 
9490 				// store results
9491 				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9492 				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9493 				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9494 				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9495 				vecDest = vec_sel( v5, vecDestEnd, mask );
9496 				vec_st( vecDest, 63, &dest[i*2] );
9497 			}
9498 			// cleanup
9499 			for ( ; i < numSamples; i += 2 ) {
9500 				dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9501 				dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9502 			}
9503 		}
9504 	} else if ( kHz == 44100 ) {
9505 		int i;
9506 		vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9507 		vector signed short vsOld = vec_ld( 0, &src[0] );
9508 
9509 		for ( i = 0; i+7 < numSamples; i += 8 ) {
9510 			//vs0 = vec_ld( 0, &src[i] );
9511 			vs1 = vec_ld( 15, &src[i] );
9512 			vs0 = vec_perm( vsOld,  vs1, permVec );
9513 			vsOld = vs1;
9514 			vector float vecDestEnd = vec_ld( 31, &dest[i] );
9515 
9516 			//unpack shorts to ints
9517 			vi0 = vec_unpackh( vs0 );
9518 			vi1 = vec_unpackl( vs0 );
9519 
9520 			//convert ints to floats
9521 			v0 = vec_ctf( vi0, 0 );
9522 			v1 = vec_ctf( vi1, 0 );
9523 
9524 			v0 = vec_perm( v0, v0, storePerm );
9525 			v1 = vec_perm( v1, v1, storePerm );
9526 
9527 			// store results
9528 			vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
9529 			vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
9530 			vecDest = vec_sel( v1, vecDestEnd, mask );
9531 			vec_st( vecDest, 31, &dest[i] );
9532 		}
9533 		//	cleanup
9534 		for ( ; i < numSamples; i++ ) {
9535 			dest[i] = (float) src[i];
9536 		}
9537 	} else {
9538 		assert( 0 );
9539 	}
9540 }
9541 
9542 #endif
9543 
9544 #ifdef SOUND_DEST_ALIGNED
9545 /*
9546 ============
9547 idSIMD_AltiVec::UpSampleOGGTo44kHz
9548 
9549   Duplicate samples for 44kHz output.
9550 
9551 	Assumptions:
9552 		Assumes that dest starts at aligned address
9553 ============
9554 */
UpSampleOGGTo44kHz(float * dest,const float * const * ogg,const int numSamples,const int kHz,const int numChannels)9555 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9556 	// dest is aligned
9557 	assert( IS_16BYTE_ALIGNED( dest[0] ) );
9558 
9559 	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9560 	register vector float constVec, zeroVector;
9561 	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9562 	vector unsigned char vecPerm1;
9563 	vector unsigned char vecPerm2;
9564 
9565 	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9566 	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9567 	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9568 	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9569 	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9570 	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9571 
9572 	constVec = (vector float)(32768.0f);
9573 	zeroVector = (vector float)(0.0);
9574 
9575 	if ( kHz == 11025 ) {
9576 		if ( numChannels == 1 ) {
9577 			 // calculate perm vector and do first load
9578 			 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9579 			 v10 = vec_ld( 0, &ogg[0][0] );
9580 
9581 			 int i;
9582 			 for ( i = 0; i+7 < numSamples; i += 8 ) {
9583 				// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
9584 				v8 = v10;
9585 				v9 = vec_ld( 15, &ogg[0][i] );
9586 				v10 = vec_ld( 31, &ogg[0][i] );
9587 				v0 = vec_perm( v8, v9, vecPerm1 );
9588 				v1 = vec_perm( v9, v10, vecPerm1 );
9589 
9590 				// now we have the elements in a vector, we want
9591 				// to splat them each accross their own vector
9592 				oggVec1 = vec_splat( v0, 0 );
9593 				oggVec2 = vec_splat( v0, 1 );
9594 				oggVec3 = vec_splat( v0, 2 );
9595 				oggVec4 = vec_splat( v0, 3 );
9596 				oggVec5 = vec_splat( v1, 0 );
9597 				oggVec6 = vec_splat( v1, 1 );
9598 				oggVec7 = vec_splat( v1, 2 );
9599 				oggVec8 = vec_splat( v1, 3 );
9600 
9601 				v0 = vec_madd( oggVec1, constVec, zeroVector );
9602 				v1 = vec_madd( oggVec2, constVec, zeroVector );
9603 				v2 = vec_madd( oggVec3, constVec, zeroVector );
9604 				v3 = vec_madd( oggVec4, constVec, zeroVector );
9605 				v4 = vec_madd( oggVec5, constVec, zeroVector );
9606 				v5 = vec_madd( oggVec6, constVec, zeroVector );
9607 				v6 = vec_madd( oggVec7, constVec, zeroVector );
9608 				v7 = vec_madd( oggVec8, constVec, zeroVector );
9609 
9610 				//store results
9611 				ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
9612 
9613 			}
9614 
9615 			//cleanup
9616 			for ( ; i < numSamples; i++ ) {
9617 				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9618 			}
9619 
9620 		} else {
9621 
9622 			// calculate perm vec for ogg
9623 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9624 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9625 			v7 = vec_ld( 0, &ogg[1][0] );
9626 			v9 = vec_ld( 0, &ogg[0][0] );
9627 			int i;
9628 
9629 			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) {  // +1 += 2
9630 				// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9631 				v8 = v9;
9632 				v9 = vec_ld( 15, &ogg[0][i] );
9633 				v0 = vec_perm( v8, v9, vecPerm1 );
9634 
9635 				// now we have the elements in a vector, we want
9636 				// to splat them each accross their own vector
9637 				oggVec1 = vec_splat( v0, 0 );
9638 				oggVec2 = vec_splat( v0, 1 );
9639 				oggVec3 = vec_splat( v0, 2 );
9640 				oggVec4 = vec_splat( v0, 3 );
9641 
9642 				// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9643 				v6 = v7;
9644 				v7 = vec_ld( 15, &ogg[1][i] );
9645 				v1 = vec_perm( v6, v7, vecPerm2 );
9646 
9647 				// now we have the elements in a vector, we want
9648 				// to splat them each accross their own vector
9649 				oggVec5 = vec_splat( v1, 0 );
9650 				oggVec6 = vec_splat( v1, 1 );
9651 				oggVec7 = vec_splat( v1, 2 );
9652 				oggVec8 = vec_splat( v1, 3 );
9653 
9654 				oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9655 				oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9656 				oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9657 				oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9658 				oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9659 				oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9660 				oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9661 				oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9662 
9663 				//merge generates the interleaved pattern that we want and it
9664 				//doesn't require a permute vector, so use that instead
9665 				v0 = vec_mergeh( oggVec1, oggVec5 );
9666 				v1 = vec_mergel( oggVec1, oggVec5 );
9667 				v2 = vec_mergeh( oggVec2, oggVec6 );
9668 				v3 = vec_mergel( oggVec2, oggVec6 );
9669 
9670 				v4 = vec_mergeh( oggVec3, oggVec7 );
9671 				v5 = vec_mergel( oggVec3, oggVec7 );
9672 				v6 = vec_mergeh( oggVec4, oggVec8 );
9673 				v10 = vec_mergel( oggVec4, oggVec8 );
9674 
9675 				//store results
9676 				ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
9677 			}
9678 
9679 			//cleanup
9680 			for ( ; i < numSamples >> 1; i++ ) {
9681 				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
9682 				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
9683 			}
9684 		}
9685 	} else if ( kHz == 22050 ) {
9686 		if ( numChannels == 1 ) {
9687 
9688 			 // calculate perm vector and do first load
9689 			 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9690 			 v10 = vec_ld( 0, &ogg[0][0] );
9691 
9692 			int i;
9693 
9694 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9695 				// load values from ogg
9696 				v8 = v10;
9697 				v9 = vec_ld( 15, &ogg[0][i] );
9698 				v10 = vec_ld( 31, &ogg[0][i] );
9699 				v0 = vec_perm( v8, v9, vecPerm1 );
9700 				v1 = vec_perm( v9, v10, vecPerm1 );
9701 
9702 				// multiply
9703 				v0 = vec_madd( v0, constVec, zeroVector );
9704 				v1 = vec_madd( v1, constVec, zeroVector );
9705 
9706 				// permute into results vectors to store
9707 				v5 = vec_perm( v0, v0, vecOneTwo );
9708 				v6 = vec_perm( v0, v0, vecThreeFour);
9709 				v7 = vec_perm( v1, v1, vecOneTwo );
9710 				v8 = vec_perm( v1, v1, vecThreeFour );
9711 
9712 				//store results
9713 				ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
9714 			}
9715 			// cleanup
9716 			for ( ; i < numSamples; i++ ) {
9717 				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
9718 			}
9719 		} else {
9720 
9721 			// calculate perm vector and do first load
9722 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9723 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9724 			v7 = vec_ld( 0, &ogg[1][0] );
9725 			v9 = vec_ld( 0, &ogg[0][0] );
9726 
9727 			int i;
9728 			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9729 				// load ogg[0][i] to ogg[0][i+4]
9730 				v8 = v9;
9731 				v9 = vec_ld( 15, &ogg[0][i] );
9732 				v0 = vec_perm( v8, v9, vecPerm1 );
9733 
9734 				// load ogg[1][i] to ogg[1][i+3]
9735 				v6 = v7;
9736 				v7 = vec_ld( 15, &ogg[1][i] );
9737 				v1 = vec_perm( v6, v7, vecPerm2 );
9738 
9739 				// multiply
9740 				v0 = vec_madd( v0, constVec, zeroVector );
9741 				v1 = vec_madd( v1, constVec, zeroVector );
9742 
9743 				// generate result vectors to store
9744 				v2 = vec_perm( v0, v1, vecFirst );
9745 				v3 = vec_perm( v0, v1, vecSecond );
9746 				v4 = vec_perm( v0, v1, vecThird );
9747 				v5 = vec_perm( v0, v1, vecFourth );
9748 
9749 				// store results
9750 				ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
9751 			}
9752 			// cleanup
9753 			for ( ; i < numSamples >> 1; i++ ) {
9754 				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
9755 				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
9756 			}
9757 		}
9758 	} else if ( kHz == 44100 ) {
9759 		if ( numChannels == 1 ) {
9760 			// calculate perm vector and do first load
9761 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9762 
9763 			v9 = vec_ld( 0, &ogg[0][0] );
9764 			int i;
9765 
9766 			for ( i = 0; i+7 < numSamples; i += 8 ) {
9767 				// load values from ogg
9768 				v8 = v9;
9769 				v7 = vec_ld( 15, &ogg[0][i] );
9770 				v6 = v7;
9771 				v9 = vec_ld( 31, &ogg[0][i] );
9772 
9773 				v0 = vec_perm( v8, v7, vecPerm1 );
9774 				v1 = vec_perm( v6, v9, vecPerm1 );
9775 
9776 				// multiply
9777 				v0 = vec_madd( v0, constVec, zeroVector );
9778 				v1 = vec_madd( v1, constVec, zeroVector );
9779 
9780 				ALIGNED_STORE2( &dest[i], v0, v1 );
9781 			}
9782 
9783 			// cleanup
9784 			for ( ; i < numSamples; i++ ) {
9785 				dest[i*1+0] = ogg[0][i] * 32768.0f;
9786 			}
9787 		} else {
9788 
9789 			// calculate perm vector and do first load
9790 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9791 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9792 			v7 = vec_ld( 0, &ogg[1][0] );
9793 			v9 = vec_ld( 0, &ogg[0][0] );
9794 			int i;
9795 
9796 			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9797 				v8 = v9;
9798 				v9 = vec_ld( 15, &ogg[0][i] );
9799 				v0 = vec_perm( v8, v9, vecPerm1 );
9800 
9801 				// load ogg[1][i] to ogg[1][i+3]
9802 				v6 = v7;
9803 				v7 = vec_ld( 15, &ogg[1][i] );
9804 				v1 = vec_perm( v6, v7, vecPerm2 );
9805 
9806 				// multiply
9807 				v0 = vec_madd( v0, constVec, zeroVector );
9808 				v1 = vec_madd( v1, constVec, zeroVector );
9809 
9810 				// generate result vectors
9811 				v2 = vec_mergeh( v0, v1 );
9812 				v3 = vec_mergel( v0, v1 );
9813 
9814 				// store results
9815 				ALIGNED_STORE2( &dest[i*2], v2, v3 );
9816 			}
9817 			// cleanup
9818 			for ( ; i < numSamples >> 1; i++ ) {
9819 				dest[i*2+0] = ogg[0][i] * 32768.0f;
9820 				dest[i*2+1] = ogg[1][i] * 32768.0f;
9821 			}
9822 		}
9823 	} else {
9824 		assert( 0 );
9825 	}
9826 }
9827 
9828 #else
9829 
9830 /*
9831 ============
9832 idSIMD_AltiVec::UpSampleOGGTo44kHz
9833 
9834   Duplicate samples for 44kHz output.
9835 
9836 	Assumptions:
9837 		No assumptions
9838 ============
9839 */
UpSampleOGGTo44kHz(float * dest,const float * const * ogg,const int numSamples,const int kHz,const int numChannels)9840 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9841 
9842 	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9843 	register vector float constVec, zeroVector;
9844 	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9845 	vector unsigned char vecPerm1;
9846 	vector unsigned char vecPerm2;
9847 
9848 	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9849 	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9850 	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9851 	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9852 	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9853 	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9854 
9855 	vector unsigned char storePerm;
9856 
9857 	constVec = (vector float)(32768.0f);
9858 	zeroVector = (vector float)(0.0);
9859 
9860 	 // calculate perm vector and masks for stores
9861 	 storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9862 	 // original values of dest
9863 	 vector float vecDest = vec_ld( 0, &dest[0] );
9864 	 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9865 
9866 	if ( kHz == 11025 ) {
9867 		if ( numChannels == 1 ) {
9868 			 // calculate perm vector and do first load
9869 			 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9870 			 v10 = vec_ld( 0, &ogg[0][0] );
9871 
9872 			 int i;
9873 			 for ( i = 0; i+7 < numSamples; i += 8 ) {
9874 				// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
9875 				v8 = v10;
9876 				v9 = vec_ld( 15, &ogg[0][i] );
9877 				v10 = vec_ld( 31, &ogg[0][i] );
9878 				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9879 				v0 = vec_perm( v8, v9, vecPerm1 );
9880 				v1 = vec_perm( v9, v10, vecPerm1 );
9881 
9882 				// now we have the elements in a vector, we want
9883 				// to splat them each accross their own vector
9884 				oggVec1 = vec_splat( v0, 0 );
9885 				oggVec2 = vec_splat( v0, 1 );
9886 				oggVec3 = vec_splat( v0, 2 );
9887 				oggVec4 = vec_splat( v0, 3 );
9888 				oggVec5 = vec_splat( v1, 0 );
9889 				oggVec6 = vec_splat( v1, 1 );
9890 				oggVec7 = vec_splat( v1, 2 );
9891 				oggVec8 = vec_splat( v1, 3 );
9892 
9893 				v0 = vec_madd( oggVec1, constVec, zeroVector );
9894 				v1 = vec_madd( oggVec2, constVec, zeroVector );
9895 				v2 = vec_madd( oggVec3, constVec, zeroVector );
9896 				v3 = vec_madd( oggVec4, constVec, zeroVector );
9897 				v4 = vec_madd( oggVec5, constVec, zeroVector );
9898 				v5 = vec_madd( oggVec6, constVec, zeroVector );
9899 				v6 = vec_madd( oggVec7, constVec, zeroVector );
9900 				v7 = vec_madd( oggVec8, constVec, zeroVector );
9901 
9902 				// rotate input data
9903 				v0 = vec_perm( v0, v0, storePerm );
9904 				v1 = vec_perm( v1, v1, storePerm );
9905 				v2 = vec_perm( v2, v2, storePerm );
9906 				v3 = vec_perm( v3, v3, storePerm );
9907 				v4 = vec_perm( v4, v4, storePerm );
9908 				v5 = vec_perm( v5, v5, storePerm );
9909 				v6 = vec_perm( v6, v6, storePerm );
9910 				v7 = vec_perm( v7, v7, storePerm );
9911 
9912 				// store results
9913 				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
9914 				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
9915 				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
9916 				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
9917 				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
9918 				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
9919 				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
9920 				vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
9921 				vecDest = vec_sel( v7, vecDestEnd, mask );
9922 				vec_st( vecDest, 127, &dest[i*4] );
9923 			}
9924 
9925 			//cleanup
9926 			for ( ; i < numSamples; i++ ) {
9927 				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9928 			}
9929 
9930 		} else {
9931 
9932 			// calculate perm vec for ogg
9933 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9934 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9935 			v7 = vec_ld( 0, &ogg[1][0] );
9936 			v9 = vec_ld( 0, &ogg[0][0] );
9937 			int i;
9938 
9939 			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) {  // +1 += 2
9940 				// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9941 				v8 = v9;
9942 				v9 = vec_ld( 15, &ogg[0][i] );
9943 				vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
9944 				v0 = vec_perm( v8, v9, vecPerm1 );
9945 
9946 				// now we have the elements in a vector, we want
9947 				// to splat them each accross their own vector
9948 				oggVec1 = vec_splat( v0, 0 );
9949 				oggVec2 = vec_splat( v0, 1 );
9950 				oggVec3 = vec_splat( v0, 2 );
9951 				oggVec4 = vec_splat( v0, 3 );
9952 
9953 				// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9954 				v6 = v7;
9955 				v7 = vec_ld( 15, &ogg[1][i] );
9956 				v1 = vec_perm( v6, v7, vecPerm2 );
9957 
9958 				// now we have the elements in a vector, we want
9959 				// to splat them each accross their own vector
9960 				oggVec5 = vec_splat( v1, 0 );
9961 				oggVec6 = vec_splat( v1, 1 );
9962 				oggVec7 = vec_splat( v1, 2 );
9963 				oggVec8 = vec_splat( v1, 3 );
9964 
9965 				oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9966 				oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9967 				oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9968 				oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9969 				oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9970 				oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9971 				oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9972 				oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9973 
9974 				//merge generates the interleaved pattern that we want and it
9975 				//doesn't require a permute vector, so use that instead
9976 				v0 = vec_mergeh( oggVec1, oggVec5 );
9977 				v1 = vec_mergel( oggVec1, oggVec5 );
9978 				v2 = vec_mergeh( oggVec2, oggVec6 );
9979 				v3 = vec_mergel( oggVec2, oggVec6 );
9980 
9981 				v4 = vec_mergeh( oggVec3, oggVec7 );
9982 				v5 = vec_mergel( oggVec3, oggVec7 );
9983 				v6 = vec_mergeh( oggVec4, oggVec8 );
9984 				v10 = vec_mergel( oggVec4, oggVec8 );
9985 
9986 				// rotate input data
9987 				v0 = vec_perm( v0, v0, storePerm );
9988 				v1 = vec_perm( v1, v1, storePerm );
9989 				v2 = vec_perm( v2, v2, storePerm );
9990 				v3 = vec_perm( v3, v3, storePerm );
9991 				v4 = vec_perm( v4, v4, storePerm );
9992 				v5 = vec_perm( v5, v5, storePerm );
9993 				v6 = vec_perm( v6, v6, storePerm );
9994 				v10 = vec_perm( v10, v10, storePerm );
9995 
9996 				// store results
9997 				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
9998 				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
9999 				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
10000 				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
10001 				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
10002 				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
10003 				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
10004 				vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
10005 				vecDest = vec_sel( v10, vecDestEnd, mask );
10006 				vec_st( vecDest, 127, &dest[i*8] );
10007 			}
10008 
10009 			//cleanup
10010 			for ( ; i < numSamples >> 1; i++ ) {
10011 				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
10012 				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
10013 			}
10014 		}
10015 	} else if ( kHz == 22050 ) {
10016 		if ( numChannels == 1 ) {
10017 
10018 		 // calculate perm vector and do first load
10019 			 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10020 			 v10 = vec_ld( 0, &ogg[0][0] );
10021 
10022 			int i;
10023 
10024 			for ( i = 0; i+7 < numSamples; i += 8 ) {
10025 
10026 				// load values from ogg
10027 				v8 = v10;
10028 				v9 = vec_ld( 15, &ogg[0][i] );
10029 				v10 = vec_ld( 31, &ogg[0][i] );
10030 				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
10031 				v0 = vec_perm( v8, v9, vecPerm1 );
10032 				v1 = vec_perm( v9, v10, vecPerm1 );
10033 
10034 				// multiply
10035 				v0 = vec_madd( v0, constVec, zeroVector );
10036 				v1 = vec_madd( v1, constVec, zeroVector );
10037 
10038 				// permute into results vectors to store
10039 				v5 = vec_perm( v0, v0, vecOneTwo );
10040 				v6 = vec_perm( v0, v0, vecThreeFour);
10041 				v7 = vec_perm( v1, v1, vecOneTwo );
10042 				v8 = vec_perm( v1, v1, vecThreeFour );
10043 
10044 				// rotate input data
10045 				v5 = vec_perm( v5, v5, storePerm );
10046 				v6 = vec_perm( v6, v6, storePerm );
10047 				v7 = vec_perm( v7, v7, storePerm );
10048 				v8 = vec_perm( v8, v8, storePerm );
10049 
10050 				// store results
10051 				vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
10052 				vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
10053 				vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
10054 				vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
10055 				vecDest = vec_sel( v8, vecDestEnd, mask );
10056 				vec_st( vecDest, 63, &dest[i*2] );
10057 			}
10058 
10059 			// cleanup
10060 			for ( ; i < numSamples; i++ ) {
10061 				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
10062 			}
10063 		} else {
10064 
10065 			// calculate perm vector and do first load
10066 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10067 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10068 			v7 = vec_ld( 0, &ogg[1][0] );
10069 			v9 = vec_ld( 0, &ogg[0][0] );
10070 
10071 			int i;
10072 			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10073 				// load ogg[0][i] to ogg[0][i+4]
10074 				v8 = v9;
10075 				v9 = vec_ld( 15, &ogg[0][i] );
10076 				vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
10077 				v0 = vec_perm( v8, v9, vecPerm1 );
10078 
10079 				// load ogg[1][i] to ogg[1][i+3]
10080 				v6 = v7;
10081 				v7 = vec_ld( 15, &ogg[1][i] );
10082 				v1 = vec_perm( v6, v7, vecPerm2 );
10083 
10084 				// multiply
10085 				v0 = vec_madd( v0, constVec, zeroVector );
10086 				v1 = vec_madd( v1, constVec, zeroVector );
10087 
10088 				// generate result vectors to store
10089 				v2 = vec_perm( v0, v1, vecFirst );
10090 				v3 = vec_perm( v0, v1, vecSecond );
10091 				v4 = vec_perm( v0, v1, vecThird );
10092 				v5 = vec_perm( v0, v1, vecFourth );
10093 
10094 				// rotate input data
10095 				v2 = vec_perm( v2, v2, storePerm );
10096 				v3 = vec_perm( v3, v3, storePerm );
10097 				v4 = vec_perm( v4, v4, storePerm );
10098 				v5 = vec_perm( v5, v5, storePerm );
10099 
10100 				// store results
10101 				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
10102 				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
10103 				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
10104 				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
10105 				vecDest = vec_sel( v5, vecDestEnd, mask );
10106 				vec_st( vecDest, 63, &dest[i*4] );
10107 			}
10108 
10109 			// cleanup
10110 			for ( ; i < numSamples >> 1; i++ ) {
10111 				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
10112 				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
10113 			}
10114 		}
10115 	} else if ( kHz == 44100 ) {
10116 		if ( numChannels == 1 ) {
10117 			// calculate perm vector and do first load
10118 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10119 
10120 			v9 = vec_ld( 0, &ogg[0][0] );
10121 			int i;
10122 
10123 			for ( i = 0; i+7 < numSamples; i += 8 ) {
10124 				// load values from ogg
10125 				v8 = v9;
10126 				v7 = vec_ld( 15, &ogg[0][i] );
10127 				v6 = v7;
10128 				v9 = vec_ld( 31, &ogg[0][i] );
10129 				vector float vecDestEnd = vec_ld( 31, &dest[i] );
10130 
10131 				v0 = vec_perm( v8, v7, vecPerm1 );
10132 				v1 = vec_perm( v6, v9, vecPerm1 );
10133 
10134 				// multiply
10135 				v0 = vec_madd( v0, constVec, zeroVector );
10136 				v1 = vec_madd( v1, constVec, zeroVector );
10137 
10138 				// rotate data
10139 				v0 = vec_perm( v0, v0, storePerm );
10140 				v1 = vec_perm( v1, v1, storePerm );
10141 
10142 				// store results
10143 				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
10144 				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
10145 				vecDest = vec_sel( v1, vecDestEnd, mask );
10146 				vec_st( vecDest, 31, &dest[i] );
10147 			}
10148 
10149 			// cleanup
10150 			for ( ; i < numSamples; i++ ) {
10151 				dest[i*1+0] = ogg[0][i] * 32768.0f;
10152 			}
10153 		} else {
10154 
10155 			// calculate perm vector and do first load
10156 			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10157 			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10158 			v7 = vec_ld( 0, &ogg[1][0] );
10159 			v9 = vec_ld( 0, &ogg[0][0] );
10160 			int i;
10161 
10162 			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10163 				v8 = v9;
10164 				v9 = vec_ld( 15, &ogg[0][i] );
10165 				v0 = vec_perm( v8, v9, vecPerm1 );
10166 
10167 				// load ogg[1][i] to ogg[1][i+3]
10168 				v6 = v7;
10169 				v7 = vec_ld( 15, &ogg[1][i] );
10170 				v1 = vec_perm( v6, v7, vecPerm2 );
10171 
10172 				// multiply
10173 				v0 = vec_madd( v0, constVec, zeroVector );
10174 				v1 = vec_madd( v1, constVec, zeroVector );
10175 
10176 				// generate result vectors
10177 				v2 = vec_mergeh( v0, v1 );
10178 				v3 = vec_mergel( v0, v1 );
10179 
10180 				// store results
10181 				UNALIGNED_STORE2( &dest[i*2], v2, v3 );
10182 			}
10183 			// cleanup
10184 			for ( ; i < numSamples >> 1; i++ ) {
10185 				dest[i*2+0] = ogg[0][i] * 32768.0f;
10186 				dest[i*2+1] = ogg[1][i] * 32768.0f;
10187 			}
10188 		}
10189 	} else {
10190 		assert( 0 );
10191 	}
10192 }
10193 #endif /* SOUND_DEST_ALIGNED */
10194 
10195 #ifdef SOUND_DEST_ALIGNED
10196 /*
10197 ============
10198 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10199 
10200 	Assumptions:
10201 		Assumes that mixBuffer starts at aligned address
10202 ============
10203 */
MixSoundTwoSpeakerMono(float * mixBuffer,const float * samples,const int numSamples,const float lastV[2],const float currentV[2])10204 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10205 
10206 	// mixBuffer is aligned
10207 	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10208 
10209 	int i;
10210 	float inc[2];
10211 	float spkr[4];
10212 
10213 	register vector float vecInc;
10214 	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10215 	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10216 	register vector float vecSamplesLd1, vecSamplesLd2;
10217 	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10218 
10219 	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10220 	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10221 	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10222 	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10223 
10224 	//constants
10225 	vector float fourVec = (vector float)(4.0);
10226 	vector float zeroVec = (vector float)(0.0);
10227 
10228 	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10229 	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10230 
10231 	spkr[0] = lastV[0];
10232 	spkr[1] = lastV[1];
10233 	spkr[2] = lastV[0] + inc[0];
10234 	spkr[3] = lastV[1] + inc[1];
10235 
10236 	assert( numSamples == MIXBUFFER_SAMPLES );
10237 
10238 	inc[0] *= 2;
10239 	inc[1] *= 2;
10240 
10241 	//load data into registers
10242 	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10243 	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10244 	vecInc = vec_mergeh( v0, v1 );
10245 
10246 	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10247 	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10248 	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10249 	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10250 
10251 	// load spkr array
10252 	v0 = vec_mergeh( v2, v4 );
10253 	v1 = vec_mergeh( v3, v5 );
10254 	vecSpeaker1 = vec_mergeh( v0, v1 );
10255 
10256 	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10257 	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10258 	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10259 	vecInc = vec_madd( vecInc, fourVec, zeroVec );
10260 
10261 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10262 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10263 
10264 	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10265 	//need a cleanup loop
10266 	for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10267 
10268 		//load samples and mix buffers
10269 		vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
10270 		vecSamplesLd2 = vec_ld( 15, &samples[i] );
10271 		vecSamplesLast = vec_ld( 31, &samples[i] );
10272 
10273 		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10274 		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10275 
10276 		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10277 		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10278 		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10279 		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10280 
10281 		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10282 		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10283 		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10284 		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10285 
10286 		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10287 		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10288 		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10289 		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10290 
10291 		// store results
10292 		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10293 
10294 		//add for next iteration
10295 		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10296 		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10297 		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10298 		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10299 	}
10300 }
10301 
10302 #else
10303 
10304 /*
10305 ============
10306 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10307 
10308 	Assumptions:
10309 		No assumptions
10310 ============
10311 */
MixSoundTwoSpeakerMono(float * mixBuffer,const float * samples,const int numSamples,const float lastV[2],const float currentV[2])10312 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10313 
10314 	int i;
10315 	float inc[2];
10316 	float spkr[4];
10317 
10318 	register vector float vecInc;
10319 	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10320 	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10321 	register vector float vecSamplesLd1, vecSamplesLd2;
10322 	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10323 
10324 	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10325 	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10326 	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10327 	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10328 
10329 	//constants
10330 	vector float fourVec = (vector float)(4.0);
10331 	vector float zeroVec = (vector float)(0.0);
10332 
10333 	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10334 	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10335 
10336 	spkr[0] = lastV[0];
10337 	spkr[1] = lastV[1];
10338 	spkr[2] = lastV[0] + inc[0];
10339 	spkr[3] = lastV[1] + inc[1];
10340 
10341 	assert( numSamples == MIXBUFFER_SAMPLES );
10342 
10343 	inc[0] *= 2;
10344 	inc[1] *= 2;
10345 
10346 	//load data into registers
10347 	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10348 	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10349 	vecInc = vec_mergeh( v0, v1 );
10350 
10351 	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10352 	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10353 	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10354 	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10355 
10356 	// load spkr array
10357 	v0 = vec_mergeh( v2, v4 );
10358 	v1 = vec_mergeh( v3, v5 );
10359 	vecSpeaker1 = vec_mergeh( v0, v1 );
10360 
10361 	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10362 	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10363 	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10364 	vecInc = vec_madd( vecInc, fourVec, zeroVec );
10365 
10366 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10367 	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
10368 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10369 	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10370 
10371 	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10372 	//need a cleanup loop
10373 	for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10374 
10375 		//load samples and mix buffers
10376 		vecSamplesLd1 = vecSamplesLast;
10377 		vecSamplesLd2 = vec_ld( 15, &samples[i] );
10378 		vecSamplesLast = vec_ld( 31, &samples[i] );
10379 
10380 		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10381 		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10382 
10383 		vecMixBuffer1 = vecDest;
10384 		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10385 		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10386 		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10387 		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10388 
10389 		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10390 		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10391 		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10392 		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10393 
10394 		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10395 		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10396 		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10397 		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10398 
10399 		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10400 		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10401 		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10402 		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10403 
10404 		// store results
10405 		UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10406 
10407 		//add for next iteration
10408 		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10409 		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10410 		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10411 		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10412 	}
10413 }
10414 
10415 #endif /* SOUND_DEST_ALIGNED */
10416 
10417 #ifdef SOUND_DEST_ALIGNED
10418 /*
10419 ============
10420 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10421 
10422 	Assumptions:
10423 		Assumes that mixBuffer starts at aligned address
10424 ============
10425 */
MixSoundTwoSpeakerStereo(float * mixBuffer,const float * samples,const int numSamples,const float lastV[2],const float currentV[2])10426 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10427 	// mixBuffer is aligned
10428 	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10429 
10430 	int i, k;
10431 	float inc[2];
10432 	float spkr[4];
10433 
10434 	// loading buffers
10435 	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10436 	// loading buffers
10437 	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10438 	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10439 	register vector float vecInc;
10440 	vector float fourVec = (vector float)(4.0);
10441 	vector float zeroVec = (vector float)(0.0);
10442 
10443 	assert( numSamples == MIXBUFFER_SAMPLES );
10444 
10445 	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10446 	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10447 
10448 	spkr[0] = lastV[0];
10449 	spkr[1] = lastV[1];
10450 	spkr[2] = lastV[0] + inc[0];
10451 	spkr[3] = lastV[1] + inc[1];
10452 
10453 	for ( k = 0; k < 2; k++ ) {
10454 		inc[k] *= 2;
10455 	}
10456 
10457 	// load data in vectors
10458 	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10459 	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10460 	vecInc = vec_mergeh( v0, v1 );
10461 
10462 	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10463 	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10464 	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10465 	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10466 
10467 	// load spkr array
10468 	v0 = vec_mergeh( v2, v4 );
10469 	v1 = vec_mergeh( v3, v5 );
10470 	vecSpeaker1 = vec_mergeh( v0, v1 );
10471 
10472 	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10473 	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10474 	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10475 	vecInc = vec_madd( vecInc, fourVec, zeroVec );
10476 
10477 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10478 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10479 
10480 	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10481 	//need a cleanup loop
10482 	for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10483 		// load mix buffers and samples
10484 		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10485 		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10486 		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10487 		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10488 
10489 		vecSamples1 = vecSamplesLast;
10490 		vecSamples2 = vec_ld( 15, &samples[i*2] );
10491 		vecSamples3 = vec_ld( 31, &samples[i*2] );
10492 		vecSamples4 = vec_ld( 47, &samples[i*2] );
10493 		vecSamplesLast = vec_ld( 63, &samples[i*2] );
10494 
10495 		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10496 		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10497 		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10498 		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10499 
10500 		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10501 		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10502 		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10503 		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10504 
10505 		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10506 		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10507 		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10508 		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10509 
10510 		//store results
10511 		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10512 	}
10513 }
10514 #else
10515 
10516 /*
10517 ============
10518 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10519 
10520 	Assumptions:
10521 		No assumptions
10522 ============
10523 */
MixSoundTwoSpeakerStereo(float * mixBuffer,const float * samples,const int numSamples,const float lastV[2],const float currentV[2])10524 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10525 
10526 	int i, k;
10527 	float inc[2];
10528 	float spkr[4];
10529 	// loading buffers
10530 	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10531 	// loading buffers
10532 	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10533 	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10534 	register vector float vecInc;
10535 	vector float fourVec = (vector float)(4.0);
10536 	vector float zeroVec = (vector float)(0.0);
10537 
10538 	assert( numSamples == MIXBUFFER_SAMPLES );
10539 
10540 	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10541 	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10542 
10543 	spkr[0] = lastV[0];
10544 	spkr[1] = lastV[1];
10545 	spkr[2] = lastV[0] + inc[0];
10546 	spkr[3] = lastV[1] + inc[1];
10547 
10548 	for ( k = 0; k < 2; k++ ) {
10549 		inc[k] *= 2;
10550 	}
10551 
10552 	// load data in vectors
10553 	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10554 	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10555 	vecInc = vec_mergeh( v0, v1 );
10556 
10557 	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10558 	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10559 	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10560 	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10561 
10562 	// load spkr array
10563 	v0 = vec_mergeh( v2, v4 );
10564 	v1 = vec_mergeh( v3, v5 );
10565 	vecSpeaker1 = vec_mergeh( v0, v1 );
10566 
10567 	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10568 	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10569 	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10570 	vecInc = vec_madd( vecInc, fourVec, zeroVec );
10571 
10572 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10573 	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10574 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10575 	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10576 
10577 	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10578 	//need a cleanup loop
10579 	for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10580 		// load mix buffers and samples
10581 		vecMixBuffer1 = vecDest;
10582 		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10583 		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10584 		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10585 		vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10586 
10587 		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10588 		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10589 		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10590 		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10591 
10592 		vecSamples1 = vecSamplesLast;
10593 		vecSamples2 = vec_ld( 15, &samples[i*2] );
10594 		vecSamples3 = vec_ld( 31, &samples[i*2] );
10595 		vecSamples4 = vec_ld( 47, &samples[i*2] );
10596 		vecSamplesLast = vec_ld( 63, &samples[i*2] );
10597 
10598 		vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10599 		vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10600 		vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10601 		vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10602 
10603 		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10604 		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10605 		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10606 		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10607 
10608 		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10609 		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10610 		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10611 		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10612 
10613 		// store results
10614 		UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10615 	}
10616 }
10617 
10618 #endif /* SOUND_DEST_ALIGNED */
10619 
10620 #ifdef SOUND_DEST_ALIGNED
10621 /*
10622 ============
10623 idSIMD_AltiVec::MixSoundSixSpeakerMono
10624 
10625 	Assumptions:
10626 		Assumes that mixBuffer starts at aligned address
10627 ============
10628 */
MixSoundSixSpeakerMono(float * mixBuffer,const float * samples,const int numSamples,const float lastV[6],const float currentV[6])10629 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10630 
10631 	// mixBuffer is aligned
10632 	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10633 
10634 	float incL[24];
10635 	float sL[24];
10636 	int i, k;
10637 
10638 	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10639 	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10640 	vector float vecSamplesLd;
10641 	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10642 	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10643 	// permute vectors for sample
10644 	vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10645 	vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10646 
10647 	assert( numSamples == MIXBUFFER_SAMPLES );
10648 	assert( SPEAKER_RIGHT == 1 );
10649 	assert( SPEAKER_BACKRIGHT == 5 );
10650 
10651 	// incL array, 6 elements repeated
10652 	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10653 	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10654 	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10655 	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10656 	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10657 	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10658 
10659 	// sL array repeated
10660 	for ( k = 0; k < 6; k++ ) {
10661 		sL[k] = lastV[k];
10662 	}
10663 	for ( k = 6; k < 12; k++ ) {
10664 		sL[k] = lastV[k-6] + incL[k];
10665 	}
10666 	for ( k = 12; k < 18; k++ ) {
10667 		sL[k] = lastV[k-12] + incL[k] + incL[k];
10668 	}
10669 	for ( k = 18; k < 24; k++ ) {
10670 		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10671 	}
10672 
10673 	// multiply by 2 since doing 12 at a time
10674 	for ( k = 0; k < 24; k++ ) {
10675 		incL[k] *= 4;
10676 	}
10677 
10678 	//load the data
10679 	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10680 	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10681 
10682 	vecIncl1 = vec_ld( 0, &incL[0] );
10683 	vecIncl2 = vec_ld( 15, &incL[0] );
10684 	vecIncl3 = vec_ld( 31, &incL[0] );
10685 	vecIncl4 = vec_ld( 47, &incL[0] );
10686 	vecIncl5 = vec_ld( 63, &incL[0] );
10687 	vecIncl6 = vec_ld( 79, &incL[0] );
10688 	vecIncl7 = vec_ld( 95, &incL[0] );
10689 
10690 	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10691 	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10692 	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10693 	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
10694 	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
10695 	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
10696 
10697 	vecSL1 = vec_ld( 0, &sL[0] );
10698 	vecSL2 = vec_ld( 15, &sL[0] );
10699 	vecSL3 = vec_ld( 31, &sL[0] );
10700 	vecSL4 = vec_ld( 47, &sL[0] );
10701 	vecSL5 = vec_ld( 63, &sL[0] );
10702 	vecSL6 = vec_ld( 79, &sL[0] );
10703 	vecSL7 = vec_ld( 95, &sL[0] );
10704 
10705 	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10706 	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10707 	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10708 	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
10709 	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
10710 	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
10711 
10712 
10713 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10714 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10715 
10716 	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
10717 	//need a cleanup loop
10718 	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
10719 		//load mix buffer into vectors, assume aligned
10720 		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
10721 		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
10722 		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
10723 		vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
10724 		vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
10725 		vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
10726 
10727 		//load samples into vector
10728 		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
10729 		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
10730 		vecSamplesLast = vecSamplesLd2;
10731 
10732 		//permute to get them ordered how we want
10733 		vecSamples1 = vec_splat( vecSamplesLd, 0 );
10734 		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
10735 		vecSamples3 = vec_splat( vecSamplesLd, 1 );
10736 		vecSamples4 = vec_splat( vecSamplesLd, 2 );
10737 		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
10738 		vecSamples6 = vec_splat( vecSamplesLd, 3 );
10739 
10740 		//do calculation
10741 		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
10742 		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
10743 		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
10744 		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
10745 		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
10746 		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
10747 
10748 		//store out results
10749 		ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
10750 
10751 		// add for next iteration
10752 		vecSL1 = vec_add( vecSL1, vecIncl1 );
10753 		vecSL2 = vec_add( vecSL2, vecIncl2 );
10754 		vecSL3 = vec_add( vecSL3, vecIncl3 );
10755 		vecSL4 = vec_add( vecSL4, vecIncl4 );
10756 		vecSL5 = vec_add( vecSL5, vecIncl5 );
10757 		vecSL6 = vec_add( vecSL6, vecIncl6 );
10758 	}
10759 }
10760 #else
10761 
10762 /*
10763 ============
10764 idSIMD_AltiVec::MixSoundSixSpeakerMono
10765 
10766 	Assumptions:
10767 		No assumptions
10768 ============
10769 */
MixSoundSixSpeakerMono(float * mixBuffer,const float * samples,const int numSamples,const float lastV[6],const float currentV[6])10770 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10771 
10772 	float incL[24];
10773 	float sL[24];
10774 	int i, k;
10775 
10776 	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10777 	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10778 	vector float vecSamplesLd;
10779 	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10780 	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10781 	// permute vectors for sample
10782 	register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10783 	register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10784 
10785 	assert( numSamples == MIXBUFFER_SAMPLES );
10786 	assert( SPEAKER_RIGHT == 1 );
10787 	assert( SPEAKER_BACKRIGHT == 5 );
10788 
10789 	// incL array, 6 elements repeated
10790 	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10791 	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10792 	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10793 	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10794 	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10795 	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10796 
10797 	// sL array repeated
10798 	for ( k = 0; k < 6; k++ ) {
10799 		sL[k] = lastV[k];
10800 	}
10801 	for ( k = 6; k < 12; k++ ) {
10802 		sL[k] = lastV[k-6] + incL[k];
10803 	}
10804 	for ( k = 12; k < 18; k++ ) {
10805 		sL[k] = lastV[k-12] + incL[k] + incL[k];
10806 	}
10807 	for ( k = 18; k < 24; k++ ) {
10808 		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10809 	}
10810 
10811 	// multiply by 2 since doing 12 at a time
10812 	for ( k = 0; k < 24; k++ ) {
10813 		incL[k] *= 4;
10814 	}
10815 
10816 	// load the data
10817 	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10818 	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10819 
10820 	vecIncl1 = vec_ld( 0, &incL[0] );
10821 	vecIncl2 = vec_ld( 15, &incL[0] );
10822 	vecIncl3 = vec_ld( 31, &incL[0] );
10823 	vecIncl4 = vec_ld( 47, &incL[0] );
10824 	vecIncl5 = vec_ld( 63, &incL[0] );
10825 	vecIncl6 = vec_ld( 79, &incL[0] );
10826 	vecIncl7 = vec_ld( 95, &incL[0] );
10827 
10828 	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10829 	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10830 	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10831 	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
10832 	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
10833 	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
10834 
10835 	vecSL1 = vec_ld( 0, &sL[0] );
10836 	vecSL2 = vec_ld( 15, &sL[0] );
10837 	vecSL3 = vec_ld( 31, &sL[0] );
10838 	vecSL4 = vec_ld( 47, &sL[0] );
10839 	vecSL5 = vec_ld( 63, &sL[0] );
10840 	vecSL6 = vec_ld( 79, &sL[0] );
10841 	vecSL7 = vec_ld( 95, &sL[0] );
10842 
10843 	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10844 	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10845 	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10846 	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
10847 	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
10848 	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
10849 
10850 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10851 	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10852 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10853 	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10854 
10855 	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
10856 	//need a cleanup loop
10857 	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
10858 		//load mix buffer into vectors
10859 		vecMixBuffer1 = vecDest;
10860 		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
10861 		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
10862 		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
10863 		vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
10864 		vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
10865 		vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
10866 
10867 		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10868 		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10869 		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10870 		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
10871 		vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
10872 		vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
10873 
10874 		//load samples into vector
10875 		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
10876 		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
10877 		vecSamplesLast = vecSamplesLd2;
10878 
10879 		//permute to get them ordered how we want
10880 		vecSamples1 = vec_splat( vecSamplesLd, 0 );
10881 		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
10882 		vecSamples3 = vec_splat( vecSamplesLd, 1 );
10883 		vecSamples4 = vec_splat( vecSamplesLd, 2 );
10884 		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
10885 		vecSamples6 = vec_splat( vecSamplesLd, 3 );
10886 
10887 		//do calculation
10888 		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
10889 		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
10890 		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
10891 		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
10892 		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
10893 		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
10894 
10895 		// store results
10896 		UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
10897 
10898 		// add for next iteration
10899 		vecSL1 = vec_add( vecSL1, vecIncl1 );
10900 		vecSL2 = vec_add( vecSL2, vecIncl2 );
10901 		vecSL3 = vec_add( vecSL3, vecIncl3 );
10902 		vecSL4 = vec_add( vecSL4, vecIncl4 );
10903 		vecSL5 = vec_add( vecSL5, vecIncl5 );
10904 		vecSL6 = vec_add( vecSL6, vecIncl6 );
10905 	}
10906 }
10907 
10908 #endif /* SOUND_DEST_ALIGNED */
10909 
10910 #ifdef SOUND_DEST_ALIGNED
10911 /*
10912 ============
10913 idSIMD_AltiVec::MixSoundSixSpeakerStereo
10914 
10915 	Assumptions:
10916 		Assumes that mixBuffer starts at aligned address
10917 ============
10918 */
10919 
MixSoundSixSpeakerStereo(float * mixBuffer,const float * samples,const int numSamples,const float lastV[6],const float currentV[6])10920 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10921 
10922 	// mixBuffer is aligned
10923 	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10924 
10925 	float incL[12];
10926 	float sL[12];
10927 	int i;
10928 	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
10929 	vector float vecSL1, vecSL2, vecSL3, vecSL4;
10930 	vector float vecSamplesLd;
10931 	vector float vecSamples1, vecSamples2, vecSamples3;
10932 	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
10933 	// permute vectors for sample
10934 	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
10935 	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
10936 
10937 	assert( numSamples == MIXBUFFER_SAMPLES );
10938 	assert( SPEAKER_RIGHT == 1 );
10939 	assert( SPEAKER_BACKRIGHT == 5 );
10940 
10941 	// incL array, 6 elements repeated
10942 	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10943 	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10944 	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10945 	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10946 	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10947 	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10948 
10949 	// sL array repeated
10950 	sL[0] = lastV[0];
10951 	sL[1] = lastV[1];
10952 	sL[2] = lastV[2];
10953 	sL[3] = lastV[3];
10954 	sL[4] = lastV[4];
10955 	sL[5] = lastV[5];
10956 	sL[6] = lastV[0] + incL[0];
10957 	sL[7] = lastV[1] + incL[1];
10958 	sL[8] = lastV[2] + incL[2];
10959 	sL[9] = lastV[3] + incL[3];
10960 	sL[10] = lastV[4] + incL[4];
10961 	sL[11] = lastV[5] + incL[5];
10962 
10963 	// multiply by 2 since doing 12 at a time
10964 	incL[0] *= 2;
10965 	incL[1] *= 2;
10966 	incL[2] *= 2;
10967 	incL[3] *= 2;
10968 	incL[4] *= 2;
10969 	incL[5] *= 2;
10970 	incL[6] *= 2;
10971 	incL[7] *= 2;
10972 	incL[8] *= 2;
10973 	incL[9] *= 2;
10974 	incL[10] *= 2;
10975 	incL[11] *= 2;
10976 
10977 	//we aligned this data, so load it up
10978 	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10979 	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
10980 	vecIncl1 = vec_ld( 0, &incL[0] );
10981 	vecIncl2 = vec_ld( 15, &incL[0] );
10982 	vecIncl3 = vec_ld( 31, &incL[0] );
10983 	vecIncl4 = vec_ld( 47, &incL[0] );
10984 
10985 	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
10986 	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
10987 	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
10988 
10989 	vecSL1 = vec_ld( 0, &sL[0] );
10990 	vecSL2 = vec_ld( 15, &sL[0] );
10991 	vecSL3 = vec_ld( 31, &sL[0] );
10992 	vecSL4 = vec_ld( 47, &sL[0] );
10993 
10994 	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
10995 	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
10996 	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
10997 
10998 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10999 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11000 
11001 	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11002 
11003 		//load mix buffer into vectors, assume aligned
11004 		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
11005 		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
11006 		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
11007 
11008 		//load samples into vector
11009 		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11010 		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11011 		vecSamplesLast = vecSamplesLd2;
11012 
11013 		//permute to get them ordered how we want. For the 2nd vector,
11014 		//the order happens to be the same as the order we loaded them
11015 		//in, so there's no need to permute that one
11016 		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11017 		vecSamples2 = vecSamplesLd;
11018 		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11019 
11020 		//do calculation
11021 		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11022 		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11023 		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11024 
11025 		//store out results
11026 		ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
11027 
11028 		// add for next iteration
11029 		vecSL1 = vec_add( vecSL1, vecIncl1 );
11030 		vecSL2 = vec_add( vecSL2, vecIncl2 );
11031 		vecSL3 = vec_add( vecSL3, vecIncl3 );
11032 	}
11033 }
11034 #else
11035 
11036 /*
11037 ============
11038 idSIMD_AltiVec::MixSoundSixSpeakerStereo
11039 
11040 	Assumptions:
11041 		No assumptions
11042 ============
11043 */
MixSoundSixSpeakerStereo(float * mixBuffer,const float * samples,const int numSamples,const float lastV[6],const float currentV[6])11044 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
11045 
11046 	float incL[12];
11047 	float sL[12];
11048 
11049 	int i;
11050 	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
11051 	vector float vecSL1, vecSL2, vecSL3, vecSL4;
11052 	vector float vecSamplesLd;
11053 	vector float vecSamples1, vecSamples2, vecSamples3;
11054 	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
11055 	// permute vectors for sample
11056 	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
11057 	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
11058 
11059 	assert( numSamples == MIXBUFFER_SAMPLES );
11060 	assert( SPEAKER_RIGHT == 1 );
11061 	assert( SPEAKER_BACKRIGHT == 5 );
11062 
11063 	// incL array, 6 elements repeated
11064 	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
11065 	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
11066 	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
11067 	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
11068 	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
11069 	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
11070 
11071 	// sL array repeated
11072 	sL[0] = lastV[0];
11073 	sL[1] = lastV[1];
11074 	sL[2] = lastV[2];
11075 	sL[3] = lastV[3];
11076 	sL[4] = lastV[4];
11077 	sL[5] = lastV[5];
11078 	sL[6] = lastV[0] + incL[0];
11079 	sL[7] = lastV[1] + incL[1];
11080 	sL[8] = lastV[2] + incL[2];
11081 	sL[9] = lastV[3] + incL[3];
11082 	sL[10] = lastV[4] + incL[4];
11083 	sL[11] = lastV[5] + incL[5];
11084 
11085 	// multiply by 2 since doing 12 at a time
11086 	incL[0] *= 2;
11087 	incL[1] *= 2;
11088 	incL[2] *= 2;
11089 	incL[3] *= 2;
11090 	incL[4] *= 2;
11091 	incL[5] *= 2;
11092 	incL[6] *= 2;
11093 	incL[7] *= 2;
11094 	incL[8] *= 2;
11095 	incL[9] *= 2;
11096 	incL[10] *= 2;
11097 	incL[11] *= 2;
11098 
11099 	// load the data
11100 	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
11101 	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
11102 	vecIncl1 = vec_ld( 0, &incL[0] );
11103 	vecIncl2 = vec_ld( 15, &incL[0] );
11104 	vecIncl3 = vec_ld( 31, &incL[0] );
11105 	vecIncl4 = vec_ld( 47, &incL[0] );
11106 
11107 	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
11108 	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
11109 	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
11110 
11111 	vecSL1 = vec_ld( 0, &sL[0] );
11112 	vecSL2 = vec_ld( 15, &sL[0] );
11113 	vecSL3 = vec_ld( 31, &sL[0] );
11114 	vecSL4 = vec_ld( 47, &sL[0] );
11115 
11116 	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
11117 	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
11118 	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
11119 
11120 	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
11121 	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
11122 	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
11123 	vector float vecDest = vec_ld( 0, &mixBuffer[0] );
11124 
11125 	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
11126 
11127 		//load mix buffer into vectors
11128 		vecMixBuffer1 = vecDest;
11129 		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
11130 		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
11131 		vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
11132 
11133 		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
11134 		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
11135 		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
11136 
11137 		//load samples into vector
11138 		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
11139 		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
11140 		vecSamplesLast = vecSamplesLd2;
11141 
11142 		//permute to get them ordered how we want. For the 2nd vector,
11143 		//the order happens to be the same as the order we loaded them
11144 		//in, so there's no need to permute that one
11145 		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
11146 		vecSamples2 = vecSamplesLd;
11147 		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
11148 
11149 		//do calculation
11150 		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
11151 		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
11152 		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
11153 
11154 		// store results
11155 		UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
11156 
11157 		// add for next iteration
11158 		vecSL1 = vec_add( vecSL1, vecIncl1 );
11159 		vecSL2 = vec_add( vecSL2, vecIncl2 );
11160 		vecSL3 = vec_add( vecSL3, vecIncl3 );
11161 	}
11162 }
11163 
11164 #endif
11165 
11166 /*
11167 ============
11168 idSIMD_AltiVec::MixedSoundToSamples
11169 ============
11170 */
MixedSoundToSamples(short * samples,const float * mixBuffer,const int numSamples)11171 void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
11172 	 //this is basically a clamp for sound mixing
11173 	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
11174 	register vector signed int vi0, vi1, vi2, vi3;
11175 	register vector signed short vs0, vs1;
11176 	register vector float minVec, maxVec, constVec;
11177 	int i = 0;
11178 
11179 	//unaligned at start, since samples is not 16-byte aligned
11180 	for ( ;  NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
11181 		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11182 	}
11183 
11184 	constVec = (vector float)(65536.0f);
11185 
11186 	//splat min/max into a vector
11187 	minVec = (vector float)(-32768.0f);
11188 	maxVec = (vector float)(32767.0f);
11189 
11190 	vector float vecOld = vec_ld( 0, &mixBuffer[i] );
11191 	vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
11192 
11193 	//vectorize!
11194 	for ( ; i+15 < numSamples; i += 16 ) {
11195 		//load source
11196 		v0 = vecOld;
11197 		v1 = vec_ld( 15, &mixBuffer[i] );
11198 		v2 = vec_ld( 31, &mixBuffer[i] );
11199 		v3 = vec_ld( 31, &mixBuffer[i] );
11200 		vecOld = vec_ld( 47, &mixBuffer[i] );
11201 
11202 		v0 = vec_perm( v0, v1, permVec );
11203 		v1 = vec_perm( v1, v2, permVec );
11204 		v2 = vec_perm( v2, v3, permVec );
11205 		v3 = vec_perm( v3, vecOld, permVec );
11206 
11207 		//apply minimum
11208 		v4 = vec_max( v0, minVec );
11209 		v5 = vec_max( v1, minVec );
11210 		v6 = vec_max( v2, minVec );
11211 		v7 = vec_max( v3, minVec );
11212 
11213 		//apply maximum
11214 		v4 = vec_min( v4, maxVec );
11215 		v5 = vec_min( v5, maxVec );
11216 		v6 = vec_min( v6, maxVec );
11217 		v7 = vec_min( v7, maxVec );
11218 
11219 		// convert floats to ints
11220 		vi0 = vec_cts( v4, 0 );
11221 		vi1 = vec_cts( v5, 0 );
11222 		vi2 = vec_cts( v6, 0 );
11223 		vi3 = vec_cts( v7, 0 );
11224 
11225 		// pack ints into shorts
11226 		vs0 = vec_pack( vi0, vi1 );
11227 		vs1 = vec_pack( vi2, vi3 );
11228 		ALIGNED_STORE2( &samples[i], vs0, vs1 );
11229 	}
11230 
11231 	//handle cleanup
11232 	for ( ; i < numSamples ; i++ ) {
11233 		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
11234 	}
11235 }
11236 #endif /* ENABLE_SOUND_ROUTINES */
11237 
11238 #endif /* MACOS_X */
11239