1 /*
2 ===========================================================================
3
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
21
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
23
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25
26 ===========================================================================
27 */
28
29 #include <math.h>
30 #include <float.h>
31
32 #include "sys/platform.h"
33
34 #include "idlib/math/Simd_AltiVec.h"
35
36 // Doom3 SIMD Library version 0.5
37 // Patrick Flanagan (pflanagan@apple.com)
38 // Sanjay Patel (spatel@apple.com)
39 // Architecture & Performance Group, Apple Computer
40
41
42 //===============================================================
43 //
44 // AltiVec implementation of idSIMDProcessor
45 //
46 //===============================================================
47
48 #if defined(MACOS_X) && defined(__GNUC__) && defined(__ALTIVEC__)
49
50 #ifdef PPC_INTRINSICS
51 // for square root estimate instruction
52 #include <ppc_intrinsics.h>
53 #endif
54
55 // Data struct sizes
56
57 #ifndef DRAWVERT_PADDED
58 // 60 bytes, 15 floats at 4 bytes each
59 #define DRAWVERT_OFFSET 15
60 #else
61 // 64 bytes, 16 floats
62 #define DRAWVERT_OFFSET 16
63 #endif
64 // 16 bytes each, 4 floats
65 #define PLANE_OFFSET 4
66 // 16 bytes each, 4 floats
67 #define IDVEC4_OFFSET 4
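// These *_OFFSET values are element strides measured in floats
// ( sizeof( struct ) / sizeof( float ) ); the routines below use them to
// step raw float pointers through arrays of these types.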
68
69 // Alignment tests
70 #define IS_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F ) == 0 )
71 #define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F) != 0 )
72
// Aligned stores of float vectors
74 #define ALIGNED_STORE2( ADDR, V0, V1 ) \
75 vec_st( V0, 0, ADDR ); \
76 vec_st( V1, 16, ADDR )
77
78 #define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
79 vec_st( V0, 0, ADDR ); \
80 vec_st( V1, 16, ADDR ); \
81 vec_st( V2, 32, ADDR )
82
83 #define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
84 vec_st( V0, 0, ADDR ); \
85 vec_st( V1, 16, ADDR ); \
86 vec_st( V2, 32, ADDR ); \
87 vec_st( V3, 48, ADDR )
88
89 #define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
90 vec_st( V0, 0, ADDR ); \
91 vec_st( V1, 16, ADDR ); \
92 vec_st( V2, 32, ADDR ); \
93 vec_st( V3, 48, ADDR ); \
94 vec_st( V4, 64, ADDR ); \
95 vec_st( V5, 80, ADDR )
96
97 #define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
98 vec_st( V0, 0, ADDR ); \
99 vec_st( V1, 16, ADDR ); \
100 vec_st( V2, 32, ADDR ); \
101 vec_st( V3, 48, ADDR ); \
102 vec_st( V4, 64, ADDR ); \
103 vec_st( V5, 80, ADDR ); \
104 vec_st( V6, 96, ADDR ); \
105 vec_st( V7, 112, ADDR )
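// Illustrative use of the aligned-store macros, mirroring the loops below:
// once &dst[i] is known to be 16-byte aligned, two result vectors can be
// written back with
//
//   ALIGNED_STORE2( &dst[i], v2, v3 );
//
// which expands to vec_st calls at byte offsets 0 and 16 from the address.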
106
// Unaligned stores of float vectors. These macros assume they may trash the input vectors
108 #define UNALIGNED_STORE1( ADDR, V0 ) { \
109 /* use store element */ \
110 vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
111 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
112 vec_ste( V0, 0, ADDR ); \
113 vec_ste( V0, 4, ADDR ); \
114 vec_ste( V0, 8, ADDR ); \
115 vec_ste( V0, 12, ADDR ); \
116 }
117
118 #define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
119 /* load up the values that are there now */ \
120 vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
121 vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
122 /* generate permute vector and mask */ \
123 vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
124 vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
125 /* right rotate input data */ \
126 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
127 V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
128 /* setup the output vectors */ \
129 vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
130 ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
131 ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
132 ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
133 /* store results */ \
134 vec_st( ULStoreVal1, 0, ADDR ); \
135 vec_st( ULStoreVal2, 15, ADDR ); \
136 vec_st( ULStoreVal3, 31, ADDR ); }
137
138 #define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
139 /* load up the values that are there now */ \
140 vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
141 vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
142 /* generate permute vector and mask */ \
143 vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
144 vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
145 /* right rotate input data */ \
146 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
147 V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
148 V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
149 /* setup the output vectors */ \
150 vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
151 ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
152 ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
153 ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
154 ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
155 /* store results */ \
156 vec_st( ULStoreVal1, 0, ADDR ); \
157 vec_st( ULStoreVal2, 15, ADDR ); \
158 vec_st( ULStoreVal3, 31, ADDR ); \
159 vec_st( ULStoreVal4, 47, ADDR ); }
160
161 #define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
162 /* load up the values that are there now */ \
163 vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
164 vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
165 /* generate permute vector and mask */ \
166 vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
167 vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
168 /* right rotate input data */ \
169 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
170 V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
171 V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
172 V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
173 /* setup the output vectors */ \
174 vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
175 ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
176 ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
177 ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
178 ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
179 ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
180 /* store results */ \
181 vec_st( ULStoreVal1, 0, ADDR ); \
182 vec_st( ULStoreVal2, 15, ADDR ); \
183 vec_st( ULStoreVal3, 31, ADDR ); \
184 vec_st( ULStoreVal4, 47, ADDR ); \
185 vec_st( ULStoreVal5, 63, ADDR ); }
186
187 #define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
188 /* load up the values that are there now */ \
189 vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
190 vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
191 /* generate permute vector and mask */ \
192 vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
193 vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
194 /* right rotate input data */ \
195 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
196 V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
197 V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
198 V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
199 V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
200 V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
201 /* setup the output vectors */ \
202 vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
203 ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
204 ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
205 ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
206 ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
207 ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
208 ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
209 ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
210 /* store results */ \
211 vec_st( ULStoreVal1, 0, ADDR ); \
212 vec_st( ULStoreVal2, 15, ADDR ); \
213 vec_st( ULStoreVal3, 31, ADDR ); \
214 vec_st( ULStoreVal4, 47, ADDR ); \
215 vec_st( ULStoreVal5, 63, ADDR ); \
216 vec_st( ULStoreVal6, 79, ADDR ); \
217 vec_st( ULStoreVal7, 95, ADDR ); }
218
219 #define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
220 /* load up the values that are there now */ \
221 vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
222 vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
223 /* generate permute vector and mask */ \
224 vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
225 vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
226 /* right rotate input data */ \
227 V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
228 V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
229 V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
230 V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
231 V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
232 V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
233 V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
234 V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
235 V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
236 /* setup the output vectors */ \
237 vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
238 vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
239 ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
240 ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
241 ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
242 ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
243 ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
244 ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
245 ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
246 ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
247 ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
248 ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
249 /* store results */ \
250 vec_st( ULStoreVal1, 0, ADDR ); \
251 vec_st( ULStoreVal2, 15, ADDR ); \
252 vec_st( ULStoreVal3, 31, ADDR ); \
253 vec_st( ULStoreVal4, 47, ADDR ); \
254 vec_st( ULStoreVal5, 63, ADDR ); \
255 vec_st( ULStoreVal6, 79, ADDR ); \
256 vec_st( ULStoreVal7, 95, ADDR ); \
257 vec_st( ULStoreVal8, 111, ADDR ); \
258 vec_st( ULStoreVal9, 127, ADDR ); \
259 vec_st( ULStoreVal10, 143, ADDR ); }
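// Illustrative use of the unaligned-store macros (a sketch): apart from
// UNALIGNED_STORE1 (which stores element by element with vec_ste), these
// macros load the vectors already present at the destination, build a rotate
// permute with vec_lvsr plus a select mask, rotate the inputs, then vec_sel
// and store the merged pieces, so the input vector arguments are clobbered
// in the process. For example:
//
//   UNALIGNED_STORE2( &dst[i], v4, v5 );   // &dst[i] need not be 16-byte aligned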
260
261 /*
262 ============
263 idSIMD_AltiVec::GetName
264 ============
265 */
const char *idSIMD_AltiVec::GetName( void ) const {
267 return "AltiVec";
268 }
269
270 /*
271 Helper Functions
272 */
273 #if 0
274 // Prints the values of a vector, useful for debugging but
275 // should never be called in real code
276 inline void debugPrintVector( vector float v, char *msg ) {
277 printf("%s -- %vf\n", msg, v );
278 }
279
280 inline void debugPrintVector( vector unsigned int v, char *msg ) {
281 printf("%s -- %vd\n", msg, v );
282 }
283
284 inline void debugPrintVector( vector bool int v, char *msg ) {
285 printf("%s -- %vi\n", msg, v );
286 }
287
288 inline void debugPrintVector( vector unsigned char v, char *msg ) {
289 printf("%s -- %vuc\n", msg, v );
290 }
291
292 inline void debugPrintVector( vector unsigned short v, char *msg ) {
293 printf("%s -- %vs\n", msg, v );
294 }
295 #endif
296 /*
297 ===============
298 Reciprocal
299
300 For each element in vector:
301 n = 1 / n
302 ===============
303 */
304
305 // Use Newton-Raphson to calculate reciprocal of a vector
inline vector float Reciprocal( vector float v ) {
307 //Get the reciprocal estimate
308 vector float estimate = vec_re( v );
309 //One round of Newton-Raphson refinement
310 return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
311 }
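// The refinement above is one Newton-Raphson step for f(x) = 1/x - v:
// estimate' = estimate + estimate * ( 1 - v * estimate ),
// computed here as vec_madd( vec_nmsub( estimate, v, 1 ), estimate, estimate ).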
312
313 /*
314 ===============
315 ReciprocalSquareRoot
316
317 For each element in vector:
318 n = 1 / sqrt(n)
319 ===============
320 */
321 // Reciprocal square root estimate of a vector
inline vector float ReciprocalSquareRoot( vector float v ) {
323 //Get the square root reciprocal estimate
324 vector float zero = (vector float)(0);
325 vector float oneHalf = (vector float)(0.5);
326 vector float one = (vector float)(1.0);
327 vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
328
329 //One round of Newton-Raphson refinement
330 vector float estimateSquared = vec_madd( estimate, estimate, zero );
331 vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
332 return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
333 }
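// The refinement above is one Newton-Raphson step for the reciprocal square
// root: estimate' = estimate + 0.5 * estimate * ( 1 - v * estimate^2 ).
// Clamping v to at least FLT_MIN keeps vec_rsqrte away from an exact zero.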
334
335
336 /*
337 ===============
338 Divide
339
340 For each element in vectors:
341 n = a / b
342 ===============
343 */
344 // Use reciprocal estimate and multiply to divide a vector
inline vector float Divide( vector float a, vector float b ) {
346 return vec_madd( a, Reciprocal( b ), (vector float)(0) );
347 }
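// Note: the quotient comes from multiplying by the refined reciprocal
// estimate, so results are close to, but not bit-exact with, IEEE
// single-precision division (vec_re is only accurate to roughly 12 bits;
// the one refinement step in Reciprocal roughly doubles that).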
348
349 /*
350 ===============
351 loadSplatUnalignedScalar
352
353 For each element in vector:
354 n = s
355 ===============
356 */
inline vector float loadSplatUnalignedScalar( const float *s ) {
358 vector unsigned char splatMap = vec_lvsl( 0, s );
359 vector float v = vec_ld( 0, s );
360 splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
361 return vec_perm( v, v, splatMap );
362 }
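// How the splat works: vec_lvsl( 0, s ) produces the byte pattern
// { o, o+1, ..., o+15 } where o is the offset of *s within its 16-byte
// block; splatting the first word of that pattern yields a permute map that
// copies bytes o..o+3 into every lane, replicating *s across the vector.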
363
364 /*
365 ===============
366 VectorATan16
367
368 For each element in vector:
369 n = idMath::ATan16( x, y )
370 ===============
371 */
372 // calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
inline vector float VectorATan16( vector float x, vector float y ) {
374
375 vector float xDivY = Divide( x, y );
376 vector float yDivX = Divide( y, x );
377 vector float zeroVector = (vector float)(0);
378
379 vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
380 vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
381 vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
382 vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
383
384 // do calculation for S
385 vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
386 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
387 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
388 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
389 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
390 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
391 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
392 vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
393
394 // get the regular S value
395 vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
396
397 // calculate what to return if y > x
398 vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
399 vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
400 vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
401
402 return vec_sel( modRet, vecS, vecCmp );
403 }
404
405 /*
406 ===============
407 VectorSin16
408
409 For each element in vector:
410 n = idMath::Sin16( v )
411 ===============
412 */
inline vector float VectorSin16( vector float v ) {
414 vector float zero = (vector float)(0);
415
416 #if 0
417 // load up half PI and use it to calculate the rest of the values. This is
418 // sometimes cheaper than loading them from memory
419
420 vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
421 vector float PI = vec_add( halfPI, halfPI );
422 vector float oneandhalfPI = vec_add( PI, halfPI );
423 vector float twoPI = vec_add( oneandhalfPI, halfPI );
424 #else
425 vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
426 vector float PI = (vector float)(3.14159265358979323846f);
427 vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
428 vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
429 #endif
430
431 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
432
433 vector float vecMod;
434 vector float vecResult;
435
// reduce the argument range if need be
437 vecMod = vec_floor( Divide( v, twoPI ) );
438 vecResult = vec_nmsub( vecMod, twoPI, v );
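// at this point vecResult = v - floor( v / TWO_PI ) * TWO_PI,
// i.e. v reduced into the range [0, TWO_PI)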
439
440 vector float vecPIminusA = vec_sub( PI, vecResult );
441 vector float vecAminus2PI = vec_sub( vecResult, twoPI );
442
443 vecCmp1 = vec_cmplt( vecResult, PI );
444 vecCmp2 = vec_cmpgt( vecResult, halfPI );
445
446 // these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
447 vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
448
449 // we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
450 vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
451
// these are the ones where a < PI and a > HALF_PI, so we set a = PI - a
453 vecCmp1 = vec_and( vecCmp1, vecCmp2 );
454 vecCmp1 = vec_or( vecCmp1, vecCmp4 );
455
456 // put the correct values into place
457 vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
458 vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
459
460 // calculate answer
461 vector float vecASquared = vec_madd( vecResult, vecResult, zero );
462 vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
463 vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
464 vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
465 vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
466 vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
467 return vec_madd( vecResult, vecEst, zero );
468 }
469
470 /*
471 ===============
472 vecSplatWithRunTime
473
474 For each element in vector:
475 n = v(i)
476 ===============
477 */
478 // splats an element across a vector using a runtime variable
inline vector float vecSplatWithRunTime( vector float v, int i ) {
480 vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
481 v = vec_perm( v, v, rotate );
482 return vec_splat( v, 0 );
483 }
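// vec_splat needs a compile-time element index, so a runtime index is
// handled by building a rotate permute from vec_lvsl with a byte offset of
// i * sizeof( float ): the permute moves element i into slot 0, and a
// constant vec_splat( v, 0 ) then broadcasts it.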
484
485
486 /*
487 ===============
488 FastScalarInvSqrt
489
490 n = 1 / sqrt( f )
491 ===============
492 */
inline float FastScalarInvSqrt( float f ) {
494 #ifdef PPC_INTRINSICS
495 float estimate;
496 const float kSmallestFloat = FLT_MIN;
497
498 //Calculate a 5 bit starting estimate for the reciprocal sqrt
499 estimate = __frsqrte ( f + kSmallestFloat );
500
// If less precision is acceptable, one of the refinement steps below can be dropped.
// As written this does two rounds of Newton-Raphson.
503 estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
504 estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
505 return estimate;
506 #else
507 return idMath::InvSqrt( f );
508 #endif
509 }
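// The FLT_MIN bias keeps __frsqrte from being handed an exact zero; its
// estimate is only good to about 5 bits, and each Newton-Raphson round
// roughly doubles the number of correct bits.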
510
511 /*
512 ===============
513 FastScalarInvSqrt_x3
514
515 arg1 = 1 / sqrt( arg1 )
516 arg2 = 1 / sqrt( arg2 )
517 arg3 = 1 / sqrt( arg3 )
518 ===============
519 */
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
521 #ifdef PPC_INTRINSICS
522 register float estimate1, estimate2, estimate3;
523 const float kSmallestFloat = FLT_MIN;
524
525 //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
526 estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
527 estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
528 estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
529
530 // two rounds newton-raphson
531 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
532 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
533 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
534 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
535 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
536 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
537
538 *arg1 = estimate1;
539 *arg2 = estimate2;
540 *arg3 = estimate3;
541 #else
542 *arg1 = idMath::InvSqrt( *arg1 );
543 *arg2 = idMath::InvSqrt( *arg2 );
544 *arg3 = idMath::InvSqrt( *arg3 );
545 #endif
546 }
547
548 /*
549 ===============
550 FastScalarInvSqrt_x6
551
552 arg1 = 1 / sqrt( arg1 )
553 arg2 = 1 / sqrt( arg2 )
554 arg3 = 1 / sqrt( arg3 )
555 arg4 = 1 / sqrt( arg4 )
556 arg5 = 1 / sqrt( arg5 )
557 arg6 = 1 / sqrt( arg6 )
558
On a G5 there are two FPUs, each with a six-stage pipeline, so six independent estimates help keep both pipelines filled.
560 ===============
561 */
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
563 #ifdef PPC_INTRINSICS
564 register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
565 const float kSmallestFloat = FLT_MIN;
566
567 //Calculate a 5 bit starting estimate for the reciprocal sqrt of each
568 estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
569 estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
570 estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
571 estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
572 estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
573 estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
574
575 // two rounds newton-raphson
576 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
577 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
578 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
579 estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
580 estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
581 estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
582
583 estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
584 estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
585 estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
586 estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
587 estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
588 estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
589
590 *arg1 = estimate1;
591 *arg2 = estimate2;
592 *arg3 = estimate3;
593 *arg4 = estimate4;
594 *arg5 = estimate5;
595 *arg6 = estimate6;
596 #else
597 *arg1 = idMath::InvSqrt( *arg1 );
598 *arg2 = idMath::InvSqrt( *arg2 );
599 *arg3 = idMath::InvSqrt( *arg3 );
600 *arg4 = idMath::InvSqrt( *arg4 );
601 *arg5 = idMath::InvSqrt( *arg5 );
602 *arg6 = idMath::InvSqrt( *arg6 );
603 #endif
604 }
605
606
607 // End Helper Functions
608
609 #ifdef ENABLE_SIMPLE_MATH
610
611 /*
612 ============
613 idSIMD_AltiVec::Add
614
615 dst[i] = constant + src[i];
616 ============
617 */
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
619 vector float v0, v1, v2, v3;
620 vector float v0_low, v0_hi, v1_hi;
621 vector unsigned char permVec;
622 vector float constVec;
623 int i;
624
625 // handle unaligned cases at beginning
626 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
627 dst[i] = constant + src[i];
628 }
629
630 //splat constant into a vector
631 constVec = loadSplatUnalignedScalar( &constant );
632
633 //calculate permute and do first load
634 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
635 v1_hi = vec_ld( 0, &src[i] );
636
637 //vectorize!
638 for ( ; i+7 < count; i += 8 ) {
639 //load source
640 v0_low = v1_hi;
641 v0_hi = vec_ld( 15, &src[i] );
642 v1_hi = vec_ld( 31, &src[i] );
643
644 v0 = vec_perm( v0_low, v0_hi, permVec );
645 v1 = vec_perm( v0_hi, v1_hi, permVec );
646
647 v2 = vec_add( v0, constVec );
648 v3 = vec_add( v1, constVec );
649
650 // store results
651 ALIGNED_STORE2( &dst[i], v2, v3 );
652 }
653
654 //handle cleanup
655 for ( ; i < count ; i++ ) {
656 dst[i] = constant + src[i];
657 }
658 }
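// Usage sketch (hypothetical call site -- in the game these routines are
// normally reached through the global SIMD processor pointer rather than
// called directly):
//
//   SIMDProcessor->Add( dst, 1.0f, src, count );   // dst[i] = 1.0f + src[i]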
659
660 /*
661 ============
662 idSIMD_AltiVec::Add
663
664 dst[i] = src0[i] + src1[i];
665 ============
666 */
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
668
669 register vector float v0, v1, v2, v3, v4, v5;
670 //src0
671 register vector float v0_low, v0_hi, v2_low, v2_hi;
672 //src1
673 register vector float v1_low, v1_hi, v3_low, v3_hi;
674 //permute vectors
675 register vector unsigned char permVec1, permVec2;
676 vector unsigned char oneCharVector = (vector unsigned char)(1);
677
678 int i;
679
680 //unaligned at start
681 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
682 dst[i] = src0[i] + src1[i];
683 }
684
685 //calculate permute and do loads
686 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
687 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
688 v2_hi = vec_ld( 0, &src0[i] );
689 v3_hi = vec_ld( 0, &src1[i] );
690
691 //vectorize!
692 for ( ; i+7 < count; i += 8 ) {
693 //load source
694 v0_low = v2_hi;
695 v0_hi = vec_ld( 15, &src0[i] );
696 v2_low = v0_hi;
697 v2_hi = vec_ld( 31, &src0[i] );
698
699 v1_low = v3_hi;
700 v1_hi = vec_ld( 15, &src1[i] );
701 v3_low = v1_hi;
702 v3_hi = vec_ld( 31, &src1[i] );
703
704 v0 = vec_perm( v0_low, v0_hi, permVec1 );
705 v1 = vec_perm( v1_low, v1_hi, permVec2 );
706 v2 = vec_perm( v2_low, v2_hi, permVec1 );
707 v3 = vec_perm( v3_low, v3_hi, permVec2 );
708
709 v4 = vec_add( v0, v1 );
710 v5 = vec_add( v2, v3 );
711
712 ALIGNED_STORE2( &dst[i], v4, v5 );
713
714 }
715
716 //handle cleanup
717 for ( ; i < count ; i++ ) {
718 dst[i] = src0[i] + src1[i];
719 }
720 }
721
722 /*
723 ============
724 idSIMD_AltiVec::Sub
725
726 dst[i] = constant - src[i];
727 ============
728 */
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
730
731 register vector float v0, v1, v2, v3;
732 register vector float v0_low, v0_hi, v1_low, v1_hi;
733 register vector unsigned char permVec;
734 register vector float constVec;
735 vector unsigned char oneCharVector = (vector unsigned char)(1);
736 int i;
737
738 //handle unaligned at start
739 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
740 dst[i] = constant - src[i];
741 }
742
743 //splat constant into a vector
744 constVec = loadSplatUnalignedScalar( &constant );
745
746 //calculate permute vector and do first load
747 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
748 v1_hi = vec_ld( 0, &src[i] );
749
750 //vectorize!
751 for ( ; i+7 < count; i += 8 ) {
752 //load source
753 v0_low = v1_hi;
754 v0_hi = vec_ld( 15, &src[i] );
755 v1_low = v0_hi;
756 v1_hi = vec_ld( 31, &src[i] );
757
758 v0 = vec_perm( v0_low, v0_hi, permVec );
759 v1 = vec_perm( v1_low, v1_hi, permVec );
760
761 v2 = vec_sub( constVec, v0 );
762 v3 = vec_sub( constVec, v1 );
763
764 ALIGNED_STORE2( &dst[i], v2, v3 );
765 }
766
767 //handle cleanup
768 for ( ; i < count ; i++ ) {
769 dst[i] = constant - src[i];
770 }
771 }
772
773 /*
774 ============
775 idSIMD_AltiVec::Sub
776
777 dst[i] = src0[i] - src1[i];
778 ============
779 */
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
781 register vector float v0, v1, v2, v3, v4, v5;
782 //src0
783 register vector float v0_low, v0_hi, v2_low, v2_hi;
784 //src1
785 register vector float v1_low, v1_hi, v3_low, v3_hi;
786 register vector unsigned char permVec1, permVec2;
787 vector unsigned char oneCharVector = (vector unsigned char)(1);
788 int i;
789
790 //handle unaligned at start
791 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
792 dst[i] = src0[i] - src1[i];
793 }
794
795 //calculate permute and do first loads
796 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
797 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
798 v2_hi = vec_ld( 0, &src0[i] );
799 v3_hi = vec_ld( 0, &src1[i] );
800
801 //vectorize!
802 for ( ; i+7 < count; i += 8 ) {
803 //load source
804 v0_low = v2_hi;
805 v0_hi = vec_ld( 15, &src0[i] );
806 v2_low = v0_hi;
807 v2_hi = vec_ld( 31, &src0[i] );
808
809 v1_low = v3_hi;
810 v1_hi = vec_ld( 15, &src1[i] );
811 v3_low = v1_hi;
812 v3_hi = vec_ld( 31, &src1[i] );
813
814 v0 = vec_perm( v0_low, v0_hi, permVec1 );
815 v1 = vec_perm( v1_low, v1_hi, permVec2 );
816 v2 = vec_perm( v2_low, v2_hi, permVec1 );
817 v3 = vec_perm( v3_low, v3_hi, permVec2 );
818
819 v4 = vec_sub( v0, v1 );
820 v5 = vec_sub( v2, v3 );
821
822 ALIGNED_STORE2( &dst[i], v4, v5 );
823 }
824
825 //handle cleanup
826 for ( ; i < count ; i++ ) {
827 dst[i] = src0[i] - src1[i];
828 }
829 }
830
831 /*
832 ============
833 idSIMD_AltiVec::Mul
834
835 dst[i] = constant * src[i];
836 ============
837 */
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
839 register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
840 register vector float constVec;
841 register vector unsigned char permVec;
842 vector unsigned char oneCharVector = (vector unsigned char)(1);
843 register vector float zeroVector = (vector float)(0.0);
844 int i;
845
846 // handle unaligned data at start
847 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
848 dst[i] = constant * src[i];
849 }
850
851 //splat constant into a vector
852 constVec = loadSplatUnalignedScalar( &constant );
853
854 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
855 v1_hi = vec_ld( 0, &src[i] );
856
857 //vectorize!
858 for ( ; i+7 < count; i += 8 ) {
859 //load source
860 v0_low = v1_hi;
861 v0_hi = vec_ld( 15, &src[i] );
862 v1_low = v0_hi;
863 v1_hi = vec_ld( 31, &src[i] );
864
865 v0 = vec_perm( v0_low, v0_hi, permVec );
866 v1 = vec_perm( v1_low, v1_hi, permVec );
867
868 v2 = vec_madd( constVec, v0, zeroVector );
869 v3 = vec_madd( constVec, v1, zeroVector );
870
871 ALIGNED_STORE2( &dst[i], v2, v3 );
872 }
873
874 //handle cleanup
875 for ( ; i < count ; i++ ) {
876 dst[i] = constant * src[i];
877 }
878 }
879
880 /*
881 ============
882 idSIMD_AltiVec::Mul
883
884 dst[i] = src0[i] * src1[i];
885 ============
886 */
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
888 register vector float v0, v1, v2, v3, v4, v5;
889 //src0
890 register vector float v0_low, v0_hi, v2_low, v2_hi;
891 //src1
892 register vector float v1_low, v1_hi, v3_low, v3_hi;
893 //permute vectors
894 register vector unsigned char permVec1, permVec2;
895 register vector float constVec = (vector float)(0.0);
896 vector unsigned char oneCharVector = (vector unsigned char)(1);
897 int i;
898
899 //handle unaligned at start
900 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
901 dst[i] = src0[i] * src1[i];
902 }
903
904 //calculate permute and do loads
905 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
906 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
907 v2_hi = vec_ld( 0, &src0[i] );
908 v3_hi = vec_ld( 0, &src1[i] );
909
910 //vectorize!
911 for ( ; i+7 < count; i += 8 ) {
912 //load source
913 v0_low = v2_hi;
914 v0_hi = vec_ld( 15, &src0[i] );
915 v2_low = v0_hi;
916 v2_hi = vec_ld( 31, &src0[i] );
917
918 v1_low = v3_hi;
919 v1_hi = vec_ld( 15, &src1[i] );
920 v3_low = v1_hi;
921 v3_hi = vec_ld( 31, &src1[i] );
922
923 v0 = vec_perm( v0_low, v0_hi, permVec1 );
924 v1 = vec_perm( v1_low, v1_hi, permVec2 );
925 v2 = vec_perm( v2_low, v2_hi, permVec1 );
926 v3 = vec_perm( v3_low, v3_hi, permVec2 );
927
//AltiVec has no plain float multiply, so we use a
//multiply-add with a zero addend
930 v4 = vec_madd( v0, v1, constVec );
931 v5 = vec_madd( v2, v3, constVec );
932
933 ALIGNED_STORE2( &dst[i], v4, v5 );
934 }
935
936 //handle cleanup
937 for ( ; i < count ; i++ ) {
938 dst[i] = src0[i] * src1[i];
939 }
940 }
941
942 /*
943 ============
944 idSIMD_AltiVec::Div
945
946 dst[i] = constant / divisor[i];
947 ============
948 */
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
950 register vector float v0, v1, v2, v3;
951 register vector float v0_low, v0_hi, v1_low, v1_hi;
952 register vector unsigned char permVec;
953 register vector float constVec;
954 vector unsigned char oneCharVector = (vector unsigned char)(1);
955 int i;
956
957 //handle unaligned at start
958 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
959 dst[i] = constant / divisor[i];
960 }
961
962 //splat constant into a vector
963 constVec = loadSplatUnalignedScalar( &constant );
964
965 //calculate permute and do first loads
966 permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
967 v1_hi = vec_ld( 0, &divisor[i] );
968
969 //vectorize!
970 for ( ; i+7 < count; i += 8 ) {
971 //load source
972 v0_low = v1_hi;
973 v0_hi = vec_ld( 15, &divisor[i] );
974 v1_low = v0_hi;
975 v1_hi = vec_ld( 31, &divisor[i] );
976
977 v0 = vec_perm( v0_low, v0_hi, permVec );
978 v1 = vec_perm( v1_low, v1_hi, permVec );
979
980 v2 = Divide( constVec, v0 );
981 v3 = Divide( constVec, v1 );
982
983 ALIGNED_STORE2( &dst[i], v2, v3 );
984 }
985
986 //handle cleanup
987 for ( ; i < count ; i++ ) {
988 dst[i] = constant / divisor[i];
989 }
990 }
991
992 /*
993 ============
994 idSIMD_AltiVec::Div
995
996 dst[i] = src0[i] / src1[i];
997 ============
998 */
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
1000 register vector float v0, v1, v2, v3, v4, v5;
1001 //src0
1002 register vector float v0_low, v0_hi, v2_low, v2_hi;
1003 //src1
1004 register vector float v1_low, v1_hi, v3_low, v3_hi;
1005 //permute vectors
1006 register vector unsigned char permVec1, permVec2;
1007 vector unsigned char oneCharVector = (vector unsigned char)(1);
1008 int i;
1009
1010 //handle unaligned at start
1011 for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1012 dst[i] = src0[i] / src1[i];
1013 }
1014
1015 //calculate permute and do loads
1016 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1017 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1018 v2_hi = vec_ld( 0, &src0[i] );
1019 v3_hi = vec_ld( 0, &src1[i] );
1020
1021 //vectorize!
1022 for ( ; i+7 < count; i += 8 ) {
1023 //load source
1024 v0_low = v2_hi;
1025 v0_hi = vec_ld( 15, &src0[i] );
1026 v2_low = v0_hi;
1027 v2_hi = vec_ld( 31, &src0[i] );
1028
1029 v1_low = v3_hi;
1030 v1_hi = vec_ld( 15, &src1[i] );
1031 v3_low = v1_hi;
1032 v3_hi = vec_ld( 31, &src1[i] );
1033
1034 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1035 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1036 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1037 v3 = vec_perm( v3_low, v3_hi, permVec2 );
1038
1039 v4 = Divide( v0, v1 );
1040 v5 = Divide( v2, v3 );
1041
1042 ALIGNED_STORE2( &dst[i], v4, v5 );
1043 }
1044
1045 //handle cleanup
1046 for ( ; i < count ; i++ ) {
1047 dst[i] = src0[i] / src1[i];
1048 }
1049 }
1050
1051 /*
1052 ============
1053 idSIMD_AltiVec::MulAdd
1054
1055 dst[i] += constant * src[i];
1056 ============
1057 */
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
1059
1060 register vector float v0, v1, v2, v3, v4, v5;
1061 register vector float constVec;
1062 //src
1063 register vector float v0_low, v0_hi, v2_low, v2_hi;
1064 //permute vectors
1065 register vector unsigned char permVec1;
1066 vector unsigned char oneCharVector = (vector unsigned char)(1);
1067 int i;
1068
1069 //handle unaligned at start
1070 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1071 dst[i] += constant * src[i];
1072 }
1073
1074 //splat constant into a vector
1075 constVec = loadSplatUnalignedScalar( &constant );
1076
1077 //calculate permute and do loads
1078 permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1079 v2_hi = vec_ld( 0, &src[i] );
1080
1081 //vectorize!
1082 for ( ; i+7 < count; i += 8 ) {
1083 v0_low = v2_hi;
1084 v0_hi = vec_ld( 15, &src[i] );
1085 v2_low = v0_hi;
1086 v2_hi = vec_ld( 31, &src[i] );
1087
1088 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1089 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1090
1091 // at this point, dst is known to be aligned
1092 v1 = vec_ld( 0, &dst[i] );
1093 v3 = vec_ld( 16, &dst[i] );
1094
1095 v4 = vec_madd( constVec, v0, v1 );
1096 v5 = vec_madd( constVec, v2, v3 );
1097
1098 ALIGNED_STORE2( &dst[i], v4, v5 );
1099 }
1100
1101 //handle cleanup
1102 for ( ; i < count ; i++ ) {
1103 dst[i] += constant * src[i];
1104 }
1105 }
1106
1107 /*
1108 ============
1109 idSIMD_AltiVec::MulAdd
1110
1111 dst[i] += src0[i] * src1[i];
1112 ============
1113 */
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
1115 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1116 //src0
1117 register vector float v0_low, v0_hi, v2_low, v2_hi;
1118 //src1
1119 register vector float v1_low, v1_hi, v3_low, v3_hi;
1120 //permute vectors
1121 register vector unsigned char permVec1, permVec2;
1122 vector unsigned char oneCharVector = (vector unsigned char)(1);
1123
1124 int i;
1125
1126 //unaligned at start
1127 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1128 dst[i] += src0[i] * src1[i];
1129 }
1130
1131 //calculate permute and do loads
1132 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1133 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1134 v2_hi = vec_ld( 0, &src0[i] );
1135 v3_hi = vec_ld( 0, &src1[i] );
1136
1137 //vectorize!
1138 for ( ; i+7 < count; i += 8 ) {
1139 // load sources
1140 v0_low = v2_hi;
1141 v0_hi = vec_ld( 15, &src0[i] );
1142 v2_low = v0_hi;
1143 v2_hi = vec_ld( 31, &src0[i] );
1144
1145 v1_low = v3_hi;
1146 v1_hi = vec_ld( 15, &src1[i] );
1147 v3_low = v1_hi;
1148 v3_hi = vec_ld( 31, &src1[i] );
1149
1150 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1151 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1152 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1153 v3 = vec_perm( v3_low, v3_hi, permVec2 );
1154
1155 //we know dst is aligned because we handled unaligned cases
1156 //up front
1157 v4 = vec_ld( 0, &dst[i] );
1158 v5 = vec_ld( 16, &dst[i] );
1159
1160 v6 = vec_madd( v0, v1, v4 );
1161 v7 = vec_madd( v2, v3, v5 );
1162
1163 ALIGNED_STORE2( &dst[i], v6, v7 );
1164 }
1165
1166 //handle cleanup
1167 for ( ; i < count ; i++ ) {
1168 dst[i] += src0[i] * src1[i];
1169 }
1170 }
1171
1172 /*
1173 ============
1174 idSIMD_AltiVec::MulSub
1175
1176 dst[i] -= constant * src[i];
1177 ============
1178 */
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
1180 register vector float v0, v1, v2, v3, v4, v5;
1181 register vector float constVec;
1182 //src
1183 register vector float v0_low, v0_hi, v2_low, v2_hi;
1184 //permute vectors
1185 register vector unsigned char permVec1;
1186 vector unsigned char oneCharVector = (vector unsigned char)(1);
1187 int i;
1188
1189 //handle unaligned at start
1190 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1191 dst[i] -= constant * src[i];
1192 }
1193
1194 //splat constant into a vector
1195 constVec = loadSplatUnalignedScalar( &constant );
1196
1197 //calculate permute and do loads
1198 permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
1199 v2_hi = vec_ld( 0, &src[i] );
1200
1201 //vectorize!
1202 for ( ; i+7 < count; i += 8 ) {
1203 v0_low = v2_hi;
1204 v0_hi = vec_ld( 15, &src[i] );
1205 v2_low = v0_hi;
1206 v2_hi = vec_ld( 31, &src[i] );
1207
1208 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1209 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1210
//we know dst will be aligned here because we already handled the preceding
//unaligned cases
1213 v1 = vec_ld( 0, &dst[i] );
1214 v3 = vec_ld( 16, &dst[i] );
1215
1216 v4 = vec_nmsub( v0, constVec, v1 );
1217 v5 = vec_nmsub( v2, constVec, v3 );
1218
1219 ALIGNED_STORE2( &dst[i], v4, v5 );
1220 }
1221
1222 //handle cleanup
1223 for ( ; i < count ; i++ ) {
1224 dst[i] -= constant * src[i];
1225 }
1226 }
1227
1228 /*
1229 ============
1230 idSIMD_AltiVec::MulSub
1231
1232 dst[i] -= src0[i] * src1[i];
1233 ============
1234 */
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
1236 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1237 //src0
1238 register vector float v0_low, v0_hi, v2_low, v2_hi;
1239 //src1
1240 register vector float v1_low, v1_hi, v3_low, v3_hi;
1241 //permute vectors
1242 register vector unsigned char permVec1, permVec2;
1243 vector unsigned char oneCharVector = (vector unsigned char)(1);
1244 int i;
1245
1246 //unaligned at start
1247 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
1248 dst[i] -= src0[i] * src1[i];
1249 }
1250
1251 //calculate permute and do loads
1252 permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
1253 permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
1254 v2_hi = vec_ld( 0, &src0[i] );
1255 v3_hi = vec_ld( 0, &src1[i] );
1256
1257
1258 //vectorize!
1259 for ( ; i+7 < count; i += 8 ) {
1260 // load sources
1261 v0_low = v2_hi;
1262 v0_hi = vec_ld( 15, &src0[i] );
1263 v2_low = v0_hi;
1264 v2_hi = vec_ld( 31, &src0[i] );
1265
1266 v1_low = v3_hi;
1267 v1_hi = vec_ld( 15, &src1[i] );
1268 v3_low = v1_hi;
1269 v3_hi = vec_ld( 31, &src1[i] );
1270
1271 v0 = vec_perm( v0_low, v0_hi, permVec1 );
1272 v1 = vec_perm( v1_low, v1_hi, permVec2 );
1273 v2 = vec_perm( v2_low, v2_hi, permVec1 );
1274 v3 = vec_perm( v3_low, v3_hi, permVec2 );
1275
1276 //we know dst is aligned because we handled unaligned cases
1277 //up front
1278 v4 = vec_ld( 0, &dst[i] );
1279 v5 = vec_ld( 16, &dst[i] );
1280
1281 v6 = vec_nmsub( v0, v1, v4 );
1282 v7 = vec_nmsub( v2, v3, v5 );
1283
1284 ALIGNED_STORE2( &dst[i], v6, v7 );
1285 }
1286
1287 //handle cleanup
1288 for ( ; i < count ; i++ ) {
1289 dst[i] -= src0[i] * src1[i];
1290 }
1291 }
1292
1293 #endif /* ENABLE_SIMPLE_MATH */
1294
1295 #ifdef ENABLE_DOT
1296 /*
1297 ============
1298 idSIMD_AltiVec::Dot
1299
1300 dst[i] = constant * src[i];
1301 ============
1302 */
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
1304
1305 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1306 register vector float vecX, vecY, vecZ;
1307 vector float vecX2, vecY2, vecZ2;
1308 const float *addr = src[0].ToFloatPtr();
1309 float tempVal[4];
1310 float constVal[4];
1311 register vector float zeroVector = (vector float)(0.0);
1312 register vector float vecConstX, vecConstY, vecConstZ;
1313
1314 // permute vectors
1315 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1316 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1317
1318 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1319 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1320
1321 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1322 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
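// Eight idVec3s span 24 floats, i.e. six vectors per loop iteration. Each
// permX/permY/permZ pair gathers one component from four consecutive
// vectors: the first permute pulls what it can from two loaded vectors
// (leaving junk in the trailing lanes, as noted above), and the second
// permute fills those lanes from the third load.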
1323
1324 int i;
1325
1326 // for scalar cleanup, if necessary
1327 constVal[0] = constant[0];
1328 constVal[1] = constant[1];
1329 constVal[2] = constant[2];
1330 constVal[3] = 0;
1331
1332 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1333 vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1334 vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
1335 vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1336
1337
1338 // populate const vectors
1339 vecConstX = vec_splat( vecLd1, 0 );
1340 vecConstY = vec_splat( vecLd1, 1 );
1341 vecConstZ = vec_splat( vecLd1, 2 );
1342
1343 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1344 vector float vecOld = vec_ld( 0, addr );
1345
1346 // handle unaligned case at beginning
1347 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1348 dst[i] = constant * src[i];
1349 }
1350
1351 for ( ; i + 7 < count; i += 8 ) {
1352 float *vecPtr = (float*)( addr + (i*3) );
1353 vector float v0, v1, v2, v3, v4, v5;
1354
1355 v0 = vecOld; //vec_ld( 0, vecPtr );
1356 v1 = vec_ld( 15, vecPtr );
1357 v2 = vec_ld( 31, vecPtr );
1358 v3 = vec_ld( 47, vecPtr );
1359 v4 = vec_ld( 63, vecPtr );
1360 v5 = vec_ld( 79, vecPtr );
1361 vecOld = vec_ld( 95, vecPtr );
1362
1363 vecLd1 = vec_perm( v0, v1, permVec );
1364 vecLd2 = vec_perm( v1, v2, permVec );
1365 vecLd3 = vec_perm( v2, v3, permVec );
1366
1367 vecLd4 = vec_perm( v3, v4, permVec );
1368 vecLd5 = vec_perm( v4, v5, permVec );
1369 vecLd6 = vec_perm( v5, vecOld, permVec );
1370
1371 // permute into X Y Z vectors
1372 vecX = vec_perm( vecLd1, vecLd2, permX1 );
1373 vecY = vec_perm( vecLd1, vecLd2, permY1 );
1374 vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1375 vecX = vec_perm( vecX, vecLd3, permX2 );
1376 vecY = vec_perm( vecY, vecLd3, permY2 );
1377 vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1378
1379 vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1380 vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1381 vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1382 vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1383 vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1384 vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1385
1386 // do multiply
1387 vecX = vec_madd( vecX, vecConstX, zeroVector );
1388 vecY = vec_madd( vecY, vecConstY, vecX );
1389 vecZ = vec_madd( vecZ, vecConstZ, vecY );
1390
1391 vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1392 vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1393 vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1394
1395 // store out results
1396 ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1397 }
1398
1399 //cleanup
1400 for ( ; i < count; i++ ) {
// look up what's at the address we want, cast it as a float pointer, then
// dereference that pointer
1403 tempVal[0] = *( addr + (i*3) + 0 );
1404 tempVal[1] = *( addr + (i*3) + 1 );
1405 tempVal[2] = *( addr + (i*3) + 2 );
1406 dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
1407 }
1408 }
1409
1410
1411 /*
1412 ============
1413 idSIMD_AltiVec::Dot
1414
1415 dst[i] = constant * src[i].Normal() + src[i][3];
1416 ============
1417 */
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
1419 //#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
1420
1421 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1422
1423 int i;
1424 float constVal[4];
1425 float srcVal[3];
1426 float srcI3;
1427 float tempVal;
1428
1429 vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1430 vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1431 vector float vecX, vecY, vecZ, vecI3;
1432 vector float vecX2, vecY2, vecZ2, vecI32;
1433 vector float vecConstX, vecConstY, vecConstZ;
1434
1435 constVal[0] = constant[0];
1436 constVal[1] = constant[1];
1437 constVal[2] = constant[2];
1438 constVal[3] = 1;
1439
1440 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1441 vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1442 vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
1443 vector float vecConst = vec_perm( v0, v1, constPerm );
1444
1445 vecConstX = vec_splat( vecConst, 0 );
1446 vecConstY = vec_splat( vecConst, 1 );
1447 vecConstZ = vec_splat( vecConst, 2 );
1448
1449 // handle unaligned case at beginning
1450 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1451 dst[i] = constant * src[i].Normal() + src[i][3];
1452 }
1453
1454 const float *addr = src[i].ToFloatPtr();
1455 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1456 vector float vecOld = vec_ld( 0, addr );
1457
1458 for ( ; i + 7 < count; i += 8 ) {
1459 float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
1460 vector float v0, v1, v2, v3, v4, v5, v6, v7;
1461
1462 v0 = vecOld; //vec_ld( 0, planePtr );
1463 v1 = vec_ld( 15, planePtr );
1464 v2 = vec_ld( 31, planePtr );
1465 v3 = vec_ld( 47, planePtr );
1466 v4 = vec_ld( 63, planePtr );
1467 v5 = vec_ld( 79, planePtr );
1468 v6 = vec_ld( 95, planePtr );
1469 v7 = vec_ld( 111, planePtr );
1470 vecOld = vec_ld( 127, planePtr );
1471
1472 vecPlaneLd1 = vec_perm( v0, v1, permVec );
1473 vecPlaneLd2 = vec_perm( v1, v2, permVec );
1474 vecPlaneLd3 = vec_perm( v2, v3, permVec );
1475 vecPlaneLd4 = vec_perm( v3, v4, permVec );
1476
1477 vecPlaneLd5 = vec_perm( v4, v5, permVec );
1478 vecPlaneLd6 = vec_perm( v5, v6, permVec );
1479 vecPlaneLd7 = vec_perm( v6, v7, permVec );
1480 vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1481
// permute into X Y Z vectors; since this is square it's basically
// a matrix transpose
1484 v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1485 v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1486 v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1487 v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1488
1489 vecX = vec_mergeh( v0, v1 );
1490 vecY = vec_mergel( v0, v1 );
1491 vecZ = vec_mergeh( v2, v3 );
1492 vecI3 = vec_mergel( v2, v3 );
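// after the two merge stages: vecX = { x0 x1 x2 x3 }, vecY = { y0 y1 y2 y3 },
// vecZ = { z0 z1 z2 z3 }, and vecI3 holds the four plane distance terms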
1493
1494 v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1495 v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1496 v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1497 v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1498
1499 vecX2 = vec_mergeh( v4, v5 );
1500 vecY2 = vec_mergel( v4, v5 );
1501 vecZ2 = vec_mergeh( v6, v7 );
1502 vecI32 = vec_mergel( v6, v7 );
1503
1504 // do calculation
1505 v6 = vec_madd( vecZ, vecConstZ, vecI3 );
1506 v5 = vec_madd( vecY, vecConstY, v6 );
1507 v4 = vec_madd( vecX, vecConstX, v5 );
1508
1509 v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
1510 v1 = vec_madd( vecY2, vecConstY, v0 );
1511 v2 = vec_madd( vecX2, vecConstX, v1 );
1512
1513 // store results
1514 ALIGNED_STORE2( &dst[i], v4, v2 );
1515 }
1516
1517 // cleanup
1518 for ( ; i < count; i++ ) {
1519 // populate srcVal with src X Y Z
1520 srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
1521 srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
1522 srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
1523
1524 // put src[i][3] into srcI3
1525 srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
1526
1527 tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
1528 dst[i] = tempVal + srcI3;
1529 }
1530 }
1531
1532 #ifndef DRAWVERT_PADDED
1533 /*
1534 ============
1535 idSIMD_AltiVec::Dot
1536
1537 dst[i] = constant * src[i].xyz;
1538 ============
1539 */
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1541 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1542
1543 // idDrawVert size is 60 bytes
1544 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1545
1546 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1547 int i;
1548 register vector float vecConstX, vecConstY, vecConstZ;
1549 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1550 register vector float zeroVector = (vector float)(0.0);
1551 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1552
1553 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1554 v0 = vec_ld( 0, constant.ToFloatPtr() );
1555 v1 = vec_ld( 11, constant.ToFloatPtr() );
1556 v0 = vec_perm( v0, v1, constPerm );
1557
1558 // permute into constant vectors
1559 vecConstX = vec_splat( v0, 0 );
1560 vecConstY = vec_splat( v0, 1 );
1561 vecConstZ = vec_splat( v0, 2 );
1562
1563 // handle unaligned case at beginning
1564 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1565 dst[i] = constant * src[i].xyz;
1566 }
1567
// every fourth vert has the same alignment, so the four permute vectors are computed once up front; make sure enough elements remain to use them
1569 if ( i+3 < count ) {
1570 vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1571 vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1572 vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1573 vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1574 }
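// sizeof( idDrawVert ) is 60 bytes in this configuration, which is not a
// multiple of 16, so consecutive verts sit at different offsets within their
// 16-byte blocks. Verts four apart share an offset ( 4 * 60 = 240 is a
// multiple of 16 ), which is why the four permute vectors above only need to
// be computed once.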
1575
1576 for ( ; i+3 < count; i += 4 ) {
1577 const float *vertPtr = src[i].xyz.ToFloatPtr();
1578 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1579 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1580 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1581
1582 v0 = vec_ld( 0, vertPtr );
1583 v1 = vec_ld( 11, vertPtr );
1584 v2 = vec_ld( 0, vertPtr2 );
1585 v3 = vec_ld( 11, vertPtr2 );
1586 v4 = vec_ld( 0, vertPtr3 );
1587 v5 = vec_ld( 11, vertPtr3 );
1588 v6 = vec_ld( 0, vertPtr4 );
1589 v7 = vec_ld( 11, vertPtr4 );
1590
1591 v0 = vec_perm( v0, v1, vertPerm1 );
1592 v2 = vec_perm( v2, v3, vertPerm2 );
1593 v4 = vec_perm( v4, v5, vertPerm3 );
1594 v6 = vec_perm( v6, v7, vertPerm4 );
1595
1596 // transpose into X Y Z vectors
1597 v1 = vec_mergeh( v0, v4 );
1598 v3 = vec_mergeh( v2, v6 );
1599 v5 = vec_mergel( v0, v4 );
1600 v7 = vec_mergel( v2, v6 );
1601
1602 vecSrcX1 = vec_mergeh( v1, v3 );
1603 vecSrcY1 = vec_mergel( v1, v3 );
1604 vecSrcZ1 = vec_mergeh( v5, v7 );
1605
1606 // now calculate dot product
1607 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1608 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1609 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
1610
1611 // store results
1612 vec_st( vecSrcZ1, 0, &dst[i] );
1613 }
1614
1615 for ( ; i < count; i++ ) {
1616 dst[i] = constant * src[i].xyz;
1617 }
1618 }
1619 #else
1620 /*
1621 ============
1622 idSIMD_AltiVec::Dot
1623
1624 dst[i] = constant * src[i].xyz;
1625 ============
1626 */
1627 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
1628 //#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
1629
1630 // idDrawVert size is 64 bytes
1631 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1632
1633 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1634 int i;
1635 register vector float vecConstX, vecConstY, vecConstZ;
1636 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1637 register vector float zeroVector = (vector float)(0.0);
1638 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1639
1640 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1641 v0 = vec_ld( 0, constant.ToFloatPtr() );
1642 v1 = vec_ld( 11, constant.ToFloatPtr() );
1643 v0 = vec_perm( v0, v1, constPerm );
1644
1645 // permute into constant vectors
1646 vecConstX = vec_splat( v0, 0 );
1647 vecConstY = vec_splat( v0, 1 );
1648 vecConstZ = vec_splat( v0, 2 );
1649
1650 // handle unaligned case at beginning
1651 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1652 dst[i] = constant * src[i].xyz;
1653 }
1654
1655 for ( ; i+3 < count; i += 4 ) {
1656 const float *vertPtr = src[i].xyz.ToFloatPtr();
1657 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1658 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1659 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1660
1661 v0 = vec_ld( 0, vertPtr );
1662 v2 = vec_ld( 0, vertPtr2 );
1663 v4 = vec_ld( 0, vertPtr3 );
1664 v6 = vec_ld( 0, vertPtr4 );
1665
1666 // transpose into X Y Z vectors
1667 v1 = vec_mergeh( v0, v4 );
1668 v3 = vec_mergeh( v2, v6 );
1669 v5 = vec_mergel( v0, v4 );
1670 v7 = vec_mergel( v2, v6 );
1671
1672 vecSrcX1 = vec_mergeh( v1, v3 );
1673 vecSrcY1 = vec_mergel( v1, v3 );
1674 vecSrcZ1 = vec_mergeh( v5, v7 );
1675
1676 // now calculate dot product
1677 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
1678 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
1679 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
1680
1681 // store results
1682 vec_st( vecSrcZ1, 0, &dst[i] );
1683 }
1684
1685 for ( ; i < count; i++ ) {
1686 dst[i] = constant * src[i].xyz;
1687 }
1688 }
1689
1690 #endif /* DRAWVERT_PADDED */
1691
1692 /*
1693 ============
1694 idSIMD_AltiVec::Dot
1695
1696 dst[i] = constant.Normal() * src[i] + constant[3];
1697 ============
1698 */
1699 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
1700 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
1701
1702 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
1703 register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
1704 register vector float zeroVector = (vector float)(0.0);
1705 register vector float vecConstX, vecConstY, vecConstZ;
1706 register vector float vecConst3;
1707
1708 idVec3 constNormal = constant.Normal();
1709 float const3 = constant[3];
1710
1711 // permute vectors
1712 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
1713 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
1714
1715 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
1716 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
1717
1718 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
1719 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
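	// four packed idVec3s occupy three quadwords ( x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3 );
	// the permX/permY/permZ pairs gather the X, Y and Z components out of those three
	// quadwords in two vec_perm passes, leaving { x0 x1 x2 x3 } etc. in separate vectors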
1720
1721 int i;
1722
1723 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1724 vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
1725 vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
1726 vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
1727
1728 // populate const vec
1729 vecConstX = vec_splat( vecLd1, 0 );
1730 vecConstY = vec_splat( vecLd1, 1 );
1731 vecConstZ = vec_splat( vecLd1, 2 );
1732
1733 // put constant to add in vector
1734 vecConst3 = loadSplatUnalignedScalar( &const3 );
1735
1736 // handle unaligned case at beginning
1737 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1738 dst[i] = constant.Normal() * src[i] + constant[3];
1739 }
1740
1741 const float *addr = src[i].ToFloatPtr();
1742 vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
1743 vector float vecOld = vec_ld( 0, addr );
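	// standard AltiVec unaligned-load idiom: vec_lvsl builds a permute control from the address,
	// each iteration loads overlapping aligned quadwords ( offsets 15, 31, ... ) and vec_perm
	// shifts the wanted bytes into place; vecOld carries the last quadword into the next
	// iteration so every quadword of the source is loaded only once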
1744
1745 for ( ; i+7 < count; i += 8 ) {
1746 float *vecPtr = (float*)( addr + (i*3) );
1747 vector float v0, v1, v2, v3, v4, v5;
1748
1749 v0 = vecOld; //vec_ld( 0, vecPtr );
1750 v1 = vec_ld( 15, vecPtr );
1751 v2 = vec_ld( 31, vecPtr );
1752 v3 = vec_ld( 47, vecPtr );
1753 v4 = vec_ld( 63, vecPtr );
1754 v5 = vec_ld( 79, vecPtr );
1755 vecOld = vec_ld( 95, vecPtr );
1756
1757 vecLd1 = vec_perm( v0, v1, permVec );
1758 vecLd2 = vec_perm( v1, v2, permVec );
1759 vecLd3 = vec_perm( v2, v3, permVec );
1760
1761 vecLd4 = vec_perm( v3, v4, permVec );
1762 vecLd5 = vec_perm( v4, v5, permVec );
1763 vecLd6 = vec_perm( v5, vecOld, permVec );
1764
1765 // permute into X Y Z vectors
1766 vecX = vec_perm( vecLd1, vecLd2, permX1 );
1767 vecY = vec_perm( vecLd1, vecLd2, permY1 );
1768 vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
1769 vecX = vec_perm( vecX, vecLd3, permX2 );
1770 vecY = vec_perm( vecY, vecLd3, permY2 );
1771 vecZ = vec_perm( vecZ, vecLd3, permZ2 );
1772
1773 vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
1774 vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
1775 vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
1776 vecX2 = vec_perm( vecX2, vecLd6, permX2 );
1777 vecY2 = vec_perm( vecY2, vecLd6, permY2 );
1778 vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
1779
1780 // calculate dot product
1781 vecX = vec_madd( vecX, vecConstX, zeroVector );
1782 vecY = vec_madd( vecY, vecConstY, vecX );
1783 vecZ = vec_madd( vecZ, vecConstZ, vecY );
1784
1785 vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
1786 vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
1787 vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
1788
1789 // add in constant[3]
1790 vecZ = vec_add( vecZ, vecConst3 );
1791 vecZ2 = vec_add( vecZ2, vecConst3 );
1792
1793 // store out results
1794 ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
1795 }
1796
1797 //cleanup
1798 for ( ; i < count; i++ ) {
1799 dst[i] = constNormal * src[i] + const3;
1800 }
1801 }
1802
1803 /*
1804 ============
1805 idSIMD_AltiVec::Dot
1806
1807 dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1808 ============
1809 */
1810 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
1811 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
1812
1813 // check plane size
1814 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
1815
1816 float constVal[4];
1817 float srcVal[4];
1818
1819 int i;
1820 const float *constPtr = constant.ToFloatPtr();
1821
1822 register vector float vecX, vecY, vecZ, vecI3;
1823 register vector float vecX2, vecY2, vecZ2, vecI32;
1824
1825 vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
1826 vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
1827 register vector float zeroVector = (vector float)(0.0);
1828 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1829
1830 constVal[0] = *(constPtr);
1831 constVal[1] = *(constPtr+1);
1832 constVal[2] = *(constPtr+2);
1833 constVal[3] = *(constPtr+3);
1834
1835 // populate const vector
1836 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1837 vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
1838 vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
1839 vector float vecConst = vec_perm( v0, v1, constPerm );
1840
1841 vecConstX = vec_splat( vecConst, 0 );
1842 vecConstY = vec_splat( vecConst, 1 );
1843 vecConstZ = vec_splat( vecConst, 2 );
1844 vecConstI3 = vec_splat( vecConst, 3 );
1845
1846 // handle unaligned case at beginning
1847 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1848 dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1849 }
1850
1851 const float *srcPtr = src[i].ToFloatPtr();
1852 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
1853 vector float vecOld = vec_ld( 0, srcPtr );
1854
1855 for ( ; i+7 < count; i += 8 ) {
1856 float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
1857 vector float v0, v1, v2, v3, v4, v5, v6, v7;
1858
1859 v0 = vecOld; // vec_ld( 0, planePtr );
1860 v1 = vec_ld( 15, planePtr );
1861 v2 = vec_ld( 31, planePtr );
1862 v3 = vec_ld( 47, planePtr );
1863 v4 = vec_ld( 63, planePtr );
1864 v5 = vec_ld( 79, planePtr );
1865 v6 = vec_ld( 95, planePtr );
1866 v7 = vec_ld( 111, planePtr );
1867 vecOld = vec_ld( 127, planePtr );
1868
1869 vecPlaneLd1 = vec_perm( v0, v1, permVec );
1870 vecPlaneLd2 = vec_perm( v1, v2, permVec );
1871 vecPlaneLd3 = vec_perm( v2, v3, permVec );
1872 vecPlaneLd4 = vec_perm( v3, v4, permVec );
1873
1874 vecPlaneLd5 = vec_perm( v4, v5, permVec );
1875 vecPlaneLd6 = vec_perm( v5, v6, permVec );
1876 vecPlaneLd7 = vec_perm( v6, v7, permVec );
1877 vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
1878
1879 // permute into X Y Z vectors; since idPlane is exactly 16 bytes this is
1880 // just a 4x4 matrix transpose
1881 v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
1882 v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
1883 v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
1884 v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
1885
1886 vecX = vec_mergeh( v0, v1 );
1887 vecY = vec_mergel( v0, v1 );
1888 vecZ = vec_mergeh( v2, v3 );
1889 vecI3 = vec_mergel( v2, v3 );
1890
1891 v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
1892 v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
1893 v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
1894 v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
1895
1896 vecX2 = vec_mergeh( v4, v5 );
1897 vecY2 = vec_mergel( v4, v5 );
1898 vecZ2 = vec_mergeh( v6, v7 );
1899 vecI32 = vec_mergel( v6, v7 );
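		// after the transpose the lanes line up as vecX = { x0 x1 x2 x3 }, vecY = { y0 y1 y2 y3 },
		// vecZ = { z0 z1 z2 z3 } and vecI3 = { d0 d1 d2 d3 } ( the plane distances ),
		// and likewise for the second set of four planes in vecX2 / vecY2 / vecZ2 / vecI32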
1900
1901 // do calculation
1902 v4 = vec_madd( vecConstX, vecX, zeroVector );
1903 v5 = vec_madd( vecConstY, vecY, v4 );
1904 v6 = vec_madd( vecConstZ, vecZ, v5 );
1905 v7 = vec_madd( vecConstI3, vecI3, v6 );
1906
1907 v0 = vec_madd( vecConstX, vecX2, zeroVector );
1908 v1 = vec_madd( vecConstY, vecY2, v0 );
1909 v2 = vec_madd( vecConstZ, vecZ2, v1 );
1910 v3 = vec_madd( vecConstI3, vecI32, v2 );
1911
1912 //store result
1913 ALIGNED_STORE2( &dst[i], v7, v3 );
1914 }
1915
1916 // cleanup
1917 for ( ; i < count; i++ ) {
1918 //dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
1919 srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
1920 srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
1921 srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
1922 srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
1923 dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
1924 }
1925 }
1926
1927
1928 #ifndef DRAWVERT_PADDED
1929 /*
1930 ============
1931 idSIMD_AltiVec::Dot
1932
1933 dst[i] = constant.Normal() * src[i].xyz + constant[3];
1934 ============
1935 */
1936 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
1937 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
1938
1939 // idDrawVert size is 60 bytes
1940 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
1941
1942 int i;
1943 const float *constPtr = constant.ToFloatPtr();
1944 const float *srcPtr = src[0].xyz.ToFloatPtr();
1945
1946 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
1947 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
1948 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
1949 register vector float vecDest1;
1950 register vector float zeroVector = (vector float)(0.0);
1951 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
1952
1953 float constVal[4];
1954 float srcVal[3];
1955
1956 constVal[0] = *(constPtr+0);
1957 constVal[1] = *(constPtr+1);
1958 constVal[2] = *(constPtr+2);
1959 constVal[3] = *(constPtr+3);
1960
1961 // populate const vec
1962 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
1963 v0 = vec_ld( 0, constant.ToFloatPtr() );
1964 v1 = vec_ld( 15, constant.ToFloatPtr() );
1965 v0 = vec_perm( v0, v1, constPerm );
1966
1967 vecConstX = vec_splat( v0, 0 );
1968 vecConstY = vec_splat( v0, 1 );
1969 vecConstZ = vec_splat( v0, 2 );
1970 vecConstI3 = vec_splat( v0, 3 );
1971
1972 // handle unaligned case at beginning
1973 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
1974 dst[i] = constant.Normal() * src[i].xyz + constant[3];
1975 }
1976
1977 // every fourth vertex has the same alignment, so these permute vectors can be computed once. Make sure
1978 // at least four vertices remain so we don't run off the end of the array
1979 if ( i+3 < count ) {
1980 vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1981 vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1982 vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1983 vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
1984 }
1985
1986 for ( ; i+3 < count; i+=4 ) {
1987 const float *vertPtr = src[i].xyz.ToFloatPtr();
1988 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
1989 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
1990 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
1991
1992 v0 = vec_ld( 0, vertPtr );
1993 v1 = vec_ld( 11, vertPtr );
1994 v2 = vec_ld( 0, vertPtr2 );
1995 v3 = vec_ld( 11, vertPtr2 );
1996 v4 = vec_ld( 0, vertPtr3 );
1997 v5 = vec_ld( 11, vertPtr3 );
1998 v6 = vec_ld( 0, vertPtr4 );
1999 v7 = vec_ld( 11, vertPtr4 );
2000
2001 v0 = vec_perm( v0, v1, vertPerm1 );
2002 v2 = vec_perm( v2, v3, vertPerm2 );
2003 v4 = vec_perm( v4, v5, vertPerm3 );
2004 v6 = vec_perm( v6, v7, vertPerm4 );
2005
2006 // transpose into X Y Z vectors
2007 v1 = vec_mergeh( v0, v4 );
2008 v3 = vec_mergeh( v2, v6 );
2009 v5 = vec_mergel( v0, v4 );
2010 v7 = vec_mergel( v2, v6 );
2011
2012 vecSrcX1 = vec_mergeh( v1, v3 );
2013 vecSrcY1 = vec_mergel( v1, v3 );
2014 vecSrcZ1 = vec_mergeh( v5, v7 );
2015
2016 // now calculate dot product
2017 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2018 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2019 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2020 vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2021
2022 // store results
2023 vec_st( vecDest1, 0, &dst[i] );
2024 }
2025
2026 // cleanup
2027 for ( ; i < count; i++ ) {
2028 srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2029 srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2030 srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2031 // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2032
2033 dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2034 dst[i] += constVal[3];
2035 }
2036 }
2037 #else
2038 /*
2039 ============
2040 idSIMD_AltiVec::Dot
2041
2042 dst[i] = constant.Normal() * src[i].xyz + constant[3];
2043 ============
2044 */
2045 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
2046 //#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
2047
2048 // idDrawVert size is 64 bytes (padded)
2049 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
2050
2051 int i;
2052 const float *constPtr = constant.ToFloatPtr();
2053 const float *srcPtr = src[0].xyz.ToFloatPtr();
2054
2055 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
2056 register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
2057 register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
2058 register vector float vecDest1;
2059 register vector float zeroVector = (vector float)(0.0);
2060 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
2061
2062 float constVal[4];
2063 float srcVal[3];
2064
2065 constVal[0] = *(constPtr+0);
2066 constVal[1] = *(constPtr+1);
2067 constVal[2] = *(constPtr+2);
2068 constVal[3] = *(constPtr+3);
2069
2070 // populate const vec
2071 vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
2072 v0 = vec_ld( 0, constant.ToFloatPtr() );
2073 v1 = vec_ld( 15, constant.ToFloatPtr() );
2074 v0 = vec_perm( v0, v1, constPerm );
2075
2076 vecConstX = vec_splat( v0, 0 );
2077 vecConstY = vec_splat( v0, 1 );
2078 vecConstZ = vec_splat( v0, 2 );
2079 vecConstI3 = vec_splat( v0, 3 );
2080
2081 // handle unaligned case at beginning
2082 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2083 dst[i] = constant.Normal() * src[i].xyz + constant[3];
2084 }
2085
2086 for ( ; i+3 < count; i+=4 ) {
2087 const float *vertPtr = src[i].xyz.ToFloatPtr();
2088 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
2089 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
2090 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
2091
2092 v0 = vec_ld( 0, vertPtr );
2093 v2 = vec_ld( 0, vertPtr2 );
2094 v4 = vec_ld( 0, vertPtr3 );
2095 v6 = vec_ld( 0, vertPtr4 );
2096
2097 // transpose into X Y Z vectors
2098 v1 = vec_mergeh( v0, v4 );
2099 v3 = vec_mergeh( v2, v6 );
2100 v5 = vec_mergel( v0, v4 );
2101 v7 = vec_mergel( v2, v6 );
2102
2103 vecSrcX1 = vec_mergeh( v1, v3 );
2104 vecSrcY1 = vec_mergel( v1, v3 );
2105 vecSrcZ1 = vec_mergeh( v5, v7 );
2106
2107 // now calculate dot product
2108 vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
2109 vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
2110 vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
2111 vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
2112
2113 // store results
2114 vec_st( vecDest1, 0, &dst[i] );
2115 }
2116
2117 // cleanup
2118 for ( ; i < count; i++ ) {
2119 srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
2120 srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
2121 srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
2122 // dst[i] = constant.Normal() * src[i].xyz + constant[3];
2123
2124 dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
2125 dst[i] += constVal[3];
2126 }
2127 }
2128
2129 #endif /* DRAWVERT_PADDED */
2130
2131 /*
2132 ============
2133 idSIMD_AltiVec::Dot
2134
2135 dst[i] = src0[i] * src1[i];
2136 ============
2137 */
2138 void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
2139 //#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
2140
2141 int i;
2142 float src0Val[3];
2143 float src1Val[3];
2144
2145 register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
2146 vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
2147 register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
2148 register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
2149 register vector float zeroVector = (vector float)(0.0);
2150 // permute vectors
2151 register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
2152 register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2153 register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
2154 register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2155 register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
2156 register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2157
2158 // handle unaligned case at beginning
2159 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
2160 dst[i] = src0[i] * src1[i];
2161 }
2162
2163 const float *src0Ptr = src0[i].ToFloatPtr();
2164 const float *src1Ptr = src1[i].ToFloatPtr();
2165 vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
2166 vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
2167 vector float vecOld0 = vec_ld( 0, src0Ptr );
2168 vector float vecOld1 = vec_ld( 0, src1Ptr );
2169
2170 for ( i = 0; i+7 < count; i += 8 ) {
2171 float *s0Ptr = (float*)( src0Ptr + (i*3) );
2172 float *s1Ptr = (float*)( src1Ptr + (i*3) );
2173
2174 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
2175 v0 = vecOld0;
2176 v1 = vec_ld( 15, s0Ptr );
2177 v2 = vec_ld( 31, s0Ptr );
2178 v3 = vec_ld( 47, s0Ptr );
2179 v4 = vec_ld( 63, s0Ptr );
2180 v5 = vec_ld( 79, s0Ptr );
2181 vecOld0 = vec_ld( 95, s0Ptr );
2182
2183 v6 = vecOld1;
2184 v7 = vec_ld( 15, s1Ptr );
2185 v8 = vec_ld( 31, s1Ptr );
2186 v9 = vec_ld( 47, s1Ptr );
2187 v10 = vec_ld( 63, s1Ptr );
2188 v11 = vec_ld( 79, s1Ptr );
2189 vecOld1 = vec_ld( 95, s1Ptr );
2190
2191 vecLd1 = vec_perm( v0, v1, permVec1 );
2192 vecLd2 = vec_perm( v1, v2, permVec1 );
2193 vecLd3 = vec_perm( v2, v3, permVec1 );
2194 vecLd4 = vec_perm( v3, v4, permVec1 );
2195 vecLd5 = vec_perm( v4, v5, permVec1 );
2196 vecLd6 = vec_perm( v5, vecOld0, permVec1 );
2197
2198 vecLd7 = vec_perm( v6, v7, permVec2 );
2199 vecLd8 = vec_perm( v7, v8, permVec2 );
2200 vecLd9 = vec_perm( v8, v9, permVec2 );
2201 vecLd10 = vec_perm( v9, v10, permVec2 );
2202 vecLd11 = vec_perm( v10, v11, permVec2 );
2203 vecLd12 = vec_perm( v11, vecOld1, permVec2 );
2204
2205 // permute into X Y Z vectors
2206 vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
2207 vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
2208 vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
2209 vecX0 = vec_perm( vecX0, vecLd3, permX2 );
2210 vecY0 = vec_perm( vecY0, vecLd3, permY2 );
2211 vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
2212
2213 vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
2214 vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
2215 vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
2216 vecX02 = vec_perm( vecX02, vecLd6, permX2 );
2217 vecY02 = vec_perm( vecY02, vecLd6, permY2 );
2218 vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
2219
2220 vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
2221 vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
2222 vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
2223 vecX1 = vec_perm( vecX1, vecLd9, permX2 );
2224 vecY1 = vec_perm( vecY1, vecLd9, permY2 );
2225 vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
2226
2227 vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
2228 vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
2229 vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
2230 vecX12 = vec_perm( vecX12, vecLd12, permX2 );
2231 vecY12 = vec_perm( vecY12, vecLd12, permY2 );
2232 vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
2233
2234 // do multiply
2235 vecX0 = vec_madd( vecX0, vecX1, zeroVector );
2236 vecY0 = vec_madd( vecY0, vecY1, vecX0 );
2237 vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
2238 vecX02 = vec_madd( vecX02, vecX12, zeroVector );
2239 vecY02 = vec_madd( vecY02, vecY12, vecX02 );
2240 vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
2241
2242 // store out results
2243 ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
2244 }
2245
2246 // cleanup
2247 for ( ; i < count; i++ ) {
2248 // dst[i] = src0[i] * src1[i];
2249 src0Val[0] = *( src0Ptr + (i*3) + 0 );
2250 src0Val[1] = *( src0Ptr + (i*3) + 1 );
2251 src0Val[2] = *( src0Ptr + (i*3) + 2 );
2252
2253 src1Val[0] = *( src1Ptr + (i*3) + 0 );
2254 src1Val[1] = *( src1Ptr + (i*3) + 1 );
2255 src1Val[2] = *( src1Ptr + (i*3) + 2 );
2256
2257 dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
2258 }
2259 }
2260
2261 /*
2262 ============
2263 idSIMD_AltiVec::Dot
2264
2265 dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
2266 ============
2267 */
2268 void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
2269 dot = 0.0f;
2270
2271 register vector float v0, v1, v2, v3;
2272 register vector float zeroVector;
2273 register vector float runningTotal1, runningTotal2;
2274 //src0
2275 register vector float v0_low, v0_hi, v2_low, v2_hi;
2276 //src1
2277 register vector float v1_low, v1_hi, v3_low, v3_hi;
2278 //permute vectors
2279 register vector unsigned char permVec1, permVec2;
2280 vector unsigned char oneCharVector = (vector unsigned char)(1);
2281
2282 int i = 0;
2283
2284 runningTotal1 = (vector float)(0.0);
2285 runningTotal2 = (vector float)(0.0);
2286 zeroVector = (vector float)(0.0);
2287
2288 if ( count >= 8 ) {
2289 //calculate permute and do loads
2290 permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
2291 permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
2292 v2_hi = vec_ld( 0, &src1[i] );
2293 v3_hi = vec_ld( 0, &src2[i] );
2294
2295 //vectorize!
2296 for ( ; i+7 < count; i += 8 ) {
2297 //load sources
2298 v0_low = v2_hi;
2299 v0_hi = vec_ld( 15, &src1[i] );
2300 v2_low = v0_hi;
2301 v2_hi = vec_ld( 31, &src1[i] );
2302
2303 v1_low = v3_hi;
2304 v1_hi = vec_ld( 15, &src2[i] );
2305 v3_low = v1_hi;
2306 v3_hi = vec_ld( 31, &src2[i] );
2307
2308 v0 = vec_perm( v0_low, v0_hi, permVec1 );
2309 v1 = vec_perm( v1_low, v1_hi, permVec2 );
2310 v2 = vec_perm( v2_low, v2_hi, permVec1 );
2311 v3 = vec_perm( v3_low, v3_hi, permVec2 );
2312
2313 //multiply together and keep running sum
2314 runningTotal1 = vec_madd( v0, v1, runningTotal1 );
2315 runningTotal2 = vec_madd( v2, v3, runningTotal2 );
2316 }
2317
2318 runningTotal1 = vec_add( runningTotal1, runningTotal2 );
2319
2320 // sum across the vector
2321 v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
2322 v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
2323 runningTotal1 = vec_splat( v1, 0 );
2324 vec_ste( runningTotal1, 0, &dot );
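	// horizontal reduction: vec_sld rotates the running total by 8 and then 4 bytes and the
	// adds fold all four lanes into one sum; vec_splat copies it to every lane so vec_ste
	// can store a single float to the ( possibly unaligned ) address of dot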
2325 }
2326
2327 // handle cleanup. When profiling the game we found that most calls to this function use small counts, so it
2328 // spends a lot of its time in this scalar code, which is already very fast (on the order of one time-base tick)
2329 // for counts under 50, so there is little point in adding more vector code here
2330 for ( ; i < count ; i++ ) {
2331 dot += src1[i] * src2[i];
2332 }
2333
2334 }
2335 #endif /* ENABLE_DOT */
2336
2337 #ifdef ENABLE_COMPARES
2338
2339 /*
2340 ============
2341 idSIMD_AltiVec::CmpGT
2342
2343 dst[i] = src0[i] > constant;
2344 ============
2345 */
2346
2347 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
2348 //#define OPER(X) dst[(X)] = src0[(X)] > constant;
2349
2350 register vector float v0, v1, v2, v3;
2351 register vector bool int vr1, vr2, vr3, vr4;
2352 register vector bool short vs1, vs2;
2353 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2354 register vector unsigned char vc1;
2355 register vector bool char vbc1;
2356 register vector float constVec;
2357 register vector unsigned char oneVector = (vector unsigned char)(1);
2358 register vector unsigned char permVec;
2359 int i;
2360
2361 //handle unaligned at start
2362 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2363 dst[i] = src0[i] > constant;
2364 }
2365
2366 //splat constant into a vector
2367 constVec = loadSplatUnalignedScalar( &constant );
2368
2369 //calculate permute and do loads
2370 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2371 v3_hi = vec_ld( 0, &src0[i] );
2372
2373 //vectorize!
2374 for ( ; i+15 < count; i += 16 ) {
2375 // load values
2376 v0_low = v3_hi;
2377 v0_hi = vec_ld( 15, &src0[i] );
2378 v1_low = v0_hi;
2379 v1_hi = vec_ld( 31, &src0[i] );
2380 v2_low = v1_hi;
2381 v2_hi = vec_ld( 47, &src0[i] );
2382 v3_low = v2_hi;
2383 v3_hi = vec_ld( 63, &src0[i] );
2384
2385 //permute into the vectors we want
2386 v0 = vec_perm( v0_low, v0_hi, permVec );
2387 v1 = vec_perm( v1_low, v1_hi, permVec );
2388 v2 = vec_perm( v2_low, v2_hi, permVec );
2389 v3 = vec_perm( v3_low, v3_hi, permVec );
2390
2391 //do comparison
2392 vr1 = vec_cmpgt( v0, constVec );
2393 vr2 = vec_cmpgt( v1, constVec );
2394 vr3 = vec_cmpgt( v2, constVec );
2395 vr4 = vec_cmpgt( v3, constVec );
2396
2397 // pack results into shorts
2398 vs1 = vec_pack(vr1, vr2);
2399 vs2 = vec_pack(vr3, vr4);
2400
2401 // pack results into byte
2402 vbc1 = vec_pack(vs1, vs2);
2403
2404 //AND with 1 to get true=1 not true=255
2405 vc1 = vec_and( vbc1, oneVector );
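		// vec_cmpgt leaves 0xFFFFFFFF / 0x00000000 per float lane; the two vec_pack steps
		// narrow 16 of those 32-bit masks down to 16 bytes, and the AND with the vector of 1s
		// turns 0xFF into 1 so each dst byte ends up as exactly 0 or 1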
2406
2407 //store results
2408 vec_st( vc1, 0, &dst[i] );
2409 }
2410
2411 //handle cleanup
2412 for ( ; i < count ; i++ ) {
2413 dst[i] = src0[i] > constant;
2414 }
2415 }
2416
2417
2418 /*
2419 ============
2420 idSIMD_AltiVec::CmpGT
2421
2422 dst[i] |= ( src0[i] > constant ) << bitNum;
2423 ============
2424 */
2425 void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2426 //#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
2427
2428 // Temp vector registers
2429 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2430 register vector bool short vtbs0, vtbs1;
2431 register vector bool char vtbc0;
2432 register vector unsigned char vtuc0;
2433 register vector unsigned char permVec, permVec2;
2434
2435 // dest vectors
2436 register vector unsigned char vd;
2437 // bitNum vectors
2438 register vector unsigned char bitNumVec;
2439 // src0 vectors
2440 register vector float vs0, vs1, vs2, vs3;
2441 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2442 // constant vector
2443 register vector float constVec;
2444 // all one's
2445 register vector unsigned char oneVector = (vector unsigned char)(1);
2446 int i = 0;
2447
2448 //handle unaligned at start
2449 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2450 dst[i] |= ( src0[i] > constant ) << bitNum;
2451 }
2452
2453 //splat constant into a vector
2454 constVec = loadSplatUnalignedScalar( &constant );
2455
2456 //bitNum is unaligned.
2457 permVec2 = vec_lvsl( 0, &bitNum );
2458 vtuc0 = vec_ld( 0, &bitNum );
2459 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2460 bitNumVec = vec_splat( bitNumVec, 0 );
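	// the sequence above pulls the single byte bitNum out of memory and splats it across all
	// 16 lanes so that vec_sl below can shift every 0/1 compare result left by bitNum,
	// mirroring the scalar ( src0[i] > constant ) << bitNum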
2461
2462 //calculate permute and do loads
2463 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2464 vs3_hi = vec_ld( 0, &src0[i] );
2465
2466 //vectorize!
2467 for ( ; i+15 < count; i += 16 ) {
2468 //load sources (floats)
2469 vs0_low = vs3_hi;
2470 vs0_hi = vec_ld( 15, &src0[i] );
2471 vs1_low = vs0_hi;
2472 vs1_hi = vec_ld( 31, &src0[i] );
2473 vs2_low = vs1_hi;
2474 vs2_hi = vec_ld( 47, &src0[i] );
2475 vs3_low = vs2_hi;
2476 vs3_hi = vec_ld( 63, &src0[i] );
2477
2478 //permute into the vectors we want
2479 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2480 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2481 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2482 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2483
2484 //load dest (bytes) as unsigned char
2485 vd = vec_ld( 0, &dst[i] );
2486
2487 // do comparison and get bool int result
2488 vtbi0 = vec_cmpgt( vs0, constVec );
2489 vtbi1 = vec_cmpgt( vs1, constVec );
2490 vtbi2 = vec_cmpgt( vs2, constVec );
2491 vtbi3 = vec_cmpgt( vs3, constVec );
2492
2493 // pack results into shorts
2494 vtbs0 = vec_pack(vtbi0, vtbi1);
2495 vtbs1 = vec_pack(vtbi2, vtbi3);
2496
2497 // pack results into byte
2498 vtbc0 = vec_pack(vtbs0, vtbs1);
2499
2500 //and with 1 to get true=1 instead of true=255
2501 vtuc0 = vec_and(vtbc0, oneVector);
2502 vtuc0 = vec_sl(vtuc0, bitNumVec );
2503
2504 //or with original
2505 vd = vec_or( vd, vtuc0 );
2506
2507 vec_st( vd, 0, &dst[i] );
2508 }
2509
2510 //handle cleanup
2511 for ( ; i < count ; i++ ) {
2512 dst[i] |= ( src0[i] > constant ) << bitNum;
2513 }
2514 }
2515
2516 /*
2517 ============
2518 idSIMD_AltiVec::CmpGE
2519
2520 dst[i] = src0[i] >= constant;
2521 ============
2522 */
2523 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
2524
2525 register vector float v0, v1, v2, v3;
2526 register vector bool int vr1, vr2, vr3, vr4;
2527 register vector bool short vs1, vs2;
2528 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2529 register vector unsigned char vc1;
2530 register vector bool char vbc1;
2531 register vector float constVec;
2532 register vector unsigned char oneVector = (vector unsigned char)(1);
2533 register vector unsigned char permVec;
2534 int i = 0;
2535
2536 //handle unaligned at start
2537 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2538 dst[i] = src0[i] >= constant;
2539 }
2540
2541 //splat constant into a vector
2542 constVec = loadSplatUnalignedScalar( &constant );
2543
2544 //calculate permute and do loads
2545 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2546 v3_hi = vec_ld( 0, &src0[i] );
2547
2548 //vectorize!
2549 for ( ; i+15 < count; i += 16 ) {
2550 // load values
2551 v0_low = v3_hi;
2552 v0_hi = vec_ld( 15, &src0[i] );
2553 v1_low = v0_hi;
2554 v1_hi = vec_ld( 31, &src0[i] );
2555 v2_low = v1_hi;
2556 v2_hi = vec_ld( 47, &src0[i] );
2557 v3_low = v2_hi;
2558 v3_hi = vec_ld( 63, &src0[i] );
2559
2560 //permute into the vectors we want
2561 v0 = vec_perm( v0_low, v0_hi, permVec );
2562 v1 = vec_perm( v1_low, v1_hi, permVec );
2563 v2 = vec_perm( v2_low, v2_hi, permVec );
2564 v3 = vec_perm( v3_low, v3_hi, permVec );
2565
2566 //do comparison
2567 vr1 = vec_cmpge( v0, constVec );
2568 vr2 = vec_cmpge( v1, constVec );
2569 vr3 = vec_cmpge( v2, constVec );
2570 vr4 = vec_cmpge( v3, constVec );
2571
2572 // pack results into shorts
2573 vs1 = vec_pack(vr1, vr2);
2574 vs2 = vec_pack(vr3, vr4);
2575
2576 // pack results into byte
2577 vbc1 = vec_pack(vs1, vs2);
2578
2579 //AND with 1 to get true=1 not true=255
2580 vc1 = vec_and( vbc1, oneVector );
2581
2582 //store results
2583 vec_st( vc1, 0, &dst[i] );
2584 }
2585
2586 //handle cleanup
2587 for ( ; i < count ; i++ ) {
2588 dst[i] = src0[i] >= constant;
2589 }
2590 }
2591
2592 /*
2593 ============
2594 idSIMD_AltiVec::CmpGE
2595
2596 dst[i] |= ( src0[i] >= constant ) << bitNum;
2597 ============
2598 */
2599 void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2600 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2601 register vector bool short vtbs0, vtbs1;
2602 register vector bool char vtbc0;
2603 register vector unsigned char vtuc0;
2604 register vector unsigned char permVec, permVec2;
2605
2606 // dest vectors
2607 register vector unsigned char vd;
2608 // bitNum vectors
2609 register vector unsigned char bitNumVec;
2610 // src0 vectors
2611 register vector float vs0, vs1, vs2, vs3;
2612 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2613 // constant vector
2614 register vector float constVec;
2615 // all one's
2616 register vector unsigned char oneVector = (vector unsigned char)(1);
2617 int i = 0;
2618
2619 //handle unaligned at start
2620 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2621 dst[i] |= ( src0[i] >= constant ) << bitNum;
2622 }
2623
2624 //splat constant into a vector
2625 constVec = loadSplatUnalignedScalar( &constant );
2626
2627 //bitNum is unaligned.
2628 permVec2 = vec_lvsl( 0, &bitNum );
2629 vtuc0 = vec_ld( 0, &bitNum );
2630 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2631 bitNumVec = vec_splat( bitNumVec, 0 );
2632
2633 //calculate permute and do loads
2634 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2635 vs3_hi = vec_ld( 0, &src0[i] );
2636
2637 //vectorize!
2638 for ( ; i+15 < count; i += 16 ) {
2639 //load sources (floats)
2640 vs0_low = vs3_hi;
2641 vs0_hi = vec_ld( 15, &src0[i] );
2642 vs1_low = vs0_hi;
2643 vs1_hi = vec_ld( 31, &src0[i] );
2644 vs2_low = vs1_hi;
2645 vs2_hi = vec_ld( 47, &src0[i] );
2646 vs3_low = vs2_hi;
2647 vs3_hi = vec_ld( 63, &src0[i] );
2648
2649 //permute into the vectors we want
2650 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2651 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2652 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2653 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2654
2655 //load dest (bytes) as unsigned char
2656 vd = vec_ld( 0, &dst[i] );
2657
2658 // do comparison and get bool int result
2659 vtbi0 = vec_cmpge( vs0, constVec );
2660 vtbi1 = vec_cmpge( vs1, constVec );
2661 vtbi2 = vec_cmpge( vs2, constVec );
2662 vtbi3 = vec_cmpge( vs3, constVec );
2663
2664 // pack results into shorts
2665 vtbs0 = vec_pack(vtbi0, vtbi1);
2666 vtbs1 = vec_pack(vtbi2, vtbi3);
2667
2668 // pack results into byte
2669 vtbc0 = vec_pack(vtbs0, vtbs1);
2670
2671 //and with 1 to get true=1 instead of true=255
2672 vtuc0 = vec_and(vtbc0, oneVector);
2673 vtuc0 = vec_sl(vtuc0, bitNumVec );
2674
2675 //or with original
2676 vd = vec_or( vd, vtuc0 );
2677
2678 vec_st( vd, 0, &dst[i] );
2679 }
2680
2681 //handle cleanup
2682 for ( ; i < count ; i++ ) {
2683 dst[i] |= ( src0[i] >= constant ) << bitNum;
2684 }
2685 }
2686
2687
2688 /*
2689 ============
2690 idSIMD_AltiVec::CmpLT
2691
2692 dst[i] = src0[i] < constant;
2693 ============
2694 */
2695 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
2696 //#define OPER(X) dst[(X)] = src0[(X)] < constant;
2697 register vector float v0, v1, v2, v3;
2698 register vector bool int vr1, vr2, vr3, vr4;
2699 register vector bool short vs1, vs2;
2700 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2701 register vector unsigned char vc1;
2702 register vector bool char vbc1;
2703 register vector float constVec;
2704 register vector unsigned char oneVector = (vector unsigned char)(1);
2705 register vector unsigned char permVec;
2706 int i = 0;
2707
2708 //handle unaligned at start
2709 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2710 dst[i] = src0[i] < constant;
2711 }
2712
2713 //splat constant into a vector
2714 constVec = loadSplatUnalignedScalar( &constant );
2715
2716 //calculate permute and do loads
2717 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2718 v3_hi = vec_ld( 0, &src0[i] );
2719
2720 //vectorize!
2721 for ( ; i+15 < count; i += 16 ) {
2722 // load values
2723 v0_low = v3_hi;
2724 v0_hi = vec_ld( 15, &src0[i] );
2725 v1_low = v0_hi;
2726 v1_hi = vec_ld( 31, &src0[i] );
2727 v2_low = v1_hi;
2728 v2_hi = vec_ld( 47, &src0[i] );
2729 v3_low = v2_hi;
2730 v3_hi = vec_ld( 63, &src0[i] );
2731
2732 //permute into the vectors we want
2733 v0 = vec_perm( v0_low, v0_hi, permVec );
2734 v1 = vec_perm( v1_low, v1_hi, permVec );
2735 v2 = vec_perm( v2_low, v2_hi, permVec );
2736 v3 = vec_perm( v3_low, v3_hi, permVec );
2737
2738 //do comparison
2739 vr1 = vec_cmplt( v0, constVec );
2740 vr2 = vec_cmplt( v1, constVec );
2741 vr3 = vec_cmplt( v2, constVec );
2742 vr4 = vec_cmplt( v3, constVec );
2743
2744 // pack results into shorts
2745 vs1 = vec_pack(vr1, vr2);
2746 vs2 = vec_pack(vr3, vr4);
2747
2748 // pack results into byte
2749 vbc1 = vec_pack(vs1, vs2);
2750
2751 //AND with 1 to get true=1 not true=255
2752 vc1 = vec_and( vbc1, oneVector );
2753
2754 //store results
2755 vec_st( vc1, 0, &dst[i] );
2756 }
2757
2758 //handle cleanup
2759 for ( ; i < count ; i++ ) {
2760 dst[i] = src0[i] < constant;
2761 }
2762 }
2763
2764 /*
2765 ============
2766 idSIMD_AltiVec::CmpLT
2767
2768 dst[i] |= ( src0[i] < constant ) << bitNum;
2769 ============
2770 */
2771 void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2772 //#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
2773 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2774 register vector bool short vtbs0, vtbs1;
2775 register vector bool char vtbc0;
2776 register vector unsigned char vtuc0;
2777 register vector unsigned char permVec, permVec2;
2778
2779 // dest vectors
2780 register vector unsigned char vd;
2781 // bitNum vectors
2782 register vector unsigned char bitNumVec;
2783 // src0 vectors
2784 register vector float vs0, vs1, vs2, vs3;
2785 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2786 // constant vector
2787 register vector float constVec;
2788 // all one's
2789 register vector unsigned char oneVector = (vector unsigned char)(1);
2790 int i = 0;
2791
2792 //handle unaligned at start
2793 for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2794 dst[i] |= ( src0[i] < constant ) << bitNum;
2795 }
2796
2797 //splat constant into a vector
2798 constVec = loadSplatUnalignedScalar( &constant );
2799
2800 //bitNum is unaligned.
2801 permVec2 = vec_lvsl( 0, &bitNum );
2802 vtuc0 = vec_ld( 0, &bitNum );
2803 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2804 bitNumVec = vec_splat( bitNumVec, 0 );
2805
2806 //calculate permute and do loads
2807 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2808 vs3_hi = vec_ld( 0, &src0[i] );
2809
2810 //vectorize!
2811 for ( ; i+15 < count; i += 16 ) {
2812 //load sources (floats)
2813 vs0_low = vs3_hi;
2814 vs0_hi = vec_ld( 15, &src0[i] );
2815 vs1_low = vs0_hi;
2816 vs1_hi = vec_ld( 31, &src0[i] );
2817 vs2_low = vs1_hi;
2818 vs2_hi = vec_ld( 47, &src0[i] );
2819 vs3_low = vs2_hi;
2820 vs3_hi = vec_ld( 63, &src0[i] );
2821
2822 //permute into the vectors we want
2823 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2824 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2825 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
2826 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
2827
2828 //load dest (bytes) as unsigned char
2829 vd = vec_ld( 0, &dst[i] );
2830
2831 // do comparison and get bool int result
2832 vtbi0 = vec_cmplt( vs0, constVec );
2833 vtbi1 = vec_cmplt( vs1, constVec );
2834 vtbi2 = vec_cmplt( vs2, constVec );
2835 vtbi3 = vec_cmplt( vs3, constVec );
2836
2837 // pack results into shorts
2838 vtbs0 = vec_pack(vtbi0, vtbi1);
2839 vtbs1 = vec_pack(vtbi2, vtbi3);
2840
2841 // pack results into byte
2842 vtbc0 = vec_pack(vtbs0, vtbs1);
2843
2844 //and with 1 to get true=1 instead of true=255
2845 vtuc0 = vec_and(vtbc0, oneVector);
2846 vtuc0 = vec_sl(vtuc0, bitNumVec );
2847
2848 //or with original
2849 vd = vec_or( vd, vtuc0 );
2850
2851 vec_st( vd, 0, &dst[i] );
2852 }
2853
2854 //handle cleanup
2855 for ( ; i < count ; i++ ) {
2856 dst[i] |= ( src0[i] < constant ) << bitNum;
2857 }
2858
2859 }
2860 //#endif
2861
2862 /*
2863 ============
2864 idSIMD_AltiVec::CmpLE
2865
2866 dst[i] = src0[i] <= constant;
2867 ============
2868 */
2869 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
2870 //#define OPER(X) dst[(X)] = src0[(X)] <= constant;
2871 register vector float v0, v1, v2, v3;
2872 register vector bool int vr1, vr2, vr3, vr4;
2873 register vector bool short vs1, vs2;
2874 register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
2875 register vector unsigned char vc1;
2876 register vector bool char vbc1;
2877 register vector float constVec;
2878 register vector unsigned char oneVector = (vector unsigned char)(1);
2879 register vector unsigned char permVec;
2880 int i = 0;
2881
2882 //handle unaligned at start
2883 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2884 dst[i] = src0[i] <= constant;
2885 }
2886
2887 //splat constant into a vector
2888 constVec = loadSplatUnalignedScalar( &constant );
2889
2890 //calculate permute and do loads
2891 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2892 v3_hi = vec_ld( 0, &src0[i] );
2893
2894 //vectorize!
2895 for ( ; i+15 < count; i += 16 ) {
2896 // load values
2897 v0_low = v3_hi;
2898 v0_hi = vec_ld( 15, &src0[i] );
2899 v1_low = v0_hi;
2900 v1_hi = vec_ld( 31, &src0[i] );
2901 v2_low = v1_hi;
2902 v2_hi = vec_ld( 47, &src0[i] );
2903 v3_low = v2_hi;
2904 v3_hi = vec_ld( 63, &src0[i] );
2905
2906 //permute into the vectors we want
2907 v0 = vec_perm( v0_low, v0_hi, permVec );
2908 v1 = vec_perm( v1_low, v1_hi, permVec );
2909 v2 = vec_perm( v2_low, v2_hi, permVec );
2910 v3 = vec_perm( v3_low, v3_hi, permVec );
2911
2912 //do comparison
2913 vr1 = vec_cmple( v0, constVec );
2914 vr2 = vec_cmple( v1, constVec );
2915 vr3 = vec_cmple( v2, constVec );
2916 vr4 = vec_cmple( v3, constVec );
2917
2918 // pack results into shorts
2919 vs1 = vec_pack(vr1, vr2);
2920 vs2 = vec_pack(vr3, vr4);
2921
2922 // pack results into byte
2923 vbc1 = vec_pack(vs1, vs2);
2924
2925 //AND with 1 to get true=1 not true=255
2926 vc1 = vec_and( vbc1, oneVector );
2927
2928 //store results
2929 vec_st( vc1, 0, &dst[i] );
2930 }
2931
2932 //handle cleanup
2933 for ( ; i < count ; i++ ) {
2934 dst[i] = src0[i] <= constant;
2935 }
2936 }
2937
2938 /*
2939 ============
2940 idSIMD_AltiVec::CmpLE
2941
2942 dst[i] |= ( src0[i] <= constant ) << bitNum;
2943 ============
2944 */
2945 void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
2946 //#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
2947 register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
2948 register vector bool short vtbs0, vtbs1;
2949 register vector bool char vtbc0;
2950 register vector unsigned char vtuc0;
2951 register vector unsigned char permVec, permVec2;
2952
2953 // dest vectors
2954 register vector unsigned char vd;
2955 // bitNum vectors
2956 register vector unsigned char bitNumVec;
2957 // src0 vectors
2958 register vector float vs0, vs1, vs2, vs3;
2959 register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
2960 // constant vector
2961 register vector float constVec;
2962 // all one's
2963 register vector unsigned char oneVector = (vector unsigned char)(1);
2964 int i = 0;
2965
2966 //handle unaligned at start
2967 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
2968 dst[i] |= ( src0[i] <= constant ) << bitNum;
2969 }
2970
2971 //splat constant into a vector
2972 constVec = loadSplatUnalignedScalar( &constant );
2973
2974 //bitNum is unaligned.
2975 permVec2 = vec_lvsl( 0, &bitNum );
2976 vtuc0 = vec_ld( 0, &bitNum );
2977 bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
2978 bitNumVec = vec_splat( bitNumVec, 0 );
2979
2980 //calculate permute and do loads
2981 permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
2982 vs3_hi = vec_ld( 0, &src0[i] );
2983
2984 //vectorize!
2985 for ( ; i+15 < count; i += 16 ) {
2986 //load sources (floats)
2987 vs0_low = vs3_hi;
2988 vs0_hi = vec_ld( 15, &src0[i] );
2989 vs1_low = vs0_hi;
2990 vs1_hi = vec_ld( 31, &src0[i] );
2991 vs2_low = vs1_hi;
2992 vs2_hi = vec_ld( 47, &src0[i] );
2993 vs3_low = vs2_hi;
2994 vs3_hi = vec_ld( 63, &src0[i] );
2995
2996 //permute into the vectors we want
2997 vs0 = vec_perm( vs0_low, vs0_hi, permVec );
2998 vs1 = vec_perm( vs1_low, vs1_hi, permVec );
2999 vs2 = vec_perm( vs2_low, vs2_hi, permVec );
3000 vs3 = vec_perm( vs3_low, vs3_hi, permVec );
3001
3002 //load dest (bytes) as unsigned char
3003 vd = vec_ld( 0, &dst[i] );
3004
3005 // do comparison and get bool int result
3006 vtbi0 = vec_cmple( vs0, constVec );
3007 vtbi1 = vec_cmple( vs1, constVec );
3008 vtbi2 = vec_cmple( vs2, constVec );
3009 vtbi3 = vec_cmple( vs3, constVec );
3010
3011 // pack results into shorts
3012 vtbs0 = vec_pack(vtbi0, vtbi1);
3013 vtbs1 = vec_pack(vtbi2, vtbi3);
3014
3015 // pack results into byte
3016 vtbc0 = vec_pack(vtbs0, vtbs1);
3017
3018 //and with 1 to get true=1 instead of true=255
3019 vtuc0 = vec_and(vtbc0, oneVector);
3020 vtuc0 = vec_sl(vtuc0, bitNumVec );
3021
3022 //or with original
3023 vd = vec_or( vd, vtuc0 );
3024
3025 vec_st( vd, 0, &dst[i] );
3026 }
3027
3028 //handle cleanup
3029 for ( ; i < count ; i++ ) {
3030 dst[i] |= ( src0[i] <= constant ) << bitNum;
3031 }
3032 }
3033 #endif /* ENABLE_COMPARES */
3034
3035 #ifdef ENABLE_MINMAX
3036
3037 /*
3038 ============
3039 idSIMD_AltiVec::MinMax
3040 ============
3041 */
3042 void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
3043 min = idMath::INFINITY; max = -idMath::INFINITY;
3044 //#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
3045
3046 register vector float v0, v1, v2, v3;
3047 register vector float maxVec, minVec, tempMin, tempMax;
3048 register vector unsigned char permVec;
3049 register vector float v0_low, v0_hi, v1_low, v1_hi;
3050 vector unsigned char oneCharVector = (vector unsigned char)(1);
3051 int i = 0;
3052
3053 if ( count >= 4 ) {
3054
3055 //calculate permute and do first load to
3056 //get a starting point for min and max
3057 permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
3058 v1_hi = vec_ld( 0, &src[0] );
3059
3060 maxVec = loadSplatUnalignedScalar( &max );
3061 minVec = loadSplatUnalignedScalar( &min );
3062
3063 //vectorize!
3064 for ( ; i+7 < count; i += 8 ) {
3065 //load sources
3066 v0_low = v1_hi;
3067 v0_hi = vec_ld( 15, &src[i] );
3068 v1_low = v0_hi;
3069 v1_hi = vec_ld( 31, &src[i] );
3070 v0 = vec_perm( v0_low, v0_hi, permVec );
3071 v1 = vec_perm( v1_low, v1_hi, permVec );
3072
3073 // minimum
3074 v2 = vec_min( v0, v1 );
3075 minVec = vec_min( minVec, v2 );
3076 // maximum
3077 v3 = vec_max( v0, v1 );
3078 maxVec = vec_max( maxVec, v3 );
3079 }
3080
3081 // minVec and maxVec hold per-lane minima and maxima from the array; now
3082 // reduce across the four lanes to get the single min and max values
3083
3084 tempMin = minVec;
3085 tempMax = maxVec;
3086
3087 // rotate vector around and compare to itself to find the real min/max
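	// vec_sld rotates the vector by 8 and then 4 bytes; taking min/max against the rotated
	// copies leaves the overall result in every lane, so lane 0 can be splatted and stored
	// with vec_ste to the unaligned float references min and max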
3088 tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
3089 tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
3090 tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
3091 tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
3092 minVec = vec_splat( tempMin, 0 );
3093 maxVec = vec_splat( tempMax, 0 );
3094 vec_ste( minVec, 0, &min );
3095 vec_ste( maxVec, 0, &max );
3096 }
3097
3098 //cleanup
3099 for ( ; i < count; i++ ) {
3100 if ( src[i] < min ) {
3101 min = src[i];
3102 }
3103 if ( src[i] > max ) {
3104 max = src[i];
3105 }
3106 }
3107 }
3108
3109 /*
3110 ============
3111 idSIMD_AltiVec::MinMax
3112 ============
3113 */
3114 void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
3115 min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
3116 //#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
3117
3118 idVec2 v;
3119 int i = 0;
3120 int j;
3121
3122 const float *srcPtr = src[0].ToFloatPtr();
3123 register vector float vecLd1, vecLd2, vecLd3, vecLd4;
3124 register vector float vecMin, vecMax;
3125
3126 register vector float v0, v1, v2, v3;
3127
3128 if ( count > 4 ) {
3129
3130 vecMin = (vector float)(FLT_MAX);
3131 vecMax = (vector float)(FLT_MIN);
3132
3133 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
3134 vector float vecOld = vec_ld( 0, srcPtr );
3135
3136 for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
3137 // load data
3138 float *vecPtr = (float*)( srcPtr + (j*4) );
3139 vector float v0, v1, v2, v3;
3140
3141 v0 = vecOld;
3142 v1 = vec_ld( 15, vecPtr );
3143 v2 = vec_ld( 31, vecPtr );
3144 v3 = vec_ld( 47, vecPtr );
3145 vecOld = vec_ld( 63, vecPtr );
3146
3147 vecLd1 = vec_perm( v0, v1, permVec );
3148 vecLd2 = vec_perm( v1, v2, permVec );
3149 vecLd3 = vec_perm( v2, v3, permVec );
3150 vecLd4 = vec_perm( v3, vecOld, permVec );
3151
3152 // each of these vectors contains 2 elements
3153 // looks like | X Y X Y | X Y X Y
3154 v0 = vec_min( vecLd1, vecLd2 );
3155 v1 = vec_min( vecLd3, vecLd4 );
3156 v0 = vec_min( v0, v1 );
3157
3158 v2 = vec_max( vecLd1, vecLd2 );
3159 v3 = vec_max( vecLd3, vecLd4 );
3160 v2 = vec_max( v2, v3 );
3161
3162 // since the layout is always X Y X Y we don't have to re-merge each iteration; we can wait
3163 // until the end
3164 vecMin = vec_min( v0, vecMin );
3165 vecMax = vec_max( v2, vecMax );
3166 }
3167
3168 vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
3169 vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
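		// each vector still holds X Y X Y pairs; folding with an 8-byte rotate leaves the
		// component-wise min/max in lanes 0 and 1, which are then splatted out and stored
		// to min[0]/min[1] and max[0]/max[1]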
3170 v0 = vec_splat( vecMin, 0 );
3171 v1 = vec_splat( vecMin, 1 );
3172 v2 = vec_splat( vecMax, 0 );
3173 v3 = vec_splat( vecMax, 1 );
3174
3175 vec_ste( v0, 0, &min[0] );
3176 vec_ste( v1, 0, &min[1] );
3177 vec_ste( v2, 0, &max[0] );
3178 vec_ste( v3, 0, &max[1] );
3179 }
3180
3181 // cleanup
3182 for ( ; i < count; i++ ) {
3183 v = src[i];
3184
3185 if ( v[0] < min[0] ) {
3186 min[0] = v[0];
3187 }
3188 if ( v[0] > max[0] ) {
3189 max[0] = v[0];
3190 }
3191
3192 if ( v[1] < min[1] ) {
3193 min[1] = v[1];
3194 }
3195 if ( v[1] > max[1] ) {
3196 max[1] = v[1];
3197 }
3198 }
3199 }
3200
3201 /*
3202 ============
3203 idSIMD_AltiVec::MinMax
3204 ============
3205 */
3206 void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
3207 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3208 //#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
3209
3210 int i = 0;
3211 const float *srcPtr = src[0].ToFloatPtr();
3212 idVec3 v;
3213
3214 register vector float vecLd1, vecLd2, vecLd3;
3215 register vector float vecMin, vecMax;
3216 register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
3217 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3218
3219 if ( count >= 4 ) {
3220
3221 vecMin = (vector float)(FLT_MAX);
3222 vecMax = (vector float)(FLT_MIN);
3223
3224 vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
3225 vector float vecOld = vec_ld( 0, srcPtr );
3226
3227 // 4 elements at a time
3228 for ( ; i+3 < count; i += 4 ) {
3229 float *vecPtr = (float*)( srcPtr + (i*3) );
3230 vector float v0, v1, v2;
3231
3232 v0 = vecOld;
3233 v1 = vec_ld( 15, vecPtr );
3234 v2 = vec_ld( 31, vecPtr );
3235 vecOld = vec_ld( 47, vecPtr );
3236
3237 vecLd1 = vec_perm( v0, v1, permVec );
3238 vecLd2 = vec_perm( v1, v2, permVec );
3239 vecLd3 = vec_perm( v2, vecOld, permVec );
3240
3241 // put each idVec3 into its own vector as X Y Z with a junk fourth lane
3242 vecSrc1 = vecLd1;
3243 vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
3244 vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
3245 vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
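		// vec_sld shifts the concatenation of its two operands left by N bytes, so these three
		// shifts peel the four packed idVec3s out of three quadwords: each vecSrcN ends up as
		// { x y z junk } for one element, and only lanes 0-2 are meaningful in the min/max below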
3246
3247 // do min and max
3248 vecMin1 = vec_min( vecSrc1, vecSrc2 );
3249 vecMin2 = vec_min( vecSrc3, vecSrc4 );
3250 vecMin1 = vec_min( vecMin1, vecMin2 );
3251 vecMin = vec_min( vecMin, vecMin1 );
3252
3253 vecMax1 = vec_max( vecSrc1, vecSrc2 );
3254 vecMax2 = vec_max( vecSrc3, vecSrc4 );
3255 vecMax1 = vec_max( vecMax1, vecMax2 );
3256 vecMax = vec_max( vecMax1, vecMax );
3257 }
3258
3259 // store results
3260 vector float v0, v1, v2, v3, v4, v5;
3261 v0 = vec_splat( vecMin, 0 );
3262 v1 = vec_splat( vecMin, 1 );
3263 v2 = vec_splat( vecMin, 2 );
3264 v3 = vec_splat( vecMax, 0 );
3265 v4 = vec_splat( vecMax, 1 );
3266 v5 = vec_splat( vecMax, 2 );
3267
3268 vec_ste( v0, 0, &min[0] );
3269 vec_ste( v1, 0, &min[1] );
3270 vec_ste( v2, 0, &min[2] );
3271 vec_ste( v3, 0, &max[0] );
3272 vec_ste( v4, 0, &max[1] );
3273 vec_ste( v5, 0, &max[2] );
3274 }
3275
3276 // cleanup
3277 for ( ; i < count; i ++ ) {
3278 v = src[i];
3279
3280 if ( v[0] < min[0] ) {
3281 min[0] = v[0];
3282 }
3283 if ( v[0] > max[0] ) {
3284 max[0] = v[0];
3285 }
3286 if ( v[1] < min[1] ) {
3287 min[1] = v[1];
3288 }
3289 if ( v[1] > max[1] ) {
3290 max[1] = v[1];
3291 }
3292 if ( v[2] < min[2] ) {
3293 min[2] = v[2];
3294 }
3295 if ( v[2] > max[2] ) {
3296 max[2] = v[2];
3297 }
3298 }
3299 }
3300
3301 #ifndef DRAWVERT_PADDED
3302 /*
3303 ============
3304 idSIMD_AltiVec::MinMax
3305 ============
3306 */
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3308
3309 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3310 idVec3 v;
3311 int i = 0;
3312 register vector float vecMin, vecMax;
3313
3314 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3315 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3316
3317 if ( count >= 4 ) {
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);
3320
3321 vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3322 vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3323 vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3324 vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
3325
3326 for ( ; i+3 < count; i += 4) {
3327 const float *vertPtr = src[i].xyz.ToFloatPtr();
3328 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3329 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3330 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3331
3332 v0 = vec_ld( 0, vertPtr );
3333 v1 = vec_ld( 11, vertPtr );
3334 v2 = vec_ld( 0, vertPtr2 );
3335 v3 = vec_ld( 11, vertPtr2 );
3336 v4 = vec_ld( 0, vertPtr3 );
3337 v5 = vec_ld( 11, vertPtr3 );
3338 v6 = vec_ld( 0, vertPtr4 );
3339 v7 = vec_ld( 11, vertPtr4 );
3340
3341 v0 = vec_perm( v0, v1, vertPerm1 );
3342 v2 = vec_perm( v2, v3, vertPerm2 );
3343 v4 = vec_perm( v4, v5, vertPerm3 );
3344 v6 = vec_perm( v6, v7, vertPerm4 );
3345
3346 vecMin1 = vec_min( v0, v2 );
3347 vecMin2 = vec_min( v4, v6 );
3348 vecMin1 = vec_min( vecMin1, vecMin2 );
3349 vecMin = vec_min( vecMin, vecMin1 );
3350
3351 vecMax1 = vec_max( v0, v2 );
3352 vecMax2 = vec_max( v4, v6 );
3353 vecMax1 = vec_max( vecMax1, vecMax2 );
3354 vecMax = vec_max( vecMax, vecMax1 );
3355 }
3356
3357 // now we have min/max vectors in X Y Z form, store out
3358 v0 = vec_splat( vecMin, 0 );
3359 v1 = vec_splat( vecMin, 1 );
3360 v2 = vec_splat( vecMin, 2 );
3361 v3 = vec_splat( vecMax, 0 );
3362 v4 = vec_splat( vecMax, 1 );
3363 v5 = vec_splat( vecMax, 2 );
3364
3365 vec_ste( v0, 0, &min[0] );
3366 vec_ste( v1, 0, &min[1] );
3367 vec_ste( v2, 0, &min[2] );
3368 vec_ste( v3, 0, &max[0] );
3369 vec_ste( v4, 0, &max[1] );
3370 vec_ste( v5, 0, &max[2] );
3371 }
3372
3373 // cleanup
3374 for ( ; i < count; i++ ) {
3375 v = src[i].xyz;
3376
3377 if ( v[0] < min[0] ) {
3378 min[0] = v[0];
3379 }
3380 if ( v[0] > max[0] ) {
3381 max[0] = v[0];
3382 }
3383
3384 if ( v[1] < min[1] ) {
3385 min[1] = v[1];
3386 }
3387 if ( v[1] > max[1] ) {
3388 max[1] = v[1];
3389 }
3390
3391 if ( v[2] > max[2] ) {
3392 max[2] = v[2];
3393 }
3394
3395 if ( v[2] < min[2] ) {
3396 min[2] = v[2];
3397 }
3398 }
3399 }
3400 #else
3401 /*
3402 ============
3403 idSIMD_AltiVec::MinMax
3404 ============
3405 */
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
3407
3408 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3409 idVec3 v;
3410 int i = 0;
3411 register vector float vecMin, vecMax;
3412
3413 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3414 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3415
3416 if ( count >= 4 ) {
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);
3419
3420 for ( ; i+3 < count; i += 4) {
3421 const float *vertPtr = src[i].xyz.ToFloatPtr();
3422 const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
3423 const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
3424 const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
3425
3426 v0 = vec_ld( 0, vertPtr );
3427 v2 = vec_ld( 0, vertPtr2 );
3428 v4 = vec_ld( 0, vertPtr3 );
3429 v6 = vec_ld( 0, vertPtr4 );
3430
3431 vecMin1 = vec_min( v0, v2 );
3432 vecMin2 = vec_min( v4, v6 );
3433 vecMin1 = vec_min( vecMin1, vecMin2 );
3434 vecMin = vec_min( vecMin, vecMin1 );
3435
3436 vecMax1 = vec_max( v0, v2 );
3437 vecMax2 = vec_max( v4, v6 );
3438 vecMax1 = vec_max( vecMax1, vecMax2 );
3439 vecMax = vec_max( vecMax, vecMax1 );
3440 }
3441
3442 // now we have min/max vectors in X Y Z form, store out
3443 v0 = vec_splat( vecMin, 0 );
3444 v1 = vec_splat( vecMin, 1 );
3445 v2 = vec_splat( vecMin, 2 );
3446 v3 = vec_splat( vecMax, 0 );
3447 v4 = vec_splat( vecMax, 1 );
3448 v5 = vec_splat( vecMax, 2 );
3449
3450 vec_ste( v0, 0, &min[0] );
3451 vec_ste( v1, 0, &min[1] );
3452 vec_ste( v2, 0, &min[2] );
3453 vec_ste( v3, 0, &max[0] );
3454 vec_ste( v4, 0, &max[1] );
3455 vec_ste( v5, 0, &max[2] );
3456 }
3457
3458 // cleanup
3459 for ( ; i < count; i++ ) {
3460 v = src[i].xyz;
3461
3462 if ( v[0] < min[0] ) {
3463 min[0] = v[0];
3464 }
3465 if ( v[0] > max[0] ) {
3466 max[0] = v[0];
3467 }
3468
3469 if ( v[1] < min[1] ) {
3470 min[1] = v[1];
3471 }
3472 if ( v[1] > max[1] ) {
3473 max[1] = v[1];
3474 }
3475
3476 if ( v[2] > max[2] ) {
3477 max[2] = v[2];
3478 }
3479
3480 if ( v[2] < min[2] ) {
3481 min[2] = v[2];
3482 }
3483 }
3484 }
3485
3486 #endif /* DRAWVERT_PADDED */
3487
3488 #ifndef DRAWVERT_PADDED
3489 /*
3490 ============
3491 idSIMD_AltiVec::MinMax
3492 ============
3493 */
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3495 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3496
3497 idVec3 v;
3498 int i = 0;
3499
3500 register vector float vecMin, vecMax;
3501
3502 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3503 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3504
3505 if ( count >= 4 ) {
3506
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);
3509
3510 vector unsigned char vertPerm1;
3511 vector unsigned char vertPerm2;
3512 vector unsigned char vertPerm3;
3513 vector unsigned char vertPerm4;
3514
3515 for ( ; i+3 < count; i += 4) {
3516 const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3517 const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3518 const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3519 const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3520
3521 vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
3522 vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
3523 vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
3524 vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
3525
3526 v0 = vec_ld( 0, vertPtr );
3527 v1 = vec_ld( 15, vertPtr );
3528 v2 = vec_ld( 0, vertPtr2 );
3529 v3 = vec_ld( 15, vertPtr2 );
3530 v4 = vec_ld( 0, vertPtr3 );
3531 v5 = vec_ld( 15, vertPtr3 );
3532 v6 = vec_ld( 0, vertPtr4 );
3533 v7 = vec_ld( 15, vertPtr4 );
3534
3535 v0 = vec_perm( v0, v1, vertPerm1 );
3536 v2 = vec_perm( v2, v3, vertPerm2 );
3537 v4 = vec_perm( v4, v5, vertPerm3 );
3538 v6 = vec_perm( v6, v7, vertPerm4 );
3539
3540 vecMin1 = vec_min( v0, v2 );
3541 vecMin2 = vec_min( v4, v6 );
3542 vecMin1 = vec_min( vecMin1, vecMin2 );
3543 vecMin = vec_min( vecMin, vecMin1 );
3544
3545 vecMax1 = vec_max( v0, v2 );
3546 vecMax2 = vec_max( v4, v6 );
3547 vecMax1 = vec_max( vecMax1, vecMax2 );
3548 vecMax = vec_max( vecMax, vecMax1 );
3549 }
3550
3551 // now we have min/max vectors in X Y Z form, store out
3552 v0 = vec_splat( vecMin, 0 );
3553 v1 = vec_splat( vecMin, 1 );
3554 v2 = vec_splat( vecMin, 2 );
3555 v3 = vec_splat( vecMax, 0 );
3556 v4 = vec_splat( vecMax, 1 );
3557 v5 = vec_splat( vecMax, 2 );
3558
3559 vec_ste( v0, 0, &min[0] );
3560 vec_ste( v1, 0, &min[1] );
3561 vec_ste( v2, 0, &min[2] );
3562 vec_ste( v3, 0, &max[0] );
3563 vec_ste( v4, 0, &max[1] );
3564 vec_ste( v5, 0, &max[2] );
3565 }
3566
3567 // cleanup
3568 for ( ; i < count; i++ ) {
3569 v = src[indexes[i]].xyz;
3570
3571 if ( v[0] < min[0] ) {
3572 min[0] = v[0];
3573 }
3574 if ( v[0] > max[0] ) {
3575 max[0] = v[0];
3576 }
3577
3578 if ( v[1] < min[1] ) {
3579 min[1] = v[1];
3580 }
3581 if ( v[1] > max[1] ) {
3582 max[1] = v[1];
3583 }
3584
3585 if ( v[2] > max[2] ) {
3586 max[2] = v[2];
3587 }
3588
3589 if ( v[2] < min[2] ) {
3590 min[2] = v[2];
3591 }
3592 }
3593 }
3594 #else
3595 /*
3596 ============
3597 idSIMD_AltiVec::MinMax
3598 ============
3599 */
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
3601 min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
3602
3603 idVec3 v;
3604 int i = 0;
3605
3606 register vector float vecMin, vecMax;
3607
3608 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
3609 register vector float vecMin1, vecMin2, vecMax1, vecMax2;
3610
3611 if ( count >= 4 ) {
3612
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);
3615
3616 vector unsigned char vertPerm1;
3617 vector unsigned char vertPerm2;
3618 vector unsigned char vertPerm3;
3619 vector unsigned char vertPerm4;
3620
3621 for ( ; i+3 < count; i += 4) {
3622 const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
3623 const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
3624 const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
3625 const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
3626
3627 v0 = vec_ld( 0, vertPtr );
3628 v2 = vec_ld( 0, vertPtr2 );
3629 v4 = vec_ld( 0, vertPtr3 );
3630 v6 = vec_ld( 0, vertPtr4 );
3631
3632 vecMin1 = vec_min( v0, v2 );
3633 vecMin2 = vec_min( v4, v6 );
3634 vecMin1 = vec_min( vecMin1, vecMin2 );
3635 vecMin = vec_min( vecMin, vecMin1 );
3636
3637 vecMax1 = vec_max( v0, v2 );
3638 vecMax2 = vec_max( v4, v6 );
3639 vecMax1 = vec_max( vecMax1, vecMax2 );
3640 vecMax = vec_max( vecMax, vecMax1 );
3641 }
3642
3643 // now we have min/max vectors in X Y Z form, store out
3644 v0 = vec_splat( vecMin, 0 );
3645 v1 = vec_splat( vecMin, 1 );
3646 v2 = vec_splat( vecMin, 2 );
3647 v3 = vec_splat( vecMax, 0 );
3648 v4 = vec_splat( vecMax, 1 );
3649 v5 = vec_splat( vecMax, 2 );
3650
3651 vec_ste( v0, 0, &min[0] );
3652 vec_ste( v1, 0, &min[1] );
3653 vec_ste( v2, 0, &min[2] );
3654 vec_ste( v3, 0, &max[0] );
3655 vec_ste( v4, 0, &max[1] );
3656 vec_ste( v5, 0, &max[2] );
3657 }
3658
3659 // cleanup
3660 for ( ; i < count; i++ ) {
3661 v = src[indexes[i]].xyz;
3662
3663 if ( v[0] < min[0] ) {
3664 min[0] = v[0];
3665 }
3666 if ( v[0] > max[0] ) {
3667 max[0] = v[0];
3668 }
3669
3670 if ( v[1] < min[1] ) {
3671 min[1] = v[1];
3672 }
3673 if ( v[1] > max[1] ) {
3674 max[1] = v[1];
3675 }
3676
3677 if ( v[2] > max[2] ) {
3678 max[2] = v[2];
3679 }
3680
3681 if ( v[2] < min[2] ) {
3682 min[2] = v[2];
3683 }
3684 }
3685 }
3686
3687
3688 #endif /* DRAWVERT_PADDED */
3689
3690 #endif /* ENABLE_MINMAX */
3691
3692 #ifdef ENABLE_CLAMP
3693
3694 /*
3695 ============
3696 idSIMD_AltiVec::Clamp
3697 ============
3698 */
void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
3700 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
3701 register vector float v0, v1, v2, v3, v4, v5;
3702 register vector unsigned char permVec;
3703 register vector float v0_low, v0_hi, v1_low, v1_hi;
3704 vector unsigned char oneVector = (vector unsigned char)(1);
3705 register vector float minVec, maxVec;
3706 int i = 0;
3707
3708 //handle unaligned at start
3709 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3710 dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3711 }
3712
3713 //splat min/max into a vector
3714 minVec = loadSplatUnalignedScalar( &min );
3715 maxVec = loadSplatUnalignedScalar( &max );
3716
3717 //calculate permute and do first load
3718 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3719 v1_hi = vec_ld( 0, &src[i] );
3720
3721
3722 //vectorize!
3723 for ( ; i+7 < count; i += 8 ) {
3724 //load source
3725 v0_low = v1_hi;
3726 v0_hi = vec_ld( 15, &src[i] );
3727 v1_low = v0_hi;
3728 v1_hi = vec_ld( 31, &src[i] );
3729
3730 v0 = vec_perm( v0_low, v0_hi, permVec );
3731 v1 = vec_perm( v1_low, v1_hi, permVec );
3732
3733 //apply minimum
3734 v2 = vec_max( v0, minVec );
3735 v3 = vec_max( v1, minVec );
3736
3737 //apply maximum
3738 v4 = vec_min( v2, maxVec );
3739 v5 = vec_min( v3, maxVec );
3740
3741 ALIGNED_STORE2( &dst[i], v4, v5 );
3742 }
3743
3744 //handle cleanup
3745 for ( ; i < count ; i++ ) {
3746 dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
3747 }
3748 }
3749
3750 /*
3751 ============
3752 idSIMD_AltiVec::ClampMin
3753 ============
3754 */
void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
3756 //#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
3757 register vector float v0, v1, v2, v3;
3758 register vector unsigned char permVec;
3759 register vector float v0_low, v0_hi, v1_low, v1_hi;
3760 register vector float constVec;
3761 vector unsigned char oneVector = (vector unsigned char)(1);
3762 int i = 0;
3763
3764 //handle unaligned at start
3765 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
3766 dst[i] = src[i] < min ? min : src[i];
3767 }
3768
3769 //splat constant into a vector
3770 constVec = loadSplatUnalignedScalar( &min );
3771
3772 //calculate permute and do first load
3773 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3774 v1_hi = vec_ld( 0, &src[i] );
3775
3776 //vectorize!
3777 for ( ; i+7 < count; i += 8 ) {
3778 //load source
3779 v0_low = v1_hi;
3780 v0_hi = vec_ld( 15, &src[i] );
3781 v1_low = v0_hi;
3782 v1_hi = vec_ld( 31, &src[i] );
3783
3784 v0 = vec_perm( v0_low, v0_hi, permVec );
3785 v1 = vec_perm( v1_low, v1_hi, permVec );
3786
3787 v2 = vec_max( v0, constVec );
3788 v3 = vec_max( v1, constVec );
3789
3790 ALIGNED_STORE2( &dst[i], v2, v3 );
3791 }
3792
3793 //handle cleanup
3794 for ( ; i < count ; i++ ) {
3795 dst[i] = src[i] < min ? min : src[i];
3796 }
3797 }
3798
3799 /*
3800 ============
3801 idSIMD_AltiVec::ClampMax
3802 ============
3803 */
void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
3805 //#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
3806 register vector float v0, v1, v2, v3;
3807 register vector unsigned char permVec;
3808 register vector float constVec;
3809 register vector float v0_low, v0_hi, v1_low, v1_hi;
3810 vector unsigned char oneVector = (vector unsigned char)(1);
3811 int i = 0;
3812
3813 //handle unaligned at start
3814 for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src[i] > max ? max : src[i];
3816 }
3817
3818 //splat constant into a vector
3819 constVec = loadSplatUnalignedScalar( &max );
3820
3821 //calculate permute and do first load
3822 permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
3823 v1_hi = vec_ld( 0, &src[i] );
3824
3825 //vectorize!
3826 for ( ; i+7 < count; i += 8 ) {
3827 //load source
3828 v0_low = v1_hi;
3829 v0_hi = vec_ld( 15, &src[i] );
3830 v1_low = v0_hi;
3831 v1_hi = vec_ld( 31, &src[i] );
3832
3833 v0 = vec_perm( v0_low, v0_hi, permVec );
3834 v1 = vec_perm( v1_low, v1_hi, permVec );
3835 v2 = vec_min( v0, constVec );
3836 v3 = vec_min( v1, constVec );
3837
3838 ALIGNED_STORE2( &dst[i], v2, v3 );
3839 }
3840
3841 //handle cleanup
3842 for ( ; i < count ; i++ ) {
		dst[i] = src[i] > max ? max : src[i];
3844 }
3845 }
3846
3847 #endif /* ENABLE_CLAMP */
3848
3849 #ifdef ENABLE_16ROUTINES
3850
3851 /*
3852 ============
3853 idSIMD_AltiVec::Zero16
3854 ============
3855 */
void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
3857 memset( dst, 0, count * sizeof( float ) );
3858 }
3859
3860 /*
3861 ============
3862 idSIMD_AltiVec::Negate16
3863
3864 Assumptions:
3865 dst is aligned
3866 ============
3867 */
void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
3869 //#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
3870
3871 // dst is aligned
3872 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3873
	// round count up to the next multiple of 4 if need be
3875 int count2 = ( count + 3 ) & ~3;
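	// Added example: ( count + 3 ) & ~3 rounds count up to the next multiple of 4
	// ( 5 -> 8, 8 -> 8 ), which assumes the destination buffer is padded out to a
	// multiple of four floats, since the rounded-up elements are read and written.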
3876
3877 int i = 0;
3878 vector float v0, v1, v2, v3;
3879
	// we know it's 16-byte aligned
3881 for ( ; i + 7 < count2; i += 8 ) {
3882 v0 = vec_ld( 0, &dst[i] );
3883 v1 = vec_ld( 16, &dst[i] );
3884
3885 v2 = vec_sub( (vector float)(0), v0 );
3886 v3 = vec_sub( (vector float)(0), v1 );
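		// Added note: vec_sub( 0, x ) negates each lane; for ordinary values this matches the
		// sign-bit flip in the scalar OPER() macro above (it differs only for +/-0 and NaNs).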
3887
3888 ALIGNED_STORE2( &dst[i], v2, v3 );
3889 }
3890
3891 for ( ; i < count2; i += 4 ) {
3892 v0 = vec_ld( 0, &dst[i] );
3893 v1 = vec_sub( (vector float)(0), v0 );
3894 vec_st( v1, 0, &dst[i] );
3895 }
3896 }
3897
3898 /*
3899 ============
3900 idSIMD_AltiVec::Copy16
3901 ============
3902 */
void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
3904 //#define OPER(X) dst[(X)] = src[(X)]
3905 memcpy( dst, src, sizeof(float) * count );
3906 }
3907
3908 /*
3909 ============
3910 idSIMD_AltiVec::Add16
3911
3912 Assumptions:
3913 Assumes dst, src1, src2 all start at aligned address
3914 ============
3915 */
void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
3917 //#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
3918
3919 // dst is aligned
3920 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3921 // src1 is aligned
3922 assert( IS_16BYTE_ALIGNED( src1[0] ) );
3923 // src2 is aligned
3924 assert( IS_16BYTE_ALIGNED( src2[0] ) );
3925
	// round count up to the next multiple of 4 if need be
3927 int count2 = ( count + 3 ) & ~3;
3928
3929 register vector float v0, v1, v2, v3, v4, v5;
3930 int i = 0;
3931
3932 //know all data is 16-byte aligned, so vectorize!
3933 for ( ; i+7 < count2; i += 8 ) {
3934 //load sources
3935 v0 = vec_ld( 0, &src1[i] );
3936 v1 = vec_ld( 16, &src1[i] );
3937 v2 = vec_ld( 0, &src2[i] );
3938 v3 = vec_ld( 16, &src2[i] );
3939 v4 = vec_add( v0, v2 );
3940 v5 = vec_add( v1, v3 );
3941
3942 ALIGNED_STORE2( &dst[i], v4, v5 );
3943 }
3944
3945 for ( ; i < count2; i += 4 ) {
3946 v0 = vec_ld( 0, &src1[i] );
3947 v1 = vec_ld( 0, &src2[i] );
3948 v2 = vec_add( v0, v1 );
3949 vec_st( v2, 0, &dst[i] );
3950 }
3951 }
3952
3953 /*
3954 ============
3955 idSIMD_AltiVec::Sub16
3956
3957 Assumptions:
3958 Assumes that dst, src1, and src2 all start at aligned address
3959 ============
3960 */
void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
3962 //#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
3963 // dst is aligned
3964 assert( IS_16BYTE_ALIGNED( dst[0] ) );
3965 // src1 is aligned
3966 assert( IS_16BYTE_ALIGNED( src1[0] ) );
3967 // src2 is aligned
3968 assert( IS_16BYTE_ALIGNED( src2[0] ) );
3969
	// round count up to the next multiple of 4 if need be
3971 int count2 = ( count + 3 ) & ~3;
3972
3973 register vector float v0, v1, v2, v3, v4, v5;
3974 int i = 0;
3975
3976 //know data is aligned, so vectorize!
3977 for ( ; i+7 < count2; i += 8 ) {
3978 //load sources
3979 v0 = vec_ld( 0, &src1[i] );
3980 v1 = vec_ld( 16, &src1[i] );
3981 v2 = vec_ld( 0, &src2[i] );
3982 v3 = vec_ld( 16, &src2[i] );
3983 v4 = vec_sub( v0, v2 );
3984 v5 = vec_sub( v1, v3 );
3985
3986 ALIGNED_STORE2( &dst[i], v4, v5 );
3987 }
3988
3989 for ( ; i < count2; i += 4 ) {
3990 v0 = vec_ld( 0, &src1[i] );
3991 v1 = vec_ld( 0, &src2[i] );
3992 v2 = vec_sub( v0, v1 );
3993 vec_st( v2, 0, &dst[i] );
3994 }
3995 }
3996
3997 /*
3998 ============
3999 idSIMD_AltiVec::Mul16
4000
4001 Assumptions:
4002 Assumes that dst and src1 start at aligned address
4003 ============
4004 */
void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
4006 //#define OPER(X) dst[(X)] = src1[(X)] * constant
4007
4008 // dst is aligned
4009 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4010 // src1 is aligned
4011 assert( IS_16BYTE_ALIGNED( src1[0] ) );
4012
	// round count up to the next multiple of 4 if need be
4014 int count2 = ( count + 3 ) & ~3;
4015
4016 register vector float v0, v1, v2, v3;
4017 register vector float constVec;
4018 register vector float zeroVector = (vector float)(0.0);
4019 int i = 0;
4020
4021 //splat constant into a vector
4022 constVec = loadSplatUnalignedScalar( &constant );
4023
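	// Added note: classic AltiVec has no stand-alone vector float multiply, so
	// vec_madd( a, b, zeroVector ) with a zero addend is the idiomatic a * b; the same
	// trick appears in the other scaled routines in this file.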
4024 //know data is aligned, so vectorize!
4025 for ( ; i+7 < count2; i += 8 ) {
4026 //load source
4027 v0 = vec_ld( 0, &src1[i] );
4028 v1 = vec_ld( 16, &src1[i] );
4029 v2 = vec_madd( constVec, v0, zeroVector );
4030 v3 = vec_madd( constVec, v1, zeroVector );
4031 ALIGNED_STORE2( &dst[i], v2, v3 );
4032 }
4033
4034 for ( ; i < count2; i += 4 ) {
4035 v0 = vec_ld( 0, &src1[i] );
4036 v1 = vec_madd( constVec, v0, zeroVector );
4037 vec_st( v1, 0, &dst[i] );
4038 }
4039 }
4040
4041 /*
4042 ============
4043 idSIMD_AltiVec::AddAssign16
4044
4045 Assumptions:
4046 Assumes that dst and src start at aligned address
4047 ============
4048 */
void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
4050 //#define OPER(X) dst[(X)] += src[(X)]
4051
4052 // dst is aligned
4053 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4054 // src is aligned
4055 assert( IS_16BYTE_ALIGNED( src[0] ) );
4056
	// round count up to the next multiple of 4 if need be
4058 int count2 = ( count + 3 ) & ~3;
4059
4060 register vector float v0, v1, v2, v3, v4, v5;
4061 int i = 0;
4062
4063 //vectorize!
4064 for ( ; i+7 < count2; i += 8 ) {
4065 v0 = vec_ld( 0, &src[i] );
4066 v1 = vec_ld( 16, &src[i] );
4067 v2 = vec_ld( 0, &dst[i] );
4068 v3 = vec_ld( 16, &dst[i] );
4069 v4 = vec_add( v0, v2 );
4070 v5 = vec_add( v1, v3 );
4071 ALIGNED_STORE2( &dst[i], v4, v5 );
4072 }
4073
4074 for ( ; i < count2; i += 4 ) {
4075 v0 = vec_ld( 0, &src[i] );
4076 v1 = vec_ld( 0, &dst[i] );
4077 v2 = vec_add( v0, v1 );
4078 vec_st( v2, 0, &dst[i] );
4079 }
4080 }
4081
4082 /*
4083 ============
4084 idSIMD_AltiVec::SubAssign16
4085
4086 Assumptions:
4087 Assumes that dst and src start at aligned address
4088 ============
4089 */
void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
4091 //#define OPER(X) dst[(X)] -= src[(X)]
4092 register vector float v0, v1, v2, v3, v4, v5;
4093 int i=0;
4094
4095 // dst is aligned
4096 assert( IS_16BYTE_ALIGNED( dst[0] ) );
4097 // src is aligned
4098 assert( IS_16BYTE_ALIGNED( src[0] ) );
	// round count up to the next multiple of 4 if need be
4100 int count2 = ( count + 3 ) & ~3;
4101
4102 //vectorize!
4103 for ( ; i+7 < count2; i += 8 ) {
4104 v0 = vec_ld( 0, &src[i] );
4105 v1 = vec_ld( 16, &src[i] );
4106 v2 = vec_ld( 0, &dst[i] );
4107 v3 = vec_ld( 16, &dst[i] );
4108 v4 = vec_sub( v2, v0 );
4109 v5 = vec_sub( v3, v1 );
4110 ALIGNED_STORE2( &dst[i], v4, v5 );
4111 }
4112
4113 for ( ; i < count2; i += 4 ) {
4114 v0 = vec_ld( 0, &src[i] );
4115 v1 = vec_ld( 0, &dst[i] );
4116 v2 = vec_sub( v1, v0 );
4117 vec_st( v2, 0, &dst[i] );
4118 }
4119 }
4120
4121 /*
4122 ============
4123 idSIMD_AltiVec::MulAssign16
4124
4125 Assumptions:
4126 Assumes that dst starts at aligned address and count is multiple of 4
4127 ============
4128 */
void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
4130 //#define OPER(X) dst[(X)] *= constant
4131
4132 // dst is aligned
4133 assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// round count up to the next multiple of 4 if need be
4135 int count2 = ( count + 3 ) & ~3;
4136
4137 register vector float v0, v1, v2, v3;
4138 register vector float constVec;
4139 int i = 0;
4140 register vector float zeroVector = (vector float)(0.0);
4141
4142 //splat constant into a vector
4143 constVec = loadSplatUnalignedScalar( &constant );
4144
4145 //vectorize!
4146 for ( ; i+7 < count2; i += 8 ) {
4147 v0 = vec_ld( 0, &dst[i] );
4148 v1 = vec_ld( 16, &dst[i] );
4149 v2 = vec_madd( v0, constVec, zeroVector );
4150 v3 = vec_madd( v1, constVec, zeroVector );
4151 ALIGNED_STORE2( &dst[i], v2, v3 );
4152 }
4153
4154 for ( ; i < count2; i += 4 ) {
4155 v0 = vec_ld( 0, &dst[i] );
4156 v1 = vec_madd( v0, constVec, zeroVector );
4157 vec_st( v1, 0, &dst[i] );
4158 }
4159 }
4160
4161 #endif /* ENABLE_16ROUTINES */
4162
4163 #ifdef ENABLE_LOWER_TRIANGULAR
4164
4165 /*
4166 ============
4167 idSIMD_AltiVec::MatX_LowerTriangularSolve
4168
4169 solves x in L * x = b for the first n rows of L
4170 if skip > 0 the first skip elements of x are assumed to be valid already
4171 L has to be a lower triangular matrix with (implicit) ones on the diagonal
4172 x == b is allowed
4173 ============
4174 */
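/*
	Added reference (illustration only): the scalar algorithm being vectorized here is plain
	forward substitution with an implicit unit diagonal:

		for ( int i = skip; i < n; i++ ) {
			float sum = b[i];
			for ( int j = 0; j < i; j++ ) {
				sum -= L[i][j] * x[j];
			}
			x[i] = sum;		// no divide, the diagonal is implicitly 1
		}

	The AltiVec path below works four rows at a time and accumulates the dot products with
	vec_madd, which is why the running sums have to be reduced across vector lanes before the
	final subtractions.
*/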
4175
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
4177
4178 int i, j;
4179 const float *lptr;
4180 const float *lptr2;
4181 const float *lptr3;
4182 const float *lptr4;
4183 float sum;
4184 float sum2;
4185 float sum3;
4186 float sum4;
4187 float tempSum;
4188 float tempSum2;
4189 float tempSum3;
4190 float tempSum4;
4191 vector float vecSum1 = (vector float)(0.0);
4192 vector float vecSum2 = (vector float)(0.0);
4193 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
4194 vector float zeroVector = (vector float)(0.0);
4195 vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
4196
4197 vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
4198
4199 // unrolled this loop a bit
4200 for ( i = skip; i+3 < n; i+=4 ) {
4201 sum = b[i];
4202 sum2 = b[i+1];
4203 sum3 = b[i+2];
4204 sum4 = b[i+3];
4205
4206 vecSum1 = zeroVector;
4207 vecSum2 = zeroVector;
4208 vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
4209 lptr = L[i];
4210 lptr2 = L[i+1];
4211 lptr3 = L[i+2];
4212 lptr4 = L[i+3];
4213
4214 vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4215 vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
4216 vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
4217 vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
4218
4219 for ( j = 0 ; j+7 < i; j+=8 ) {
4220
4221 v0 = vec_ld( 0, &x[j] );
4222 v1 = vec_ld( 15, &x[j] );
4223 vector float vecExtraX = vec_ld( 31, &x[j] );
4224 v0 = vec_perm( v0, v1, vecPermX );
4225 v1 = vec_perm( v1, vecExtraX, vecPermX );
4226
4227 v2 = vec_ld( 0, lptr + j );
4228 v3 = vec_ld( 15, lptr + j );
4229 vector float vecExtra1 = vec_ld( 31, lptr + j );
4230 v2 = vec_perm( v2, v3, vecPermLptr1 );
4231 v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
4232
4233 v4 = vec_ld( 0, lptr2 + j );
4234 v5 = vec_ld( 15, lptr2 + j );
4235 vector float vecExtra2 = vec_ld( 31, lptr2 + j );
4236 v4 = vec_perm( v4, v5, vecPermLptr2 );
4237 v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
4238
4239 v6 = vec_ld( 0, lptr3 + j );
4240 v7 = vec_ld( 15, lptr3 + j );
4241 vector float vecExtra3 = vec_ld( 31, lptr3 + j );
4242 v6 = vec_perm( v6, v7, vecPermLptr3 );
4243 v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
4244
4245 v8 = vec_ld( 0, lptr4 + j );
4246 v9 = vec_ld( 15, lptr4 + j );
4247 vector float vecExtra4 = vec_ld( 31, lptr4 + j );
4248 v8 = vec_perm( v8, v9, vecPermLptr4 );
4249 v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
4250
4251 vecSum1 = vec_madd( v2, v0, vecSum1 );
4252 vecSum2 = vec_madd( v3, v1, vecSum2 );
4253
4254 vecSum3 = vec_madd( v4, v0, vecSum3 );
4255 vecSum4 = vec_madd( v5, v1, vecSum4 );
4256
4257 vecSum5 = vec_madd( v6, v0, vecSum5 );
4258 vecSum6 = vec_madd( v7, v1, vecSum6 );
4259
4260 vecSum7 = vec_madd( v8, v0, vecSum7 );
4261 vecSum8 = vec_madd( v9, v1, vecSum8 );
4262 }
4263
		// if we ran the unrolled code, we need to sum across the vectors
		// to find out how much to subtract from sum
4266 if ( j > 0 ) {
4267 vecSum1 = vec_add( vecSum1, vecSum2 );
4268 vecSum3 = vec_add( vecSum3, vecSum4 );
4269 vecSum5 = vec_add( vecSum5, vecSum6 );
4270 vecSum7 = vec_add( vecSum7, vecSum8 );
			// sum across the vectors
4272 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4273 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4274
4275 vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
4276 vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
4277
4278 vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
4279 vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
4280
4281 vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
4282 vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
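			// Added illustration: vec_sld( v, v, 8 ) rotates the vector by two floats, so the
			// first add leaves a0+a2 and a1+a3 paired up in the lanes; the rotate-by-4 add then
			// puts the full sum a0+a1+a2+a3 in every lane, ready for vec_splat/vec_ste below.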
4283
4284 //move the result to the FPU
4285 vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4286 vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
4287 vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
4288 vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
4289
4290 sum -= tempSum;
4291 sum2 -= tempSum2;
4292 sum3 -= tempSum3;
4293 sum4 -= tempSum4;
4294 }
4295
4296 //cleanup
4297 for ( ; j < i; j++ ) {
4298 sum -= lptr[j] * x[j];
4299 sum2 -= lptr2[j] * x[j];
4300 sum3 -= lptr3[j] * x[j];
4301 sum4 -= lptr4[j] * x[j];
4302 }
4303
4304 // store the 4 results at a time
4305 sum2 -= ( lptr2[i] * sum );
4306 sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
4307 sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
4308
4309 x[i] = sum;
4310 x[i+1] = sum2;
4311 x[i+2] = sum3;
4312 x[i+3] = sum4;
4313 }
4314
4315 // cleanup
4316 for ( ; i < n; i++ ) {
4317 sum = b[i];
4318 vecSum1 = zeroVector;
4319 vecSum2 = zeroVector;
4320 lptr = L[i];
4321 vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
4322
4323 for ( j = 0 ; j+7 < i; j+=8 ) {
4324
4325 v0 = vec_ld( 0, &x[j] );
4326 v2 = vec_ld( 15, &x[j] );
4327 vector float vecExtraX = vec_ld( 31, &x[j] );
4328 v0 = vec_perm( v0, v2, vecPermX );
4329 v2 = vec_perm( v2, vecExtraX, vecPermX );
4330
4331 v1 = vec_ld( 0, lptr + j );
4332 v3 = vec_ld( 15, lptr + j );
4333 vector float vecExtra = vec_ld( 31, lptr + j );
4334 v1 = vec_perm( v1, v3, vecPermLptr );
4335 v3 = vec_perm( v3, vecExtra, vecPermLptr );
4336
4337 vecSum1 = vec_madd( v1, v0, vecSum1 );
4338 vecSum2 = vec_madd( v3, v2, vecSum2 );
4339 }
4340
		// if we ran the unrolled code, we need to sum across the vectors
		// to find out how much to subtract from sum
4343 if ( j > 0 ) {
			// sum across the vectors
4345 vecSum1 = vec_add( vecSum1, vecSum2 );
4346 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
4347 vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
4348
4349 //move the result to the FPU
4350 vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
4351 sum -= tempSum;
4352 }
4353
4354 //cleanup
4355 for ( ; j < i; j++ ) {
4356 sum -= lptr[j] * x[j];
4357 }
4358 x[i] = sum;
4359 }
4360 }
4361
4362 /*
4363 ============
4364 idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
4365
4366 solves x in L.Transpose() * x = b for the first n rows of L
4367 L has to be a lower triangular matrix with (implicit) ones on the diagonal
4368 x == b is allowed
4369 ============
4370 */
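/*
	Added reference (illustration only): the scalar equivalent is backward substitution against
	the transpose, again with an implicit unit diagonal:

		for ( int i = n - 1; i >= 0; i-- ) {
			float sum = b[i];
			for ( int j = i + 1; j < n; j++ ) {
				sum -= L[j][i] * x[j];		// column i of L == row i of L.Transpose()
			}
			x[i] = sum;
		}

	The unrolled n < 8 cases and the 4x4 blocking below are this loop with the memory accesses
	reorganized for better scheduling.
*/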
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
4372
4373 int nc;
4374 const float *lptr;
4375
4376 lptr = L.ToFloatPtr();
4377 nc = L.GetNumColumns();
4378
4379 float x0, x1, x2, x3, x4, x5, x6;
4380 // unrolled cases for n < 8
4381 if ( n < 8 ) {
4382 switch( n ) {
4383 // using local variables to avoid aliasing issues
4384 case 0:
4385 return;
4386 case 1:
4387 x[0] = b[0];
4388 return;
4389 case 2:
4390 x1 = b[1];
4391 x0 = b[0] - lptr[1*nc+0] * x1;
4392
4393 x[1] = x1;
4394 x[0] = x0;
4395 return;
4396 case 3:
4397 x2 = b[2];
4398 x1 = b[1] - lptr[2*nc+1] * x2;
4399 x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4400
4401 x[2] = x2;
4402 x[1] = x1;
4403 x[0] = x0;
4404 return;
4405 case 4:
4406 x3 = b[3];
4407 x2 = b[2] - lptr[3*nc+2] * x3;
4408 x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4409 x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4410
4411 x[3] = x3;
4412 x[2] = x2;
4413 x[1] = x1;
4414 x[0] = x0;
4415
4416 return;
4417 case 5:
4418 x4 = b[4];
4419 x3 = b[3] - lptr[4*nc+3] * x4;
4420 x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4421 x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4422 x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4423
4424 x[4] = x4;
4425 x[3] = x3;
4426 x[2] = x2;
4427 x[1] = x1;
4428 x[0] = x0;
4429 return;
4430 case 6:
4431 x5 = b[5];
4432 x4 = b[4] - lptr[5*nc+4] * x5;
4433 x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4434 x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4435 x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4436 x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4437
4438 x[5] = x5;
4439 x[4] = x4;
4440 x[3] = x3;
4441 x[2] = x2;
4442 x[1] = x1;
4443 x[0] = x0;
4444
4445 return;
4446 case 7:
4447 x6 = b[6];
4448 x5 = b[5] - lptr[6*nc+5] * x6;
4449 x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
4450 x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
4451 x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
4452 x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
4453 x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
4454
4455 x[6] = x6;
4456 x[5] = x5;
4457 x[4] = x4;
4458 x[3] = x3;
4459 x[2] = x2;
4460 x[1] = x1;
4461 x[0] = x0;
4462 return;
4463 }
4464 return;
4465 }
4466
4467 int i, j;
4468 register float s0, s1, s2, s3;
4469 float *xptr;
4470
4471 lptr = L.ToFloatPtr() + n * nc + n - 4;
4472 xptr = x + n;
4473
4474 // process 4 rows at a time
4475 for ( i = n; i >= 4; i -= 4 ) {
4476 s0 = b[i-4];
4477 s1 = b[i-3];
4478 s2 = b[i-2];
4479 s3 = b[i-1];
4480 // process 4x4 blocks
4481 for ( j = 0; j < n-i; j += 4 ) {
4482 s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
4483 s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
4484 s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
4485 s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
4486 s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
4487 s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
4488 s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
4489 s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
4490 s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
4491 s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
4492 s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
4493 s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
4494 s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
4495 s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
4496 s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
4497 s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
4498 }
4499 // process left over of the 4 rows
4500 s0 -= lptr[0-1*nc] * s3;
4501 s1 -= lptr[1-1*nc] * s3;
4502 s2 -= lptr[2-1*nc] * s3;
4503 s0 -= lptr[0-2*nc] * s2;
4504 s1 -= lptr[1-2*nc] * s2;
4505 s0 -= lptr[0-3*nc] * s1;
4506 // store result
4507 xptr[-4] = s0;
4508 xptr[-3] = s1;
4509 xptr[-2] = s2;
4510 xptr[-1] = s3;
4511 // update pointers for next four rows
4512 lptr -= 4 + 4 * nc;
4513 xptr -= 4;
4514 }
4515 // process left over rows
4516 for ( i--; i >= 0; i-- ) {
4517 s0 = b[i];
4518 lptr = L[0] + i;
4519 for ( j = i + 1; j < n; j++ ) {
4520 s0 -= lptr[j*nc] * x[j];
4521 }
4522 x[i] = s0;
4523 }
4524 }
4525
4526 /*
4527 ============
4528 idSIMD_AltiVec::MatX_LDLTFactor
4529 ============
4530 */
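/*
	Added note: this routine computes the in-place LDL^T factorization A = L * D * L^T with a
	unit lower triangular L and diagonal D.  For each row i (sketch, not the unrolled code):

		v[j]     = D[j] * L[i][j]                                       for j < i
		D[i]     = A[i][i] - sum( v[j] * L[i][j] )
		L[k][i]  = ( A[k][i] - sum( v[j] * L[k][j] ) ) * ( 1 / D[i] )   for k > i

	invDiag receives 1 / D[i], and the factorization reports failure (returns false) if any
	D[i] comes out exactly zero.
*/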
bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
4532 int i, j, k, nc;
4533 float *v, *diag, *mptr;
4534 float s0, s1, s2, s3, sum, d;
4535 float s0_2, s1_2, s2_2, s3_2, sum_2;
4536 float *mptr2;
4537
4538 v = (float *) _alloca16( n * sizeof( float ) );
4539 diag = (float *) _alloca16( n * sizeof( float ) );
4540
4541 nc = mat.GetNumColumns();
4542
4543 if ( n <= 0 ) {
4544 return true;
4545 }
4546
4547 mptr = mat[0];
4548
4549 sum = mptr[0];
4550
4551 if ( sum == 0.0f ) {
4552 return false;
4553 }
4554
4555 diag[0] = sum;
4556 invDiag[0] = d = 1.0f / sum;
4557
4558 if ( n <= 1 ) {
4559 return true;
4560 }
4561
4562 mptr = mat[0];
4563 for ( j = 1; j < n; j++ ) {
4564 mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
4565 }
4566
4567 mptr = mat[1];
4568
4569 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4570 sum = mptr[1] - s0;
4571
4572 if ( sum == 0.0f ) {
4573 return false;
4574 }
4575
4576 mat[1][1] = sum;
4577 diag[1] = sum;
4578 invDiag[1] = d = 1.0f / sum;
4579
4580 if ( n <= 2 ) {
4581 return true;
4582 }
4583
4584 mptr = mat[0];
4585 for ( j = 2; j < n; j++ ) {
4586 mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
4587 }
4588
4589 mptr = mat[2];
4590
4591 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4592 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4593 sum = mptr[2] - s0 - s1;
4594
4595 if ( sum == 0.0f ) {
4596 return false;
4597 }
4598
4599 mat[2][2] = sum;
4600 diag[2] = sum;
4601 invDiag[2] = d = 1.0f / sum;
4602
4603 if ( n <= 3 ) {
4604 return true;
4605 }
4606
4607 mptr = mat[0];
4608 for ( j = 3; j < n; j++ ) {
4609 mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
4610 }
4611
4612 mptr = mat[3];
4613
4614 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4615 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4616 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4617 sum = mptr[3] - s0 - s1 - s2;
4618
4619 if ( sum == 0.0f ) {
4620 return false;
4621 }
4622
4623 mat[3][3] = sum;
4624 diag[3] = sum;
4625 invDiag[3] = d = 1.0f / sum;
4626
4627 if ( n <= 4 ) {
4628 return true;
4629 }
4630
4631 mptr = mat[0];
4632 for ( j = 4; j < n; j++ ) {
4633 mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
4634 }
4635
4636 for ( i = 4; i < n; i++ ) {
4637
4638 mptr = mat[i];
4639
4640 v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
4641 v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
4642 v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
4643 v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
4644 for ( k = 4; k < i-3; k += 4 ) {
4645 v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
4646 v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4647 v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
4648 v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
4649 }
4650 switch( i - k ) {
4651 case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
4652 case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
4653 case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
4654 }
4655 sum = s3;
4656 sum += s2;
4657 sum += s1;
4658 sum += s0;
4659 sum = mptr[i] - sum;
4660
4661 if ( sum == 0.0f ) {
4662 return false;
4663 }
4664
4665 mat[i][i] = sum;
4666 diag[i] = sum;
4667 invDiag[i] = d = 1.0f / sum;
4668
4669 if ( i + 1 >= n ) {
4670 return true;
4671 }
4672
4673 // unrolling madness!
4674 mptr = mat[i+1];
4675 mptr2 = mat[i+1] + nc;
4676
4677 for ( j = i+1; j+1 < n; j+=2 ) {
4678 s0 = mptr[0] * v[0];
4679 s1 = mptr[1] * v[1];
4680 s2 = mptr[2] * v[2];
4681 s3 = mptr[3] * v[3];
4682
4683 s0_2 = mptr2[0] * v[0];
4684 s1_2 = mptr2[1] * v[1];
4685 s2_2 = mptr2[2] * v[2];
4686 s3_2 = mptr2[3] * v[3];
4687
4688 for ( k = 4; k < i-7; k += 8 ) {
4689 s0 += mptr[k+0] * v[k+0];
4690 s1 += mptr[k+1] * v[k+1];
4691 s2 += mptr[k+2] * v[k+2];
4692 s3 += mptr[k+3] * v[k+3];
4693 s0 += mptr[k+4] * v[k+4];
4694 s1 += mptr[k+5] * v[k+5];
4695 s2 += mptr[k+6] * v[k+6];
4696 s3 += mptr[k+7] * v[k+7];
4697
4698 s0_2 += mptr2[k+0] * v[k+0];
4699 s1_2 += mptr2[k+1] * v[k+1];
4700 s2_2 += mptr2[k+2] * v[k+2];
4701 s3_2 += mptr2[k+3] * v[k+3];
4702 s0_2 += mptr2[k+4] * v[k+4];
4703 s1_2 += mptr2[k+5] * v[k+5];
4704 s2_2 += mptr2[k+6] * v[k+6];
4705 s3_2 += mptr2[k+7] * v[k+7];
4706 }
4707
4708 switch( i - k ) {
4709 case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
4710 case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
4711 case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
4712 case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
4713 case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
4714 case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
4715 case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
4716 }
4717 // disassociate these adds
4718 s3 += s2;
4719 s1 += s0;
4720 sum = s1 + s3;
4721
4722 s3_2 += s2_2;
4723 s1_2 += s0_2;
4724 sum_2 = s1_2 + s3_2;
4725
4726 mptr[i] = ( mptr[i] - sum ) * d;
4727 mptr2[i] = ( mptr2[i] - sum_2 ) * d;
4728
4729 mptr += nc*2;
4730 mptr2 += nc*2;
4731 }
4732
4733 // cleanup
4734 for ( ; j < n; j++ ) {
4735 s0 = mptr[0] * v[0];
4736 s1 = mptr[1] * v[1];
4737 s2 = mptr[2] * v[2];
4738 s3 = mptr[3] * v[3];
4739 for ( k = 4; k < i-7; k += 8 ) {
4740 s0 += mptr[k+0] * v[k+0];
4741 s1 += mptr[k+1] * v[k+1];
4742 s2 += mptr[k+2] * v[k+2];
4743 s3 += mptr[k+3] * v[k+3];
4744 s0 += mptr[k+4] * v[k+4];
4745 s1 += mptr[k+5] * v[k+5];
4746 s2 += mptr[k+6] * v[k+6];
4747 s3 += mptr[k+7] * v[k+7];
4748 }
4749 switch( i - k ) {
4750 case 7: s0 += mptr[k+6] * v[k+6];
4751 case 6: s1 += mptr[k+5] * v[k+5];
4752 case 5: s2 += mptr[k+4] * v[k+4];
4753 case 4: s3 += mptr[k+3] * v[k+3];
4754 case 3: s0 += mptr[k+2] * v[k+2];
4755 case 2: s1 += mptr[k+1] * v[k+1];
4756 case 1: s2 += mptr[k+0] * v[k+0];
4757 }
4758 // disassociate these adds
4759 s3 += s2;
4760 s1 += s0;
4761 sum = s1 + s3;
4762 mptr[i] = ( mptr[i] - sum ) * d;
4763 mptr += nc;
4764 }
4765 }
4766 return true;
4767 }
4768 #endif /* ENABLE_LOWER_TRIANGULAR */
4769
4770
4771 #ifdef LIVE_VICARIOUSLY
4772 /*
4773 ============
4774 idSIMD_AltiVec::BlendJoints
4775 ============
4776 */
void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
4778 int i;
4779
	// since lerp is a constant we can special-case the two trivial blend factors
	if ( lerp <= 0.0f ) {
		// lerp == 0 would just write joints back over joints, so there's nothing to do
4783 return;
4784 }
4785
4786 if ( lerp >= 1.0f ) {
		// lerp == 1: copy every q and t straight from blendJoints into joints
4788 memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
4789 return;
4790 }
4791
4792 vector float vecLerp = loadSplatUnalignedScalar( &lerp );
4793 vector float zeroVector = (vector float)(0);
4794
4795 for ( i = 0; i+3 < numJoints; i+=4 ) {
4796 int j = index[i];
4797 int j2 = index[i+1];
4798 int j3 = index[i+2];
4799 int j4 = index[i+3];
4800
4801 // slerp
4802 const float *jointPtr = joints[j].q.ToFloatPtr();
4803 const float *blendPtr = blendJoints[j].q.ToFloatPtr();
4804 const float *jointPtr2 = joints[j2].q.ToFloatPtr();
4805 const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
4806 const float *jointPtr3 = joints[j3].q.ToFloatPtr();
4807 const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
4808 const float *jointPtr4 = joints[j4].q.ToFloatPtr();
4809 const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
4810
4811 vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
4812 vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
4813 vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
4814 vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
4815
4816 vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
4817 vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
4818 vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
4819 vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
4820
4821 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
4822 vector float v12, v13, v14, v15, v16;
4823 vector float vecFromX, vecFromY, vecFromZ, vecFromW;
4824 vector float vecToX, vecToY, vecToZ, vecToW;
4825
		// load up the idJointQuats from joints
4827 v0 = vec_ld( 0, jointPtr );
4828 v1 = vec_ld( 15, jointPtr );
4829 v2 = vec_perm( v0, v1, permVec );
4830
4831 v3 = vec_ld( 0, jointPtr2 );
4832 v4 = vec_ld( 15, jointPtr2 );
4833 v5 = vec_perm( v3, v4, permVec2 );
4834
4835 v6 = vec_ld( 0, jointPtr3 );
4836 v7 = vec_ld( 15, jointPtr3 );
4837 v8 = vec_perm( v6, v7, permVec3 );
4838
4839 v9 = vec_ld( 0, jointPtr4 );
4840 v10 = vec_ld( 15, jointPtr4 );
4841 v11 = vec_perm( v9, v10, permVec4 );
4842
4843 // planarizing, so put each x y z w into its own vector
4844 v0 = vec_mergeh( v2, v8 );
4845 v1 = vec_mergeh( v5, v11 );
4846 v3 = vec_mergel( v2, v8 );
4847 v4 = vec_mergel( v5, v11 );
4848
4849 vecFromX = vec_mergeh( v0, v1 );
4850 vecFromY = vec_mergel( v0, v1 );
4851 vecFromZ = vec_mergeh( v3, v4 );
4852 vecFromW = vec_mergel( v3, v4 );
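		// Added note: the mergeh/mergel pairs above are a standard 4x4 transpose, turning
		// four ( x y z w ) quaternions into one vector of x's, one of y's, one of z's and
		// one of w's so the slerp below runs on four joints at once.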
4853
4854 // load up idJointQuats from blendJoints
4855 v5 = vec_ld( 0, blendPtr );
4856 v6 = vec_ld( 15, blendPtr );
4857 v7 = vec_perm( v5, v6, permVec5 );
4858
4859 v8 = vec_ld( 0, blendPtr2 );
4860 v9 = vec_ld( 15, blendPtr2 );
4861 v10 = vec_perm( v8, v9, permVec6 );
4862
4863 v11 = vec_ld( 0, blendPtr3 );
4864 v12 = vec_ld( 15, blendPtr3 );
4865 v13 = vec_perm( v11, v12, permVec7 );
4866
4867 v14 = vec_ld( 0, blendPtr4 );
4868 v15 = vec_ld( 15, blendPtr4 );
4869 v16 = vec_perm( v14, v15, permVec8 );
4870
4871 // put these into their own vectors too
4872 v5 = vec_mergeh( v7, v13 );
4873 v6 = vec_mergeh( v10, v16 );
4874 v8 = vec_mergel( v7, v13 );
4875 v9 = vec_mergel( v10, v16 );
4876
4877 vecToX = vec_mergeh( v5, v6 );
4878 vecToY = vec_mergel( v5, v6 );
4879 vecToZ = vec_mergeh( v8, v9 );
4880 vecToW = vec_mergel( v8, v9 );
4881
4882 // calculate cosom
4883 vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
4884 vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
4885 vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
4886 vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
4887
		// if cosom < 0, negate cosom and use the negated elements of "to";
		// otherwise use "to" as-is
4890 vector bool int vecCmp, vecCmp2;
4891 vecCmp = vec_cmplt( vecCosom, zeroVector );
4892
4893 // negate if needed
4894 vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
4895 vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
4896 vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
4897 vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
4898 vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
4899
4900 // check if we need to calculate scale
4901 vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
4902 vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
4903 vector float vecScale1 = vec_splat( vecLerp, 0 );
4904
4905 vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
4906 vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
4907 vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
4908
4909 vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4910 vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
4911
4912 // see which ones we have to insert into our scale0 and scale1 vectors
4913 vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
4914 vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
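		// Added note: this is the usual slerp weighting, done four quaternions at a time.
		// With omega = acos( cosom ):
		//     scale0 = sin( ( 1 - lerp ) * omega ) / sin( omega )
		//     scale1 = sin( lerp * omega ) / sin( omega )
		// and for nearly parallel quaternions ( 1 - cosom <= 1e-6 ) the vec_sel above keeps
		// the plain linear weights ( 1 - lerp, lerp ) to avoid dividing by sin( omega ) ~ 0.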
4915
4916 // multiply each element by the scale
4917 vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
4918 vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
4919 vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
4920 vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
4921
4922 // multiply temp by scale and add to result
4923 vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
4924 vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
4925 vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
4926 vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
4927
4928 // do a transform again to get the results back to vectors we can store out
4929 v5 = vec_mergeh( vecFromX, vecFromZ );
4930 v6 = vec_mergeh( vecFromY, vecFromW );
4931 v8 = vec_mergel( vecFromX, vecFromZ );
4932 v9 = vec_mergel( vecFromY, vecFromW );
4933
4934 vecToX = vec_mergeh( v5, v6 );
4935 vecToY = vec_mergel( v5, v6 );
4936 vecToZ = vec_mergeh( v8, v9 );
4937 vecToW = vec_mergel( v8, v9 );
4938
4939 vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
4940 vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
4941 vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
4942 vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
4943
4944 // right rotate the input data
4945 vecToX = vec_perm( vecToX, vecToX, storePerm1 );
4946 vecToY = vec_perm( vecToY, vecToY, storePerm2 );
4947 vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
4948 vecToW = vec_perm( vecToW, vecToW, storePerm4 );
4949
4950 vec_ste( vecToX, 0, (float*) jointPtr );
4951 vec_ste( vecToX, 4, (float*) jointPtr );
4952 vec_ste( vecToX, 8, (float*) jointPtr );
4953 vec_ste( vecToX, 12, (float*) jointPtr );
4954
4955 vec_ste( vecToY, 0, (float*) jointPtr2 );
4956 vec_ste( vecToY, 4, (float*) jointPtr2 );
4957 vec_ste( vecToY, 8, (float*) jointPtr2 );
4958 vec_ste( vecToY, 12, (float*) jointPtr2 );
4959
4960 vec_ste( vecToZ, 0, (float*) jointPtr3 );
4961 vec_ste( vecToZ, 4, (float*) jointPtr3 );
4962 vec_ste( vecToZ, 8, (float*) jointPtr3 );
4963 vec_ste( vecToZ, 12, (float*) jointPtr3 );
4964
4965 vec_ste( vecToW, 0, (float*) jointPtr4 );
4966 vec_ste( vecToW, 4, (float*) jointPtr4 );
4967 vec_ste( vecToW, 8, (float*) jointPtr4 );
4968 vec_ste( vecToW, 12, (float*) jointPtr4 );
4969
4970 // lerp is v1 + l * ( v2 - v1 );
		// the idVec3 t immediately follows the 16-byte quat in idJointQuat, so we can reach it from jointPtr without calling ToFloatPtr() again
4972 float *jointVecPtr = (float*)( jointPtr + 4 );
4973 float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
4974 float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
4975 float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
4976
4977 v0 = vec_ld( 0, jointVecPtr );
4978 v1 = vec_ld( 11, jointVecPtr );
4979 vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
4980
4981 v2 = vec_ld( 0, jointVecPtr2 );
4982 v3 = vec_ld( 11, jointVecPtr2 );
4983 vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
4984
4985 v4 = vec_ld( 0, jointVecPtr3 );
4986 v5 = vec_ld( 11, jointVecPtr3 );
4987 vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
4988
4989 v6 = vec_ld( 0, jointVecPtr4 );
4990 v7 = vec_ld( 11, jointVecPtr4 );
4991 vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
4992
4993 vector float vecVecX, vecVecY, vecVecZ;
4994 vecVecX = vecVecY = vecVecZ = zeroVector;
4995
4996 // planarize
4997 v0 = vec_mergeh( vecLd1, vecLd3 );
4998 v1 = vec_mergeh( vecLd2, vecLd4 );
4999 v3 = vec_mergel( vecLd1, vecLd3 );
5000 v4 = vec_mergel( vecLd2, vecLd4 );
5001
5002 vecVecX = vec_mergeh( v0, v1 );
5003 vecVecY = vec_mergel( v0, v1 );
5004 vecVecZ = vec_mergeh( v3, v4 );
5005
5006 // load blend joint idvec3's
5007 float *blendVecPtr = (float*)( blendPtr + 4 );
5008 float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
5009 float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
5010 float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
5011
5012 v0 = vec_ld( 0, blendVecPtr );
5013 v1 = vec_ld( 11, blendVecPtr );
5014 vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
5015
5016 v2 = vec_ld( 0, blendVecPtr2 );
5017 v3 = vec_ld( 11, blendVecPtr2 );
5018 vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
5019
5020 v4 = vec_ld( 0, blendVecPtr3 );
5021 v5 = vec_ld( 11, blendVecPtr3 );
5022 vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
5023
5024 v6 = vec_ld( 0, blendVecPtr4 );
5025 v7 = vec_ld( 11, blendVecPtr4 );
5026 vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
5027
5028 vector float vecBlendX, vecBlendY, vecBlendZ;
5029 vecBlendX = vecBlendY = vecBlendZ = zeroVector;
5030
5031 // planarize
5032 v0 = vec_mergeh( vecLd5, vecLd7 );
5033 v1 = vec_mergeh( vecLd6, vecLd8 );
5034 v3 = vec_mergel( vecLd5, vecLd7 );
5035 v4 = vec_mergel( vecLd6, vecLd8 );
5036
5037 vecBlendX = vec_mergeh( v0, v1 );
5038 vecBlendY = vec_mergel( v0, v1 );
5039 vecBlendZ = vec_mergeh( v3, v4 );
5040
5041 // do subtraction
5042 vecWork1 = vec_sub( vecBlendX, vecVecX );
5043 vecWork2 = vec_sub( vecBlendY, vecVecY );
5044 vecWork3 = vec_sub( vecBlendZ, vecVecZ );
5045
5046 // multiply by lerp and add to v1
5047 vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
5048 vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
5049 vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
5050
5051 // put it back in original form
5052 v0 = vec_mergeh( vecVecX, vecVecZ );
5053 v1 = vec_mergeh( vecVecY, zeroVector );
5054 v3 = vec_mergel( vecVecX, vecVecZ );
5055 v4 = vec_mergel( vecVecY, zeroVector );
5056
5057 // generate vectors to store
5058 vecWork1 = vec_mergeh( v0, v1 );
5059 vecWork2 = vec_mergel( v0, v1 );
5060 vecWork3 = vec_mergeh( v3, v4 );
5061 vector float vecWork4 = vec_mergel( v3, v4 );
5062
5063 // store the T values
5064 storePerm1 = vec_lvsr( 0, jointVecPtr );
5065 storePerm2 = vec_lvsr( 0, jointVecPtr2 );
5066 storePerm3 = vec_lvsr( 0, jointVecPtr3 );
5067 storePerm4 = vec_lvsr( 0, jointVecPtr4 );
5068
5069 // right rotate the input data
5070 vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
5071 vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
5072 vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
5073 vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
5074
5075 vec_ste( vecWork1, 0, (float*) jointVecPtr );
5076 vec_ste( vecWork1, 4, (float*) jointVecPtr );
5077 vec_ste( vecWork1, 8, (float*) jointVecPtr );
5078
5079 vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
5080 vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
5081 vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
5082
5083 vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
5084 vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
5085 vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
5086
5087 vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
5088 vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
5089 vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
5090 }
5091
5092 // cleanup
5093 for ( ; i < numJoints; i++ ) {
5094 int j = index[i];
5095 joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
5096 joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
5097 }
5098 }
5099
5100 /*
5101 ============
5102 idSIMD_AltiVec::ConvertJointQuatsToJointMats
5103 ============
5104 */
5105
5106 // SSE doesn't vectorize this, and I don't think we should either. It's mainly just copying data; there's very little math involved and
5107 // it's not easily parallelizable.
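// For reference, with w = q[3] and x2 = 2*q[0], y2 = 2*q[1], z2 = 2*q[2], the loop below writes the
// standard unit-quaternion rotation terms: diagonal entries 1 - 2(y*y + z*z), 1 - 2(x*x + z*z) and
// 1 - 2(x*x + y*y), off-diagonal pairs 2(xy +/- wz), 2(xz +/- wy) and 2(yz +/- wx), and the
// translation q[4..6] in the fourth column of each row.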
5108 void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
5109
5110 for ( int i = 0; i < numJoints; i++ ) {
5111
5112 const float *q = jointQuats[i].q.ToFloatPtr();
5113 float *m = jointMats[i].ToFloatPtr();
5114
5115 m[0*4+3] = q[4];
5116 m[1*4+3] = q[5];
5117 m[2*4+3] = q[6];
5118
5119 float x2 = q[0] + q[0];
5120 float y2 = q[1] + q[1];
5121 float z2 = q[2] + q[2];
5122
5123 {
5124 float xx = q[0] * x2;
5125 float yy = q[1] * y2;
5126 float zz = q[2] * z2;
5127
5128 m[0*4+0] = 1.0f - yy - zz;
5129 m[1*4+1] = 1.0f - xx - zz;
5130 m[2*4+2] = 1.0f - xx - yy;
5131 }
5132
5133 {
5134 float yz = q[1] * z2;
5135 float wx = q[3] * x2;
5136
5137 m[2*4+1] = yz - wx;
5138 m[1*4+2] = yz + wx;
5139 }
5140
5141 {
5142 float xy = q[0] * y2;
5143 float wz = q[3] * z2;
5144
5145 m[1*4+0] = xy - wz;
5146 m[0*4+1] = xy + wz;
5147 }
5148
5149 {
5150 float xz = q[0] * z2;
5151 float wy = q[3] * y2;
5152
5153 m[0*4+2] = xz - wy;
5154 m[2*4+0] = xz + wy;
5155 }
5156 }
5157 }
5158
5159 /*
5160 ============
5161 idSIMD_AltiVec::ConvertJointMatsToJointQuats
5162 ============
5163 */
5164 void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
5165
5166 int index;
5167
5168 // Since we use very little of the data we have to pull in for the AltiVec version, we end up with
5169 // a lot of wasted math. Rather than try to force it to use AltiVec, I wrote an optimized version
5170 // of InvSqrt for the G5 and made this function use that instead. With only this change, we get a
5171 // little more than a 50% speedup, which is not too shabby. idMath::InvSqrt should really be replaced
5172 // with that function so everyone can benefit on the G5. (An illustrative sketch of such a routine follows this function.)
5173
5174 for ( index = 0; index < numJoints; index++ ) {
5175
5176 idJointQuat jq;
5177 float trace;
5178 float s;
5179 float t;
5180 int i;
5181 int j;
5182 int k;
5183
5184 static int next[3] = { 1, 2, 0 };
5185
5186 float *mat = (float*)( jointMats[index].ToFloatPtr() );
5187 trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
5188
5189 if ( trace > 0.0f ) {
5190
5191 t = trace + 1.0f;
5192 //s = idMath::InvSqrt( t ) * 0.5f;
5193 s = FastScalarInvSqrt( t ) * 0.5f;
5194
5195 jq.q[3] = s * t;
5196 jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
5197 jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
5198 jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
5199
5200 } else {
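// trace is not positive: pick the largest diagonal element so that t (and therefore s) stays
// well away from zero, then recover the remaining components from the off-diagonal sums below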
5201
5202 i = 0;
5203 if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
5204 i = 1;
5205 }
5206 if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
5207 i = 2;
5208 }
5209 j = next[i];
5210 k = next[j];
5211
5212 t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
5213 //s = idMath::InvSqrt( t ) * 0.5f;
5214 s = FastScalarInvSqrt( t ) * 0.5f;
5215
5216 jq.q[i] = s * t;
5217 jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
5218 jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
5219 jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
5220 }
5221
5222 jq.t[0] = mat[0 * 4 + 3];
5223 jq.t[1] = mat[1 * 4 + 3];
5224 jq.t[2] = mat[2 * 4 + 3];
5225 jointQuats[index] = jq;
5226 }
5227 }
5228
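// Illustrative sketch only, not part of the original code: one plausible shape for a G5-friendly
// scalar inverse square root is the hardware estimate instruction plus a Newton-Raphson refinement.
// This assumes the __frsqrte intrinsic (ppc_intrinsics.h) is available; the FastScalarInvSqrt
// actually called above is defined elsewhere in this file and may differ, so this stays out of the build.
#if 0
static inline float FastScalarInvSqrtSketch( float x ) {
	// frsqrte returns roughly a 5-bit estimate of 1/sqrt(x)
	double estimate = __frsqrte( x );
	// one Newton-Raphson step: e' = e * ( 1.5 - 0.5 * x * e * e )
	estimate *= ( 1.5 - ( 0.5 * x ) * estimate * estimate );
	return (float) estimate;
}
#endif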
5229 /*
5230 ============
5231 idSIMD_AltiVec::TransformJoints
5232 ============
5233 */
5234 void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5235 int i;
5236 #if 0
5237 for( i = firstJoint; i <= lastJoint; i++ ) {
5238 assert( parents[i] < i );
5239 jointMats[i] *= jointMats[parents[i]];
5240 }
5241 #else
5242
5243 // I don't think you can unroll this, since the next iteration of the loop might depend on the results of the
5244 // previous iteration, depending on what the parents array looks like. This is true in the test code.
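// Each pass computes jointMats[i] = jointMats[parents[i]] * jointMats[i], treating both as 3x4 affine
// matrices with an implicit (0,0,0,1) bottom row; the parent's translation column is folded in by the
// final vec_add below.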
5245 for ( i = firstJoint; i <= lastJoint; i++ ) {
5246 assert( parents[i] < i );
5247 float *jointPtr = jointMats[i].ToFloatPtr();
5248 float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5249
5250 vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5251 vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5252 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5253
5254 // we need to load up 12 float elements that make up the Mat
5255 v0 = vec_ld( 0, jointPtr );
5256 v1 = vec_ld( 15, jointPtr );
5257 v2 = vec_ld( 31, jointPtr );
5258 v3 = vec_ld( 47, jointPtr );
5259
5260 // load parents
5261 v4 = vec_ld( 0, parentPtr );
5262 v5 = vec_ld( 15, parentPtr );
5263 v6 = vec_ld( 31, parentPtr );
5264 v7 = vec_ld( 47, parentPtr );
5265
5266 // permute into vectors
5267 vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5268 vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5269 vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5270
5271 vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5272 vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5273 vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5274
5275 vector float zero = (vector float)(0);
5276 vector float C1, C2, C3;
5277
5278 // matrix multiply
5279 C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
5280 C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
5281 C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
5282
5283 C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
5284 C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
5285 C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
5286
5287 C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
5288 C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
5289 C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5290
5291 // do the addition at the end
5292 vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5293 C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5294 C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5295 C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5296
5297 // store results
5298 UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
5299 }
5300 #endif
5301 }
5302
5303 /*
5304 ============
5305 idSIMD_AltiVec::UntransformJoints
5306 ============
5307 */
5308 void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
5309 int i;
5310 #if 0
5311 for( i = lastJoint; i >= firstJoint; i-- ) {
5312 assert( parents[i] < i );
5313 jointMats[i] /= jointMats[parents[i]];
5314 }
5315 #else
5316 // I don't think you can unroll this, since the next iteration of the loop might depend on the results of the
5317 // previous iteration, depending on what the parents array looks like. This is true in the test code.
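// Each pass undoes the parent transform: the parent's translation column is subtracted first, and the
// result is then multiplied by the transpose of the parent's 3x3 rotation, which is its inverse for the
// orthonormal rotations stored in joint matrices.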
5318 for ( i = lastJoint; i >= firstJoint; i-- ) {
5319 assert( parents[i] < i );
5320 float *jointPtr = jointMats[i].ToFloatPtr();
5321 float *parentPtr = jointMats[parents[i]].ToFloatPtr();
5322
5323 vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
5324 vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
5325 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5326
5327 // we need to load up 12 float elements that make up the Mat
5328 v0 = vec_ld( 0, jointPtr );
5329 v1 = vec_ld( 15, jointPtr );
5330 v2 = vec_ld( 31, jointPtr );
5331 v3 = vec_ld( 47, jointPtr );
5332
5333 // load parents
5334 v4 = vec_ld( 0, parentPtr );
5335 v5 = vec_ld( 15, parentPtr );
5336 v6 = vec_ld( 31, parentPtr );
5337 v7 = vec_ld( 47, parentPtr );
5338
5339 // permute into vectors
5340 vector float vecJointMat1 = vec_perm( v0, v1, permVec );
5341 vector float vecJointMat2 = vec_perm( v1, v2, permVec );
5342 vector float vecJointMat3 = vec_perm( v2, v3, permVec );
5343
5344 vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
5345 vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
5346 vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
5347
5348 vector float zero = (vector float)(0);
5349 vector float C1, C2, C3;
5350
5351 // do subtraction at the beginning
5352 vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
5353 vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
5354 vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
5355 vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
5356
5357 // matrix multiply
5358 C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
5359 C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
5360 C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
5361
5362 C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
5363 C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
5364 C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
5365
5366 C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
5367 C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
5368 C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
5369
5370 // store results back
5371 vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
5372
5373 // right rotate the input data
5374 C1 = vec_perm( C1, C1, storePerm );
5375 C2 = vec_perm( C2, C2, storePerm );
5376 C3 = vec_perm( C3, C3, storePerm );
5377
5378 vec_ste( C1, 0, (float*) jointPtr );
5379 vec_ste( C1, 4, (float*) jointPtr );
5380 vec_ste( C1, 8, (float*) jointPtr );
5381 vec_ste( C1, 12, (float*) jointPtr );
5382
5383 vec_ste( C2, 16, (float*) jointPtr );
5384 vec_ste( C2, 20, (float*) jointPtr );
5385 vec_ste( C2, 24, (float*) jointPtr );
5386 vec_ste( C2, 28, (float*) jointPtr );
5387
5388 vec_ste( C3, 32, (float*) jointPtr );
5389 vec_ste( C3, 36, (float*) jointPtr );
5390 vec_ste( C3, 40, (float*) jointPtr );
5391 vec_ste( C3, 44, (float*) jointPtr );
5392 }
5393
5394 #endif
5395 }
5396
5397 /*
5398 ============
5399 idSIMD_AltiVec::TransformVerts
5400 ============
5401 */
5402
5403 // Here we don't have much for the vector unit to do, and the gain we get from doing the math
5404 // in parallel is eaten by doing unaligned stores.
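// The index array drives the weight loop: index[j*2] is a byte offset into the joints array for weight j,
// and index[j*2+1] stays zero while more weights follow for the same vertex, becoming non-zero on that
// vertex's last weight. Each idVec4 weight is consumed as a full dot product against each row of the
// 3x4 joint matrix.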
5405 void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
5406 int i, j;
5407 const byte *jointsPtr = (byte *)joints;
5408
5409 for( j = i = 0; i < numVerts; i++ ) {
5410 idVec3 v;
5411
5412 float *matPtrOrig = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5413 float *weightPtr = (float*) weights[j].ToFloatPtr();
5414
5415 v[0] = matPtrOrig[0] * weightPtr[0];
5416 v[0] += matPtrOrig[1] * weightPtr[1];
5417 v[0] += matPtrOrig[2] * weightPtr[2];
5418 v[0] += matPtrOrig[3] * weightPtr[3];
5419
5420 v[1] = matPtrOrig[4] * weightPtr[0];
5421 v[1] += matPtrOrig[5] * weightPtr[1];
5422 v[1] += matPtrOrig[6] * weightPtr[2];
5423 v[1] += matPtrOrig[7] * weightPtr[3];
5424
5425 v[2] = matPtrOrig[8] * weightPtr[0];
5426 v[2] += matPtrOrig[9] * weightPtr[1];
5427 v[2] += matPtrOrig[10] * weightPtr[2];
5428 v[2] += matPtrOrig[11] * weightPtr[3];
5429
5430 while( index[j*2+1] == 0 ) {
5431 j++;
5432 float *matPtr = ( *(idJointMat *)( jointsPtr + index[j*2] ) ).ToFloatPtr();
5433 weightPtr = (float*) weights[j].ToFloatPtr();
5434
5435 v[0] += matPtr[0] * weightPtr[0];
5436 v[0] += matPtr[1] * weightPtr[1];
5437 v[0] += matPtr[2] * weightPtr[2];
5438 v[0] += matPtr[3] * weightPtr[3];
5439
5440 v[1] += matPtr[4] * weightPtr[0];
5441 v[1] += matPtr[5] * weightPtr[1];
5442 v[1] += matPtr[6] * weightPtr[2];
5443 v[1] += matPtr[7] * weightPtr[3];
5444
5445 v[2] += matPtr[8] * weightPtr[0];
5446 v[2] += matPtr[9] * weightPtr[1];
5447 v[2] += matPtr[10] * weightPtr[2];
5448 v[2] += matPtr[11] * weightPtr[3];
5449 }
5450 j++;
5451
5452 verts[i].xyz = v;
5453 }
5454 }
5455 #endif /* LIVE_VICARIOUSLY */
5456
5457 #ifdef ENABLE_CULL
5458
5459 #ifndef DRAWVERT_PADDED
5460 /*
5461 ============
5462 idSIMD_AltiVec::TracePointCull
5463 ============
5464 */
5465 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5466
5467 // idDrawVert size
5468 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5469
5470 byte tOr;
5471 tOr = 0;
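// For each vertex, the lower four bits of cullBits[i] record whether d(plane p) + radius is non-negative
// (the gathered sign bits are flipped with 0x0F at the end), and bits 4..7 record whether
// d(plane p) - radius is negative; totalOr accumulates the OR of all the per-vertex masks.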
5472
5473 // pointers
5474 const float *planePtr = planes[0].ToFloatPtr();
5475
5476 vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5477 vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5478 vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5479 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5480 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5481 vector unsigned char vecPerm;
5482 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5483 vector float zeroVector = (vector float)(0);
5484 vector float vecRadius;
5485 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5486 vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5487 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5488 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5489 vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5490 vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5491 vector bool int oneIntVector = (vector bool int)(1);
5492 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5493 vector unsigned int vecTotals;
5494 vector unsigned int tempIntSum;
5495 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5496
5497 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5498
5499 // populate planes
5500 v0 = vec_ld( 0, planePtr );
5501 v1 = vec_ld( 15, planePtr );
5502 vecPlane0 = vec_perm( v0, v1, vecPerm );
5503
5504 v2 = vec_ld( 0, planePtr + 4 );
5505 v3 = vec_ld( 15, planePtr + 4 );
5506 vecPlane1 = vec_perm( v2, v3, vecPerm );
5507
5508 v0 = vec_ld( 0, planePtr + 8 );
5509 v1 = vec_ld( 15, planePtr + 8 );
5510 vecPlane2 = vec_perm( v0, v1, vecPerm );
5511
5512 v2 = vec_ld( 0, planePtr + 12 );
5513 v3 = vec_ld( 15, planePtr + 12 );
5514 vecPlane3 = vec_perm( v2, v3, vecPerm );
5515
5516 // transpose
5517 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5518 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5519 v2 = vec_mergel( vecPlane0, vecPlane2 );
5520 v3 = vec_mergel( vecPlane1, vecPlane3 );
5521
5522 vecPlane0 = vec_mergeh( v0, v1 );
5523 vecPlane1 = vec_mergel( v0, v1 );
5524 vecPlane2 = vec_mergeh( v2, v3 );
5525 vecPlane3 = vec_mergel( v2, v3 );
5526
5527 // load constants
5528 vecRadius = loadSplatUnalignedScalar( &radius );
5529
5530 unsigned int cullBitVal[4];
5531 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5532 int i = 0;
5533
5534 // every fourth one will have the same alignment. Make sure we've got enough here
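// (an unpadded idDrawVert is 60 bytes, and 4 * 60 = 240 is a multiple of 16, so the xyz fields of verts
// i, i+1, i+2 and i+3 repeat the same 16-byte misalignments every four vertices and the permute vectors
// only need to be computed once)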
5535 if ( i+3 < numVerts ) {
5536 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5537 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5538 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5539 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
5540 }
5541
5542
5543 for ( ; i+3 < numVerts; i+=4 ) {
5544 const float *vertPtr = verts[i].xyz.ToFloatPtr();
5545 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5546 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5547 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5548
5549 v0 = vec_ld( 0, vertPtr );
5550 v1 = vec_ld( 15, vertPtr );
5551 v2 = vec_ld( 0, vertPtr2 );
5552 v3 = vec_ld( 15, vertPtr2 );
5553 v4 = vec_ld( 0, vertPtr3 );
5554 v5 = vec_ld( 15, vertPtr3 );
5555 v6 = vec_ld( 0, vertPtr4 );
5556 v7 = vec_ld( 15, vertPtr4 );
5557
5558 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
5559 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
5560 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
5561 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
5562
5563 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5564 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5565 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5566 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5567
5568 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5569 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5570 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5571 vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5572
5573 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5574 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5575 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5576 vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5577
5578 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5579 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5580 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5581 vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5582
5583 // vec1Sum1 now holds d0, d1, d2, d3. calculate the
5584 // difference with +radius and -radius
5585 vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5586 vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5587 vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5588 vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5589 vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5590 vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5591 vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5592 vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5593
5594 // do compare
5595 vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5596 vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5597 vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5598 vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5599 vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5600 vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5601 vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5602 vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5603
5604 // AND each result with 1 so every lane holds 0 or 1 rather than all bits set
5605 vecCmp1 = vec_and( vecCmp1, oneIntVector );
5606 vecCmp2 = vec_and( vecCmp2, oneIntVector );
5607 vecCmp3 = vec_and( vecCmp3, oneIntVector );
5608 vecCmp4 = vec_and( vecCmp4, oneIntVector );
5609 vecCmp5 = vec_and( vecCmp5, oneIntVector );
5610 vecCmp6 = vec_and( vecCmp6, oneIntVector );
5611 vecCmp7 = vec_and( vecCmp7, oneIntVector );
5612 vecCmp8 = vec_and( vecCmp8, oneIntVector );
5613
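// each compare lane now holds 0 or 1. Shifting lane n left by n (vecShift1 = 0,1,2,3 for the +radius
// results, vecShift2 = 4,5,6,7 for the -radius results) moves every plane's result into its destination
// bit, and the vec_sld/vec_add reductions below collapse each vector into a single per-vertex mask.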
5614 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5615 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5616 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5617 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5618 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5619 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5620 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5621 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5622
5623 // OR (add) them all together
5624 vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5625 vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5626 vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5627 vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5628
5629 vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5630 vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5631 tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5632 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5633 vecTotals = vec_mergeh( vecTotals, tempIntSum );
5634 tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5635 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5636 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5637 tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5638 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5639 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5640
5641 // store out results
5642 vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5643 tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5644 vec_ste( tempSt, 0, &cullBitVal[0] );
5645 vec_ste( tempSt, 4, &cullBitVal[0] );
5646 vec_ste( tempSt, 8, &cullBitVal[0] );
5647 vec_ste( tempSt, 12, &cullBitVal[0] );
5648
5649 tOr |= cullBitVal[0];
5650 tOr |= cullBitVal[1];
5651 tOr |= cullBitVal[2];
5652 tOr |= cullBitVal[3];
5653
5654 cullBits[i] = cullBitVal[0];
5655 cullBits[i+1] = cullBitVal[1];
5656 cullBits[i+2] = cullBitVal[2];
5657 cullBits[i+3] = cullBitVal[3];
5658 }
5659
5660 // cleanup
5661 for ( ; i < numVerts; i++ ) {
5662 byte bits;
5663 float d0, d1, d2, d3, t;
5664 const idVec3 &v = verts[i].xyz;
5665
5666 d0 = planes[0].Distance( v );
5667 d1 = planes[1].Distance( v );
5668 d2 = planes[2].Distance( v );
5669 d3 = planes[3].Distance( v );
5670
5671 t = d0 + radius;
5672 bits = FLOATSIGNBITSET( t ) << 0;
5673 t = d1 + radius;
5674 bits |= FLOATSIGNBITSET( t ) << 1;
5675 t = d2 + radius;
5676 bits |= FLOATSIGNBITSET( t ) << 2;
5677 t = d3 + radius;
5678 bits |= FLOATSIGNBITSET( t ) << 3;
5679
5680 t = d0 - radius;
5681 bits |= FLOATSIGNBITSET( t ) << 4;
5682 t = d1 - radius;
5683 bits |= FLOATSIGNBITSET( t ) << 5;
5684 t = d2 - radius;
5685 bits |= FLOATSIGNBITSET( t ) << 6;
5686 t = d3 - radius;
5687 bits |= FLOATSIGNBITSET( t ) << 7;
5688
5689 bits ^= 0x0F; // flip lower four bits
5690
5691 tOr |= bits;
5692 cullBits[i] = bits;
5693 }
5694
5695 totalOr = tOr;
5696 }
5697 #else
5698
5699 /*
5700 ============
5701 idSIMD_AltiVec::TracePointCull
5702 ============
5703 */
5704 void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5705
5706 // idDrawVert size
5707 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5708
5709 byte tOr;
5710 tOr = 0;
5711
5712 // pointers
5713 const float *planePtr = planes[0].ToFloatPtr();
5714
5715 vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
5716 vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
5717 vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
5718 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
5719 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5720 vector unsigned char vecPerm;
5721 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5722 vector float zeroVector = (vector float)(0);
5723 vector float vecRadius;
5724 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5725 vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
5726 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
5727 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
5728 vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
5729 vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
5730 vector bool int oneIntVector = (vector bool int)(1);
5731 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5732 vector unsigned int vecTotals;
5733 vector unsigned int tempIntSum;
5734 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
5735
5736 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5737
5738 // populate planes
5739 v0 = vec_ld( 0, planePtr );
5740 v1 = vec_ld( 15, planePtr );
5741 vecPlane0 = vec_perm( v0, v1, vecPerm );
5742
5743 v2 = vec_ld( 0, planePtr + 4 );
5744 v3 = vec_ld( 15, planePtr + 4 );
5745 vecPlane1 = vec_perm( v2, v3, vecPerm );
5746
5747 v0 = vec_ld( 0, planePtr + 8 );
5748 v1 = vec_ld( 15, planePtr + 8 );
5749 vecPlane2 = vec_perm( v0, v1, vecPerm );
5750
5751 v2 = vec_ld( 0, planePtr + 12 );
5752 v3 = vec_ld( 15, planePtr + 12 );
5753 vecPlane3 = vec_perm( v2, v3, vecPerm );
5754
5755 // transpose
5756 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5757 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5758 v2 = vec_mergel( vecPlane0, vecPlane2 );
5759 v3 = vec_mergel( vecPlane1, vecPlane3 );
5760
5761 vecPlane0 = vec_mergeh( v0, v1 );
5762 vecPlane1 = vec_mergel( v0, v1 );
5763 vecPlane2 = vec_mergeh( v2, v3 );
5764 vecPlane3 = vec_mergel( v2, v3 );
5765
5766 // load constants
5767 vecRadius = loadSplatUnalignedScalar( &radius );
5768
5769 unsigned int cullBitVal[4];
5770 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
5771 int i = 0;
5772
5773
5774 for ( ; i+3 < numVerts; i+=4 ) {
5775 const float *vertPtr = verts[i].xyz.ToFloatPtr();
5776 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
5777 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
5778 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
5779
5780 vecXYZ1 = vec_ld( 0, vertPtr );
5781 vecXYZ2 = vec_ld( 0, vertPtr2 );
5782 vecXYZ3 = vec_ld( 0, vertPtr3 );
5783 vecXYZ4 = vec_ld( 0, vertPtr4 );
5784
5785 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
5786 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
5787 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
5788 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
5789
5790 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
5791 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
5792 vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
5793 vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
5794
5795 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
5796 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
5797 vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
5798 vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
5799
5800 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
5801 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
5802 vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
5803 vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
5804
5805 // vec1Sum1 now holds d0, d1, d2, d3. calculate the
5806 // difference with +radius and -radius
5807 vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
5808 vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
5809 vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
5810 vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
5811 vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
5812 vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
5813 vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
5814 vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
5815
5816 // do compare
5817 vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
5818 vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
5819 vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
5820 vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
5821 vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
5822 vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
5823 vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
5824 vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
5825
5826 // AND each result with 1 so every lane holds 0 or 1 rather than all bits set
5827 vecCmp1 = vec_and( vecCmp1, oneIntVector );
5828 vecCmp2 = vec_and( vecCmp2, oneIntVector );
5829 vecCmp3 = vec_and( vecCmp3, oneIntVector );
5830 vecCmp4 = vec_and( vecCmp4, oneIntVector );
5831 vecCmp5 = vec_and( vecCmp5, oneIntVector );
5832 vecCmp6 = vec_and( vecCmp6, oneIntVector );
5833 vecCmp7 = vec_and( vecCmp7, oneIntVector );
5834 vecCmp8 = vec_and( vecCmp8, oneIntVector );
5835
5836 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
5837 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
5838 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
5839 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
5840 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
5841 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
5842 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
5843 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
5844
5845 // OR (add) them all together
5846 vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
5847 vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
5848 vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
5849 vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
5850
5851 vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
5852 vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
5853 tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
5854 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5855 vecTotals = vec_mergeh( vecTotals, tempIntSum );
5856 tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
5857 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5858 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
5859 tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
5860 tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
5861 vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
5862
5863 // store out results
5864 vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
5865 tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
5866 vec_ste( tempSt, 0, &cullBitVal[0] );
5867 vec_ste( tempSt, 4, &cullBitVal[0] );
5868 vec_ste( tempSt, 8, &cullBitVal[0] );
5869 vec_ste( tempSt, 12, &cullBitVal[0] );
5870
5871 tOr |= cullBitVal[0];
5872 tOr |= cullBitVal[1];
5873 tOr |= cullBitVal[2];
5874 tOr |= cullBitVal[3];
5875
5876 cullBits[i] = cullBitVal[0];
5877 cullBits[i+1] = cullBitVal[1];
5878 cullBits[i+2] = cullBitVal[2];
5879 cullBits[i+3] = cullBitVal[3];
5880 }
5881
5882 // cleanup
5883 for ( ; i < numVerts; i++ ) {
5884 byte bits;
5885 float d0, d1, d2, d3, t;
5886 const idVec3 &v = verts[i].xyz;
5887
5888 d0 = planes[0].Distance( v );
5889 d1 = planes[1].Distance( v );
5890 d2 = planes[2].Distance( v );
5891 d3 = planes[3].Distance( v );
5892
5893 t = d0 + radius;
5894 bits = FLOATSIGNBITSET( t ) << 0;
5895 t = d1 + radius;
5896 bits |= FLOATSIGNBITSET( t ) << 1;
5897 t = d2 + radius;
5898 bits |= FLOATSIGNBITSET( t ) << 2;
5899 t = d3 + radius;
5900 bits |= FLOATSIGNBITSET( t ) << 3;
5901
5902 t = d0 - radius;
5903 bits |= FLOATSIGNBITSET( t ) << 4;
5904 t = d1 - radius;
5905 bits |= FLOATSIGNBITSET( t ) << 5;
5906 t = d2 - radius;
5907 bits |= FLOATSIGNBITSET( t ) << 6;
5908 t = d3 - radius;
5909 bits |= FLOATSIGNBITSET( t ) << 7;
5910
5911 bits ^= 0x0F; // flip lower four bits
5912
5913 tOr |= bits;
5914 cullBits[i] = bits;
5915 }
5916
5917 totalOr = tOr;
5918 }
5919
5920 #endif /* DRAWVERT_PADDED */
5921
5922 #ifndef DRAWVERT_PADDED
5923 /*
5924 ============
5925 idSIMD_AltiVec::DecalPointCull
5926 ============
5927 */
5928 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
5929
5930 // idDrawVert size
5931 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
5932
5933 int i;
5934 const float *planePtr = planes[0].ToFloatPtr();
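// For each vertex, bit p (p = 0..5) of cullBits[i] ends up set when the vertex is on the front side of
// plane p (distance >= 0): the six gathered sign bits are flipped with 0x3F before being stored.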
5935
5936 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
5937 vector float zeroVector = (vector float)(0.0);
5938 vector unsigned char vecPerm;
5939 vector float v0, v1, v2, v3, v4, v5, v6, v7;
5940
5941 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
5942
5943 // populate planes
5944 v0 = vec_ld( 0, planePtr );
5945 v1 = vec_ld( 15, planePtr );
5946 vecPlane0 = vec_perm( v0, v1, vecPerm );
5947
5948 v2 = vec_ld( 0, planePtr + 4 );
5949 v3 = vec_ld( 15, planePtr + 4 );
5950 vecPlane1 = vec_perm( v2, v3, vecPerm );
5951
5952 v0 = vec_ld( 0, planePtr + 8 );
5953 v1 = vec_ld( 15, planePtr + 8 );
5954 vecPlane2 = vec_perm( v0, v1, vecPerm );
5955
5956 v2 = vec_ld( 0, planePtr + 12 );
5957 v3 = vec_ld( 15, planePtr + 12 );
5958 vecPlane3 = vec_perm( v2, v3, vecPerm );
5959
5960 v0 = vec_ld( 0, planePtr + 16 );
5961 v1 = vec_ld( 15, planePtr + 16 );
5962 vecPlane4 = vec_perm( v0, v1, vecPerm );
5963
5964 v2 = vec_ld( 0, planePtr + 20 );
5965 v3 = vec_ld( 15, planePtr + 20 );
5966 vecPlane5 = vec_perm( v2, v3, vecPerm );
5967
5968 // transpose
5969 v0 = vec_mergeh( vecPlane0, vecPlane2 );
5970 v1 = vec_mergeh( vecPlane1, vecPlane3 );
5971 v2 = vec_mergel( vecPlane0, vecPlane2 );
5972 v3 = vec_mergel( vecPlane1, vecPlane3 );
5973
5974 vecPlane0 = vec_mergeh( v0, v1 );
5975 vecPlane1 = vec_mergel( v0, v1 );
5976 vecPlane2 = vec_mergeh( v2, v3 );
5977 vecPlane3 = vec_mergel( v2, v3 );
5978
5979 v0 = vec_mergeh( vecPlane4, zeroVector );
5980 v1 = vec_mergeh( vecPlane5, zeroVector );
5981 v2 = vec_mergel( vecPlane4, zeroVector );
5982 v3 = vec_mergel( vecPlane5, zeroVector );
5983
5984 vecPlane4 = vec_mergeh( v0, v1 );
5985 vecPlane5 = vec_mergel( v0, v1 );
5986 vecPlane6 = vec_mergeh( v2, v3 );
5987 vecPlane7 = vec_mergel( v2, v3 );
5988
5989
5990 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
5991 vector bool int oneIntVector = (vector bool int)(1);
5992 vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
5993 vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
5994 vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
5995
5996 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
5997 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
5998 vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
5999 vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6000 vector unsigned int vecR1, vecR2, vecR3, vecR4;
6001 vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6002 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6003 unsigned int vBits[4];
6004 vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6005
6006 i = 0;
6007 // every fourth one will have the same alignment. Make sure we've got enough here
6008 if ( i+3 < numVerts ) {
6009 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6010 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6011 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6012 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6013 }
6014
6015
6016 for ( ; i+3 < numVerts; i+=4 ) {
6017 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6018 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6019 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6020 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6021
6022 v0 = vec_ld( 0, vertPtr );
6023 v1 = vec_ld( 15, vertPtr );
6024 v2 = vec_ld( 0, vertPtr2 );
6025 v3 = vec_ld( 15, vertPtr2 );
6026 v4 = vec_ld( 0, vertPtr3 );
6027 v5 = vec_ld( 15, vertPtr3 );
6028 v6 = vec_ld( 0, vertPtr4 );
6029 v7 = vec_ld( 15, vertPtr4 );
6030
6031 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6032 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6033 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6034 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6035
6036 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6037 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6038 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6039 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6040
6041 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6042 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6043 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6044 vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6045
6046 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6047 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6048 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6049 vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6050
6051 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6052 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6053 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6054 vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6055
6056 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6057 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6058 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6059 vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6060
6061 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6062 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6063 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6064 vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6065
6066 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6067 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6068 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6069 vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6070
6071 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6072 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6073 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6074 vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6075
6076 vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6077 vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6078 vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6079 vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6080 vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6081 vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6082 vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6083 vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6084
6085 // AND each result with 1 so every lane holds 0 or 1 rather than all bits set
6086 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6087 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6088 vecCmp3 = vec_and( vecCmp3, oneIntVector );
6089 vecCmp4 = vec_and( vecCmp4, oneIntVector );
6090 vecCmp5 = vec_and( vecCmp5, oneIntVector );
6091 vecCmp6 = vec_and( vecCmp6, oneIntVector );
6092 vecCmp7 = vec_and( vecCmp7, oneIntVector );
6093 vecCmp8 = vec_and( vecCmp8, oneIntVector );
6094
6095 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6096 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6097 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6098 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6099 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6100 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6101 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6102 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6103
6104 // OR them all together (adding works here because each element has at most one bit set)
6105 vecR1 = (vector unsigned int)(0); //zeroIntVector;
6106 vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6107 vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6108 vecR1 = vec_add(vecR1, vecBitShifted2 );
6109 vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6110
6111 vecR2 = (vector unsigned int)(0); //zeroIntVector;
6112 vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6113 vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6114 vecR2 = vec_add(vecR2, vecBitShifted4 );
6115 vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6116
6117 vecR3 = (vector unsigned int)(0); //zeroIntVector;
6118 vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6119 vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6120 vecR3 = vec_add(vecR3, vecBitShifted6 );
6121 vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6122
6123 vecR4 = (vector unsigned int)(0); //zeroIntVector;
6124 vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6125 vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6126 vecR4 = vec_add(vecR4, vecBitShifted8 );
6127 vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6128
6129 // take the first element from each vector and put them into vecR1
6130 vecR1 = vec_mergeh( vecR1, vecR2 );
6131 vecR3 = vec_mergeh( vecR3, vecR4 );
6132 vecR1 = vec_perm( vecR1, vecR3, permHalves );
6133
6134 // XOR with 0x3F to flip lower 6 bits
6135 vecR1 = vec_xor( vecR1, vecFlipBits );
6136
6137 // store out results. don't have 16 at a time so let's just
6138 // do this and avoid alignment concerns
6139 vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6140 vec_ste( vecR1, 0, &vBits[0] );
6141 vec_ste( vecR1, 4, &vBits[0] );
6142 vec_ste( vecR1, 8, &vBits[0] );
6143 vec_ste( vecR1, 12, &vBits[0] );
6144
6145 cullBits[i] = vBits[0];
6146 cullBits[i+1] = vBits[1];
6147 cullBits[i+2] = vBits[2];
6148 cullBits[i+3] = vBits[3];
6149 }
6150
6151 for ( ; i < numVerts; i++ ) {
6152 byte bits;
6153 float d0, d1, d2, d3, d4, d5;
6154 const idVec3 &v = verts[i].xyz;
6155
6156 d0 = planes[0].Distance( v );
6157 d1 = planes[1].Distance( v );
6158 d2 = planes[2].Distance( v );
6159 d3 = planes[3].Distance( v );
6160 d4 = planes[4].Distance( v );
6161 d5 = planes[5].Distance( v );
6162
6163 // FLOATSIGNBITSET checks whether the sign bit is set by treating the float as a long and shifting right 31 places.
6164 bits = FLOATSIGNBITSET( d0 ) << 0;
6165 bits |= FLOATSIGNBITSET( d1 ) << 1;
6166 bits |= FLOATSIGNBITSET( d2 ) << 2;
6167 bits |= FLOATSIGNBITSET( d3 ) << 3;
6168 bits |= FLOATSIGNBITSET( d4 ) << 4;
6169 bits |= FLOATSIGNBITSET( d5 ) << 5;
6170
6171 cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6172 }
6173 }
6174
6175 #else
6176
6177 /*
6178 ============
6179 idSIMD_AltiVec::DecalPointCull
6180 ============
6181 */
6182 void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6183
6184 // idDrawVert size
6185 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6186
6187 int i;
6188 const float *planePtr = planes[0].ToFloatPtr();
6189
6190 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
6191 vector float zeroVector = (vector float)(0.0);
6192 vector unsigned char vecPerm;
6193 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6194
6195 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6196
6197 // populate planes
6198 v0 = vec_ld( 0, planePtr );
6199 v1 = vec_ld( 15, planePtr );
6200 vecPlane0 = vec_perm( v0, v1, vecPerm );
6201
6202 v2 = vec_ld( 0, planePtr + 4 );
6203 v3 = vec_ld( 15, planePtr + 4 );
6204 vecPlane1 = vec_perm( v2, v3, vecPerm );
6205
6206 v0 = vec_ld( 0, planePtr + 8 );
6207 v1 = vec_ld( 15, planePtr + 8 );
6208 vecPlane2 = vec_perm( v0, v1, vecPerm );
6209
6210 v2 = vec_ld( 0, planePtr + 12 );
6211 v3 = vec_ld( 15, planePtr + 12 );
6212 vecPlane3 = vec_perm( v2, v3, vecPerm );
6213
6214 v0 = vec_ld( 0, planePtr + 16 );
6215 v1 = vec_ld( 15, planePtr + 16 );
6216 vecPlane4 = vec_perm( v0, v1, vecPerm );
6217
6218 v2 = vec_ld( 0, planePtr + 20 );
6219 v3 = vec_ld( 15, planePtr + 20 );
6220 vecPlane5 = vec_perm( v2, v3, vecPerm );
6221
6222 // transpose
6223 v0 = vec_mergeh( vecPlane0, vecPlane2 );
6224 v1 = vec_mergeh( vecPlane1, vecPlane3 );
6225 v2 = vec_mergel( vecPlane0, vecPlane2 );
6226 v3 = vec_mergel( vecPlane1, vecPlane3 );
6227
6228 vecPlane0 = vec_mergeh( v0, v1 );
6229 vecPlane1 = vec_mergel( v0, v1 );
6230 vecPlane2 = vec_mergeh( v2, v3 );
6231 vecPlane3 = vec_mergel( v2, v3 );
6232
6233 v0 = vec_mergeh( vecPlane4, zeroVector );
6234 v1 = vec_mergeh( vecPlane5, zeroVector );
6235 v2 = vec_mergel( vecPlane4, zeroVector );
6236 v3 = vec_mergel( vecPlane5, zeroVector );
6237
6238 vecPlane4 = vec_mergeh( v0, v1 );
6239 vecPlane5 = vec_mergel( v0, v1 );
6240 vecPlane6 = vec_mergeh( v2, v3 );
6241 vecPlane7 = vec_mergel( v2, v3 );
6242
6243
6244 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6245 vector bool int oneIntVector = (vector bool int)(1);
6246 vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
6247 vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
6248 vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
6249
6250 vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
6251 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
6252 vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
6253 vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
6254 vector unsigned int vecR1, vecR2, vecR3, vecR4;
6255 vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
6256 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6257 unsigned int vBits[4];
6258 vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
6259
6260 i = 0;
6261
6262 for ( ; i+3 < numVerts; i+=4 ) {
6263 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6264 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6265 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6266 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6267
6268 vecXYZ1 = vec_ld( 0, vertPtr );
6269 vecXYZ2 = vec_ld( 0, vertPtr2 );
6270 vecXYZ3 = vec_ld( 0, vertPtr3 );
6271 vecXYZ4 = vec_ld( 0, vertPtr4 );
6272
6273 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
6274 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
6275 vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
6276 vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
6277
6278 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
6279 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
6280 vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
6281 vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
6282
6283 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
6284 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
6285 vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
6286 vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
6287
6288 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
6289 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
6290 vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
6291 vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
6292
6293 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
6294 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
6295 vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
6296 vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
6297
6298 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
6299 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
6300 vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
6301 vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
6302
6303 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
6304 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
6305 vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
6306 vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
6307
6308 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
6309 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
6310 vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
6311 vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
6312
6313 vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
6314 vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
6315 vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
6316 vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
6317 vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
6318 vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
6319 vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
6320 vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
6321
6322 // AND each result with 1 so every lane holds 0 or 1 rather than all bits set
6323 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6324 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6325 vecCmp3 = vec_and( vecCmp3, oneIntVector );
6326 vecCmp4 = vec_and( vecCmp4, oneIntVector );
6327 vecCmp5 = vec_and( vecCmp5, oneIntVector );
6328 vecCmp6 = vec_and( vecCmp6, oneIntVector );
6329 vecCmp7 = vec_and( vecCmp7, oneIntVector );
6330 vecCmp8 = vec_and( vecCmp8, oneIntVector );
6331
6332 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
6333 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
6334 vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
6335 vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
6336 vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
6337 vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
6338 vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
6339 vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
6340
6341 // OR them all together (adding works here because each element has at most one bit set)
6342 vecR1 = (vector unsigned int)(0); //zeroIntVector;
6343 vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
6344 vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
6345 vecR1 = vec_add(vecR1, vecBitShifted2 );
6346 vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6347
6348 vecR2 = (vector unsigned int)(0); //zeroIntVector;
6349 vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
6350 vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
6351 vecR2 = vec_add(vecR2, vecBitShifted4 );
6352 vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
6353
6354 vecR3 = (vector unsigned int)(0); //zeroIntVector;
6355 vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
6356 vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
6357 vecR3 = vec_add(vecR3, vecBitShifted6 );
6358 vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
6359
6360 vecR4 = (vector unsigned int)(0); //zeroIntVector;
6361 vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
6362 vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
6363 vecR4 = vec_add(vecR4, vecBitShifted8 );
6364 vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
6365
6366 // take the first element from each vector and put them into vecR1
6367 vecR1 = vec_mergeh( vecR1, vecR2 );
6368 vecR3 = vec_mergeh( vecR3, vecR4 );
6369 vecR1 = vec_perm( vecR1, vecR3, permHalves );
6370
6371 // XOR with 0x3F to flip lower 6 bits
6372 vecR1 = vec_xor( vecR1, vecFlipBits );
6373
6374 // store out results. don't have 16 at a time so let's just
6375 // do this and avoid alignment concerns
6376 vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
6377 vec_ste( vecR1, 0, &vBits[0] );
6378 vec_ste( vecR1, 4, &vBits[0] );
6379 vec_ste( vecR1, 8, &vBits[0] );
6380 vec_ste( vecR1, 12, &vBits[0] );
6381
6382 cullBits[i] = vBits[0];
6383 cullBits[i+1] = vBits[1];
6384 cullBits[i+2] = vBits[2];
6385 cullBits[i+3] = vBits[3];
6386 }
6387
6388 for ( ; i < numVerts; i++ ) {
6389 byte bits;
6390 float d0, d1, d2, d3, d4, d5;
6391 const idVec3 &v = verts[i].xyz;
6392
6393 d0 = planes[0].Distance( v );
6394 d1 = planes[1].Distance( v );
6395 d2 = planes[2].Distance( v );
6396 d3 = planes[3].Distance( v );
6397 d4 = planes[4].Distance( v );
6398 d5 = planes[5].Distance( v );
6399
6400 // FLOATSIGNBITSET checks whether the sign bit is set by treating the float as a long and shifting right 31 places.
6401 bits = FLOATSIGNBITSET( d0 ) << 0;
6402 bits |= FLOATSIGNBITSET( d1 ) << 1;
6403 bits |= FLOATSIGNBITSET( d2 ) << 2;
6404 bits |= FLOATSIGNBITSET( d3 ) << 3;
6405 bits |= FLOATSIGNBITSET( d4 ) << 4;
6406 bits |= FLOATSIGNBITSET( d5 ) << 5;
6407
6408 cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
6409 }
6410 }
6411
6412
6413 #endif /*DRAWVERT_PADDED */
6414
6415 #ifndef DRAWVERT_PADDED
6416 /*
6417 ============
6418 idSIMD_AltiVec::OverlayPointCull
6419 ============
6420 */
6421 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6422
6423 // idDrawVert size
6424 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6425
6426 int i;
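// texCoords[i] receives the distances of vertex i to planes 0 and 1; the cull bits below start from the
// signs of those distances.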
6427
6428 float p0x, p0y, p0z, p0d;
6429 float p1x, p1y, p1z, p1d;
6430
6431 const float *planePtr = planes[0].ToFloatPtr();
6432 const float *vertPtr = verts[0].xyz.ToFloatPtr();
6433
6434 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6435 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6436 vector unsigned char vecPerm;
6437 vector float zeroVector = (vector float)(0);
6438
6439 p0x = *(planePtr + 0);
6440 p0y = *(planePtr + 1);
6441 p0z = *(planePtr + 2);
6442 p0d = *(planePtr + 3);
6443 p1x = *(planePtr + 4);
6444 p1y = *(planePtr + 5);
6445 p1z = *(planePtr + 6);
6446 p1d = *(planePtr + 7);
6447
6448 // populate the planes
6449 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6450 v0 = vec_ld( 0, planePtr );
6451 v1 = vec_ld( 15, planePtr );
6452 vecPlane0 = vec_perm( v0, v1, vecPerm );
6453
6454 v2 = vec_ld( 31, planePtr );
6455 vecPlane1 = vec_perm( v1, v2, vecPerm );
6456
6457 // transpose
6458 v0 = vec_mergeh( vecPlane0, vecPlane0 );
6459 v1 = vec_mergeh( vecPlane1, vecPlane1 );
6460 v2 = vec_mergel( vecPlane0, vecPlane0 );
6461 v3 = vec_mergel( vecPlane1, vecPlane1);
6462
6463 vecPlane0 = vec_mergeh( v0, v1 );
6464 vecPlane1 = vec_mergel( v0, v1 );
6465 vecPlane2 = vec_mergeh( v2, v3 );
6466 vecPlane3 = vec_mergel( v2, v3 );
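// After this transpose each vecPlaneN holds one component of both planes, repeated:
//   vecPlane0 = ( p0.x, p1.x, p0.x, p1.x )   vecPlane1 = ( p0.y, p1.y, p0.y, p1.y )
//   vecPlane2 = ( p0.z, p1.z, p0.z, p1.z )   vecPlane3 = ( p0.d, p1.d, p0.d, p1.d )
// so each madd chain below evaluates both plane distances for two vertices at once,
// e.g. vecSum1 = ( d0[i], d1[i], d0[i+1], d1[i+1] ), which is exactly the layout that
// gets stored straight into texCoords.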
6467
6468 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6469 vector float oneVector = (vector float)(1);
6470
6471 vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6472
6473 vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6474 vector float negTwoVector = (vector float)(-2);
6475 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6476 vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6477 vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6478 vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6479 vector bool int oneIntVector = (vector bool int)(1);
6480 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6481 unsigned int cullBitVal[4];
6482 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6483
6484 i = 0;
6485 // every fourth one will have the same alignment. Make sure we've got enough here
6486 if ( i+3 < numVerts ) {
6487 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6488 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6489 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6490 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
6491 }
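// sizeof(idDrawVert) is 60 bytes in this (unpadded) branch and 4 * 60 = 240 is a
// multiple of 16, so the 16-byte alignment of verts[i] repeats with period four and the
// permute masks computed above stay valid for every iteration of the unrolled loop.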
6492
6493
6494 for ( ; i+3 < numVerts; i+=4 ) {
6495 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6496 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6497 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6498 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6499
6500 v0 = vec_ld( 0, vertPtr );
6501 v1 = vec_ld( 15, vertPtr );
6502 v2 = vec_ld( 0, vertPtr2 );
6503 v3 = vec_ld( 15, vertPtr2 );
6504 v4 = vec_ld( 0, vertPtr3 );
6505 v5 = vec_ld( 15, vertPtr3 );
6506 v6 = vec_ld( 0, vertPtr4 );
6507 v7 = vec_ld( 15, vertPtr4 );
6508
6509 vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
6510 vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
6511 vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
6512 vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
6513
6514 // like a splat, but only doing halves
6515 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6516 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6517 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6518 vecSum1 = vec_add( vecSum1, vecPlane3 );
6519
6520 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6521 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6522 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6523 vecSum2 = vec_add( vecSum2, vecPlane3 );
6524
6525 // store out results
6526 UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6527
6528 // bit manipulation
6529 vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6530 vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6531
6532 // AND it with 1 so we multiply by 1, not by all ones
6533 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6534 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6535
6536 // store out and write to cullBits
6537 // finally, a use for algebra! 1-x = x + 1 - 2x
6538 vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6539 vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6540 vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6541 vecSum2Inv = vec_add( vecSum2Inv, oneVector );
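// i.e. vec_madd( x, negTwoVector, x ) = x - 2x = -x, and adding 1 gives 1 - x,
// matching the scalar path's d0 = 1.0f - d0 and d1 = 1.0f - d1 below.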
6542
6543 // do the same comparisons for the inverted d0/d1
6544 vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6545 vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6546
6547 // AND it with 1 so we multiply by 1, not by all ones
6548 vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6549 vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6550
6551 // shift them as needed
6552 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6553 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6554 vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6555 vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6556
6557 // OR them all together. Since only one bit is set in each value, that's
6558 // the same as adding them. Add up d0 + d1 + d0Inv + d1Inv
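// Each of the four shifted vectors has at most one bit set per lane, so the adds and
// the merge below assemble the same 4-bit mask as the scalar cleanup loop:
// bit 0 = ( d0 < 0 ), bit 1 = ( d1 < 0 ), bit 2 = ( 1 - d0 < 0 ), bit 3 = ( 1 - d1 < 0 ).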
6559 vector unsigned int vecResult;
6560 vector unsigned int vecResult2;
6561 vector unsigned int vecResult3;
6562 vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6563
6564 vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6565
6566 // vecResult now holds the values without the inverses yet, so add those
6567 vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6568 vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6569 vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6570 vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6571
6572 vecResult = vec_add( vecResult, vecResult2 );
6573
6574 //store out results
6575 vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6576 vec_ste( vecResult, 0, &cullBitVal[0] );
6577 vec_ste( vecResult, 4, &cullBitVal[0] );
6578 vec_ste( vecResult, 8, &cullBitVal[0] );
6579 vec_ste( vecResult, 12, &cullBitVal[0] );
6580
6581 cullBits[i] = cullBitVal[0];
6582 cullBits[i+1] = cullBitVal[1];
6583 cullBits[i+2] = cullBitVal[2];
6584 cullBits[i+3] = cullBitVal[3];
6585 }
6586
6587 // cleanup
6588 for ( ; i < numVerts; i++ ) {
6589 byte bits;
6590 float d0, d1;
6591 float vx, vy, vz;
6592
6593 vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6594 vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6595 vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6596
6597 d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6598 d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6599 texCoords[i][0] = d0;
6600 texCoords[i][1] = d1;
6601
6602 bits = ( d0 >= 0 ) ? 0 : 1;
6603 d0 = 1.0f - d0;
6604 bits |= ( d1 >= 0 ) ? 0 : 1*2;
6605 d1 = 1.0f - d1;
6606
6607 bits |= ( d0 >= 0 ) ? 0: 1*4;
6608 bits |= ( d1 >= 0 ) ? 0: 1*8;
6609
6610 cullBits[i] = bits;
6611 }
6612 }
6613 #else
6614
6615 /*
6616 ============
6617 idSIMD_AltiVec::OverlayPointCull
6618 ============
6619 */
6620 void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
6621
6622 // idDrawVert size
6623 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6624
6625 int i;
6626
6627 float p0x, p0y, p0z, p0d;
6628 float p1x, p1y, p1z, p1d;
6629
6630 const float *planePtr = planes[0].ToFloatPtr();
6631 const float *vertPtr = verts[0].xyz.ToFloatPtr();
6632
6633 vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
6634 vector float v0, v1, v2, v3, v4, v5, v6, v7;
6635 vector unsigned char vecPerm;
6636 vector float zeroVector = (vector float)(0);
6637
6638 p0x = *(planePtr + 0);
6639 p0y = *(planePtr + 1);
6640 p0z = *(planePtr + 2);
6641 p0d = *(planePtr + 3);
6642 p1x = *(planePtr + 4);
6643 p1y = *(planePtr + 5);
6644 p1z = *(planePtr + 6);
6645 p1d = *(planePtr + 7);
6646
6647 // populate the planes
6648 vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
6649 v0 = vec_ld( 0, planePtr );
6650 v1 = vec_ld( 15, planePtr );
6651 vecPlane0 = vec_perm( v0, v1, vecPerm );
6652
6653 v2 = vec_ld( 31, planePtr );
6654 vecPlane1 = vec_perm( v1, v2, vecPerm );
6655
6656 // transpose
6657 v0 = vec_mergeh( vecPlane0, vecPlane0 );
6658 v1 = vec_mergeh( vecPlane1, vecPlane1 );
6659 v2 = vec_mergel( vecPlane0, vecPlane0 );
6660 v3 = vec_mergel( vecPlane1, vecPlane1);
6661
6662 vecPlane0 = vec_mergeh( v0, v1 );
6663 vecPlane1 = vec_mergel( v0, v1 );
6664 vecPlane2 = vec_mergeh( v2, v3 );
6665 vecPlane3 = vec_mergel( v2, v3 );
6666
6667 vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
6668 vector float oneVector = (vector float)(1);
6669
6670 vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
6671
6672 vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
6673 vector float negTwoVector = (vector float)(-2);
6674 vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
6675 vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
6676 vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
6677 vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
6678 vector bool int oneIntVector = (vector bool int)(1);
6679 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
6680 unsigned int cullBitVal[4];
6681 vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
6682
6683 i = 0;
6684
6685 for ( ; i+3 < numVerts; i+=4 ) {
6686 const float *vertPtr = verts[i].xyz.ToFloatPtr();
6687 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
6688 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
6689 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
6690
6691 vecXYZ1 = vec_ld( 0, vertPtr );
6692 vecXYZ2 = vec_ld( 0, vertPtr2 );
6693 vecXYZ3 = vec_ld( 0, vertPtr3 );
6694 vecXYZ4 = vec_ld( 0, vertPtr4 );
6695
6696 // like a splat, but only doing halves
6697 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6698 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
6699 vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
6700 vecSum1 = vec_add( vecSum1, vecPlane3 );
6701
6702 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
6703 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
6704 vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
6705 vecSum2 = vec_add( vecSum2, vecPlane3 );
6706
6707 // store out results
6708 UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
6709
6710 // bit manipulation
6711 vecCmp1 = vec_cmplt( vecSum1, zeroVector );
6712 vecCmp2 = vec_cmplt( vecSum2, zeroVector );
6713
6714 // AND it with 1 so we multiply by 1, not by all ones
6715 vecCmp1 = vec_and( vecCmp1, oneIntVector );
6716 vecCmp2 = vec_and( vecCmp2, oneIntVector );
6717
6718 // store out and write to cullBits
6719 // finally, a use for algebra! 1-x = x + 1 - 2x
6720 vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
6721 vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
6722 vecSum1Inv = vec_add( vecSum1Inv, oneVector );
6723 vecSum2Inv = vec_add( vecSum2Inv, oneVector );
6724
6725 // do the same comparisons for the inverted d0/d1
6726 vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
6727 vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
6728
6729 // AND it with 1 so we multiply by 1, not by all ones
6730 vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
6731 vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
6732
6733 // shift them as needed
6734 vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
6735 vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
6736 vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
6737 vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
6738
6739 // OR them all together. Since only one bit is set in each value, that's
6740 // the same as adding them. Add up d0 + d1 + d0Inv + d1Inv
6741 vector unsigned int vecResult;
6742 vector unsigned int vecResult2;
6743 vector unsigned int vecResult3;
6744 vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
6745
6746 vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
6747
6748 // vecResult now holds the values without the inverses yet, so add those
6749 vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
6750 vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
6751 vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
6752 vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
6753
6754 vecResult = vec_add( vecResult, vecResult2 );
6755
6756 //store out results
6757 vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
6758 vec_ste( vecResult, 0, &cullBitVal[0] );
6759 vec_ste( vecResult, 4, &cullBitVal[0] );
6760 vec_ste( vecResult, 8, &cullBitVal[0] );
6761 vec_ste( vecResult, 12, &cullBitVal[0] );
6762
6763 cullBits[i] = cullBitVal[0];
6764 cullBits[i+1] = cullBitVal[1];
6765 cullBits[i+2] = cullBitVal[2];
6766 cullBits[i+3] = cullBitVal[3];
6767 }
6768
6769 // cleanup
6770 for ( ; i < numVerts; i++ ) {
6771 byte bits;
6772 float d0, d1;
6773 float vx, vy, vz;
6774
6775 vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
6776 vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
6777 vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
6778
6779 d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
6780 d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
6781 texCoords[i][0] = d0;
6782 texCoords[i][1] = d1;
6783
6784 bits = ( d0 >= 0 ) ? 0 : 1;
6785 d0 = 1.0f - d0;
6786 bits |= ( d1 >= 0 ) ? 0 : 1*2;
6787 d1 = 1.0f - d1;
6788
6789 bits |= ( d0 >= 0 ) ? 0: 1*4;
6790 bits |= ( d1 >= 0 ) ? 0: 1*8;
6791
6792 cullBits[i] = bits;
6793 }
6794 }
6795
6796
6797 #endif /* DRAWVERT_PADDED */
6798
6799 #endif /* ENABLE_CULL */
6800
6801 #ifdef ENABLE_DERIVE
6802 /*
6803 ============
6804 idSIMD_AltiVec::DeriveTriPlanes
6805
6806 Derives a plane equation for each triangle.
6807 ============
6808 */
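// Per four triangles this computes the same thing as the scalar cleanup loop at the
// bottom of the function: edge vectors d0 = b - a and d1 = c - a, normal n = d1 x d0,
// normalization by a reciprocal square root estimate, and the plane fitted through
// vertex a ( distance term -( n . a ) ).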
6809 void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
6810
6811 // idDrawVert size
6812 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
6813 // idPlane size
6814 assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
6815 int i;
6816
6817 vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
6818 vector float vecVertA, vecVertB, vecVertC;
6819 vector float vecVertA2, vecVertB2, vecVertC2;
6820 vector float vecVertA3, vecVertB3, vecVertC3;
6821 vector float vecVertA4, vecVertB4, vecVertC4;
6822
6823 vector float vecN, vecN2, vecN3, vecN4;
6824 vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
6825 vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
6826 vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
6827 vector float vecF;
6828 vector float vecF1, vecF2, vecF3, vecF4;
6829 vector float zeroVector = (vector float)(0);
6830 vector float vecNegOne = (vector float)(-1);
6831 vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;
6832
6833 vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
6834 vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
6835 vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
6836
6837 vector unsigned char oneVector = (vector unsigned char)(1);
6838 vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
6839 vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
6840
6841 const float *xyzPtr = verts[0].xyz.ToFloatPtr();
6842 float *planePtr = planes[0].ToFloatPtr();
6843
6844 int j;
6845 for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
6846
6847 #ifndef DRAWVERT_PADDED
6848 // calculate permute vectors to load as needed. these are all
6849 // triangle indexes and are usually pretty close together but
6850 // not guaranteed to be in any particular order
6851 vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
6852 vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
6853 vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
6854 vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
6855 vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
6856 vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
6857 vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
6858 vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
6859 vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
6860 vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
6861 vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
6862 vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
6863 #endif
6864
6865 #ifndef DRAWVERT_PADDED
6866 // load first A B C
6867 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6868 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6869 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6870 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6871 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6872 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6873
6874 vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
6875 vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
6876 vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );
6877
6878 // set the last element to 0
6879 vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6880 vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6881 vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6882
6883 // load second A B C
6884 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6885 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6886 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6887 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6888 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6889 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6890
6891 vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
6892 vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
6893 vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );
6894
6895 // set the last element to 0
6896 vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6897 vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6898 vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6899
6900 // load third A B C
6901 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6902 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6903 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6904 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6905 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6906 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6907
6908 vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
6909 vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
6910 vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );
6911
6912 // set the last element to 0
6913 vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6914 vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6915 vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6916
6917 // load the fourth A B C
6918 vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6919 vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6920 vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6921 vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6922 vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6923 vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6924
6925 vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
6926 vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
6927 vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );
6928
6929 // set the last element to 0
6930 vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6931 vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6932 vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6933 #else
6934 // load first A B C
6935 vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
6936 vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
6937 vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
6938
6939 // set the last element to 0
6940 vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
6941 vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
6942 vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );
6943
6944 // load second A B C
6945 vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
6946 vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
6947 vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
6948
6949 // set the last element to 0
6950 vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
6951 vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
6952 vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );
6953
6954 // load third A B C
6955 vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
6956 vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
6957 vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
6958
6959 // set the last element to 0
6960 vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
6961 vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
6962 vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );
6963
6964 // load the fourth A B C
6965 vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
6966 vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
6967 vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
6968
6969 // set the last element to 0
6970 vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
6971 vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
6972 vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
6973 #endif
6974 // calculate d0 and d1 for each
6975 vecD0 = vec_sub( vecVertB, vecVertA );
6976 vecD1 = vec_sub( vecVertC, vecVertA );
6977
6978 vecD2 = vec_sub( vecVertB2, vecVertA2 );
6979 vecD3 = vec_sub( vecVertC2, vecVertA2 );
6980
6981 vecD4 = vec_sub( vecVertB3, vecVertA3 );
6982 vecD5 = vec_sub( vecVertC3, vecVertA3 );
6983
6984 vecD6 = vec_sub( vecVertB4, vecVertA4 );
6985 vecD7 = vec_sub( vecVertC4, vecVertA4 );
6986
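// cross product via rotated lane products: vecPerm1 rotates xyz -> yzx and vecPerm2
// rotates xyz -> zxy, so vecSecondHalf = d0.yzx * d1.zxy, vecFirstHalf = d1.yzx * d0.zxy,
// and vecN = vecFirstHalf - vecSecondHalf = d1 x d0, matching the scalar
// n[0] = d1[1] * d0[2] - d1[2] * d0[1], etc.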
6987 vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
6988 vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
6989 vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
6990 vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
6991 vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
6992 vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
6993 vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
6994 vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );
6995
6996 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
6997 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
6998 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
6999 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7000
7001 vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
7002 vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
7003 vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
7004 vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
7005 vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
7006 vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
7007 vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
7008 vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );
7009
7010 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7011 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7012 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7013 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7014
7015 vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7016 vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7017 vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7018 vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
7019
7020 // transpose vecNs
7021 vector float v0, v1, v2, v3;
7022 v0 = vec_mergeh( vecN, vecN3 );
7023 v1 = vec_mergeh( vecN2, vecN4 );
7024 v2 = vec_mergel( vecN, vecN3 );
7025 v3 = vec_mergel( vecN2, vecN4 );
7026
7027 vecN = vec_mergeh( v0, v1 );
7028 vecN2 = vec_mergel( v0, v1 );
7029 vecN3 = vec_mergeh( v2, v3 );
7030 vecN4 = vec_mergel( v2, v3 );
7031
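// After the transpose vecN, vecN2 and vecN3 hold the x, y and z components of the four
// normals ( one triangle per lane ), so the madds below compute x*x + y*y + z*z for all
// four triangles at once and vecF becomes one over the length of each normal.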
7032 vecF = vec_madd( vecN, vecN, zeroVector );
7033 vecF = vec_madd( vecN2, vecN2, vecF );
7034 vecF = vec_madd( vecN3, vecN3, vecF );
7035
7036 vecF = ReciprocalSquareRoot( vecF );
7037
7038 vecF1 = vec_madd( vecF, vecN, zeroVector );
7039 vecF2 = vec_madd( vecF, vecN2, zeroVector );
7040 vecF3 = vec_madd( vecF, vecN3, zeroVector );
7041 vecF4 = vec_madd( vecF, vecN4, zeroVector );
7042
7043 vector float v8, v9, v10, v11;
7044 v8 = vecF1;
7045 v9 = vecF2;
7046 v10 = vecF3;
7047 v11 = vecF4;
7048
7049 // transpose vecVerts
7050 v0 = vec_mergeh( vecVertA, vecVertA3 );
7051 v1 = vec_mergeh( vecVertA2, vecVertA4 );
7052 v2 = vec_mergel( vecVertA, vecVertA3 );
7053 v3 = vec_mergel( vecVertA2, vecVertA4 );
7054
7055 vecVertA = vec_mergeh( v0, v1 );
7056 vecVertA2 = vec_mergel( v0, v1 );
7057 vecVertA3 = vec_mergeh( v2, v3 );
7058 vecVertA4 = vec_mergel( v2, v3 );
7059
7060 vector float vecTotals;
7061 vecTotals = vec_madd( vecVertA, v8, zeroVector );
7062 vecTotals = vec_madd( vecVertA2, v9, vecTotals );
7063 vecTotals = vec_madd( vecVertA3, v10, vecTotals );
7064 vecTotals = vec_madd( vecVertA4, v11, vecTotals );
7065 vecF = vec_madd( vecTotals, vecNegOne, zeroVector );
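// vecTotals is the dot product of each vertex A with its unit normal; negating it gives
// the plane's distance term so that n . p + d = 0 at A, which is what FitThroughPoint()
// does in the scalar path.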
7066
7067 // transpose vecFs
7068 v0 = vec_mergeh( vecF1, vecF3 );
7069 v1 = vec_mergeh( vecF2, vecF );
7070 v2 = vec_mergel( vecF1, vecF3 );
7071 v3 = vec_mergel( vecF2, vecF );
7072
7073 vecF1 = vec_mergeh( v0, v1 );
7074 vecF2 = vec_mergel( v0, v1 );
7075 vecF3 = vec_mergeh( v2, v3 );
7076 vecF4 = vec_mergel( v2, v3 );
7077
7078 // store results
7079 UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
7080 }
7081
7082 // cleanup
7083 for ( ; i < numIndexes; i += 3, j++ ) {
7084 const idDrawVert *a, *b, *c;
7085 float d0[3], d1[3], f;
7086 idVec3 n;
7087
7088 a = verts + indexes[i + 0];
7089 b = verts + indexes[i + 1];
7090 c = verts + indexes[i + 2];
7091
7092 d0[0] = b->xyz[0] - a->xyz[0];
7093 d0[1] = b->xyz[1] - a->xyz[1];
7094 d0[2] = b->xyz[2] - a->xyz[2];
7095
7096 d1[0] = c->xyz[0] - a->xyz[0];
7097 d1[1] = c->xyz[1] - a->xyz[1];
7098 d1[2] = c->xyz[2] - a->xyz[2];
7099
7100 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7101 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7102 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7103
7104 f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7105 //idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
7106
7107 n.x *= f;
7108 n.y *= f;
7109 n.z *= f;
7110
7111 planes[j].SetNormal( n );
7112 planes[j].FitThroughPoint( a->xyz );
7113 }
7114 }
7115
7116 /*
7117 ============
7118 idSIMD_AltiVec::DeriveTangents
7119
7120 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7121 For each vertex the normal and tangent vectors are derived from all triangles
7122 that use the vertex, which results in smooth tangents across the mesh.
7123 In the process the triangle planes are calculated as well.
7124
7125 ============
7126 */
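// For each triangle the loop below forms the edge deltas in object space and texture
// space ( d0[0..2] and d0[3..4] for edge b - a, d1[...] for edge c - a ), takes
// n = d1 x d0 as the plane normal, builds t0 and t1 from the texture-space deltas as the
// (unnormalized) directions of increasing s and t, and normalizes all three with
// FastScalarInvSqrt_x3. The sign of "area", the signed texture-space area
// d0[3] * d1[4] - d0[4] * d1[3], is used to flip the tangents for mirrored mappings.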
7127 void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
7128 int i;
7129
7130 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
7131 memset( used, 0, numVerts * sizeof( used[0] ) );
7132
7133 idPlane *planesPtr = planes;
7134 for ( i = 0; i < numIndexes; i += 3 ) {
7135 idDrawVert *a, *b, *c;
7136 // unsigned long signBit;
7137 float d0[5], d1[5], area;
7138 idVec3 n, t0, t1;
7139 float f1, f2, f3;
7140
7141 int v0 = indexes[i + 0];
7142 int v1 = indexes[i + 1];
7143 int v2 = indexes[i + 2];
7144
7145 a = verts + v0;
7146 b = verts + v1;
7147 c = verts + v2;
7148
7149 d0[0] = b->xyz[0] - a->xyz[0];
7150 d0[1] = b->xyz[1] - a->xyz[1];
7151 d0[2] = b->xyz[2] - a->xyz[2];
7152 d0[3] = b->st[0] - a->st[0];
7153 d0[4] = b->st[1] - a->st[1];
7154
7155 d1[0] = c->xyz[0] - a->xyz[0];
7156 d1[1] = c->xyz[1] - a->xyz[1];
7157 d1[2] = c->xyz[2] - a->xyz[2];
7158 d1[3] = c->st[0] - a->st[0];
7159 d1[4] = c->st[1] - a->st[1];
7160
7161 // normal
7162 n[0] = d1[1] * d0[2] - d1[2] * d0[1];
7163 n[1] = d1[2] * d0[0] - d1[0] * d0[2];
7164 n[2] = d1[0] * d0[1] - d1[1] * d0[0];
7165
7166 f1 = n.x * n.x + n.y * n.y + n.z * n.z;
7167
7168 // area sign bit
7169 area = d0[3] * d1[4] - d0[4] * d1[3];
7170
7171 // first tangent
7172 t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
7173 t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
7174 t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
7175
7176 f2 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
7177
7178 // second tangent
7179 t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
7180 t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
7181 t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
7182
7183 f3 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;
7184
7185 // Behold! The power of the pipeline
7186 FastScalarInvSqrt_x3( &f1, &f2, &f3 );
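// ( FastScalarInvSqrt_x3 presumably refines the three reciprocal square root estimates
//   together so the independent calculations can overlap in the FPU pipeline. )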
7187 #ifdef PPC_INTRINSICS
7188 f2 = __fsel( area, f2, -f2 );
7189 f3 = __fsel( area, f3, -f3 );
7190 #else
7191 f2 = ( area < 0.0f ) ? -f2 : f2;
7192 f3 = ( area < 0.0f ) ? -f3 : f3;
7193 #endif
7194 t0.x *= f2;
7195 t0.y *= f2;
7196 t0.z *= f2;
7197
7198 n.x *= f1;
7199 n.y *= f1;
7200 n.z *= f1;
7201
7202 planesPtr->SetNormal( n );
7203 planesPtr->FitThroughPoint( a->xyz );
7204 planesPtr++;
7205
7206 t1.x *= f3;
7207 t1.y *= f3;
7208 t1.z *= f3;
7209
7210 if ( used[v0] ) {
7211 a->normal += n;
7212 a->tangents[0] += t0;
7213 a->tangents[1] += t1;
7214 } else {
7215 a->normal = n;
7216 a->tangents[0] = t0;
7217 a->tangents[1] = t1;
7218 used[v0] = true;
7219 }
7220
7221 if ( used[v1] ) {
7222 b->normal += n;
7223 b->tangents[0] += t0;
7224 b->tangents[1] += t1;
7225 } else {
7226 b->normal = n;
7227 b->tangents[0] = t0;
7228 b->tangents[1] = t1;
7229 used[v1] = true;
7230 }
7231
7232 if ( used[v2] ) {
7233 c->normal += n;
7234 c->tangents[0] += t0;
7235 c->tangents[1] += t1;
7236 } else {
7237 c->normal = n;
7238 c->tangents[0] = t0;
7239 c->tangents[1] = t1;
7240 used[v2] = true;
7241 }
7242 }
7243 }
7244
7245
7246 #ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
7247
7248 /*
7249 ============
7250 idSIMD_AltiVec::DeriveUnsmoothedTangents
7251
7252 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7253 For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7254 ============
7255 */
7256 #define DERIVE_UNSMOOTHED_BITANGENT
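// Each vertex takes its normal and tangents from a single dominant triangle, formed by
// the vertex itself plus dominantTris[i].v2 and .v3; dt.normalizationScale[0..2] supply
// the scale factors used to normalize t0, t1 and n. With DERIVE_UNSMOOTHED_BITANGENT
// defined, the second tangent is built from a cross product of the normal and the first
// tangent instead of directly from the texture-space deltas.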
7257 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7258
7259 int i;
7260 // idDrawVert size
7261 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7262 // drawverts aligned
7263 assert( IS_16BYTE_ALIGNED( verts[0] ) );
7264
7265 vector float vecVertA, vecVertB, vecVertC;
7266 vector float vecVertA2, vecVertB2, vecVertC2;
7267 vector float vecVertA3, vecVertB3, vecVertC3;
7268 vector float vecVertA4, vecVertB4, vecVertC4;
7269
7270 vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
7271 vector float vecS0, vecS1, vecS2;
7272 vector float vecS0_2, vecS1_2, vecS2_2;
7273 vector float vecS0_3, vecS1_3, vecS2_3;
7274 vector float vecS0_4, vecS1_4, vecS2_4;
7275
7276 vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
7277 vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
7278 vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
7279 vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
7280 vector float vecN, vecN2, vecN3, vecN4;
7281
7282 vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
7283 vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
7284 vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
7285 vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
7286 vector float zeroVector = (vector float)(0);
7287
7288 vector float vecNegOne = (vector float)(-1.0);
7289
7290 vector float vecStore1, vecStore2, vecStore3;
7291 vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7292 vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7293 vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
7294 vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
7295 vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
7296 vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
7297 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7298
7299 vector float vecLd1, vecLd2, vecLd3;
7300 vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
7301
7302 float *normalPtr = verts[0].normal.ToFloatPtr();
7303 float *xyzPtr = verts[0].xyz.ToFloatPtr();
7304
7305 vector float vecFirstHalf, vecSecondHalf;
7306 vector float vecFirstHalf2, vecSecondHalf2;
7307 vector float vecFirstHalf3, vecSecondHalf3;
7308 vector float vecFirstHalf4, vecSecondHalf4;
7309
7310 for ( i = 0; i+3 < numVerts; i+=4 ) {
7311 int bOffset1, bOffset2, bOffset3, bOffset4;
7312 int cOffset1, cOffset2, cOffset3, cOffset4;
7313
7314 bOffset1 = dominantTris[i].v2;
7315 cOffset1 = dominantTris[i].v3;
7316 bOffset2 = dominantTris[i+1].v2;
7317 cOffset2 = dominantTris[i+1].v3;
7318 bOffset3 = dominantTris[i+2].v2;
7319 cOffset3 = dominantTris[i+2].v3;
7320 bOffset4 = dominantTris[i+3].v2;
7321 cOffset4 = dominantTris[i+3].v3;
7322
7323 vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
7324 v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
7325 v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
7326 vecVertA = vec_perm( v0, v1, vecPerm0 );
7327
7328 vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
7329 v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7330 v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
7331 vecVertB = vec_perm( v2, v3, vecPerm1 );
7332
7333 vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7334 v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7335 v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
7336 vecVertC = vec_perm( v4, v5, vecPerm2 );
7337
7338 // put remainder into v2
7339 v1 = vec_perm( v1, v1, vecPerm0 );
7340 v3 = vec_perm( v3, v3, vecPerm1 );
7341 v5 = vec_perm( v5, v5, vecPerm2 );
7342
7343 v1 = vec_mergeh( v1, v5 );
7344 v2 = vec_mergeh( v3, zeroVector );
7345 v2 = vec_mergeh( v1, v2 );
7346 v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7347
7348 // load second one
7349 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7350 v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7351 v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
7352 vecVertA2 = vec_perm( v0, v1, vecPerm0 );
7353
7354 vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
7355 v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7356 v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
7357 vecVertB2 = vec_perm( v3, v4, vecPerm3 );
7358
7359 vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7360 v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7361 v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
7362 vecVertC2 = vec_perm( v5, v6, vecPerm4 );
7363
7364 // put remainder into v3
7365 v1 = vec_perm( v1, v1, vecPerm0 );
7366 v4 = vec_perm( v4, v4, vecPerm3 );
7367 v5 = vec_perm( v6, v6, vecPerm4 );
7368
7369 v1 = vec_mergeh( v1, v5 );
7370 v3 = vec_mergeh( v4, zeroVector );
7371 v3 = vec_mergeh( v1, v3 );
7372 v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7373
7374 // load third one
7375 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7376 v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7377 v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
7378 vecVertA3 = vec_perm( v0, v1, vecPerm0 );
7379
7380 vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
7381 v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7382 v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
7383 vecVertB3 = vec_perm( v4, v5, vecPerm1 );
7384
7385 vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7386 v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7387 v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
7388 vecVertC3 = vec_perm( v6, v7, vecPerm2 );
7389
7390 // put remainder into v4
7391 v1 = vec_perm( v1, v1, vecPerm0 );
7392 v5 = vec_perm( v5, v5, vecPerm1 );
7393 v7 = vec_perm( v7, v7, vecPerm2 );
7394
7395 v1 = vec_mergeh( v1, v7 );
7396 v4 = vec_mergeh( v5, zeroVector );
7397 v4 = vec_mergeh( v1, v4 );
7398 v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7399
7400 // load fourth one
7401 vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7402 v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7403 v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
7404 vecVertA4 = vec_perm( v0, v1, vecPerm0 );
7405
7406 vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
7407 v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7408 v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
7409 vecVertB4 = vec_perm( v5, v6, vecPerm3 );
7410
7411 vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7412 v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7413 v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
7414 vecVertC4 = vec_perm( v7, v8, vecPerm4 );
7415
7416 // put remainder into v5
7417 v1 = vec_perm( v1, v1, vecPerm0 );
7418 v6 = vec_perm( v6, v6, vecPerm3 );
7419 v8 = vec_perm( v8, v8, vecPerm4 );
7420
7421 v1 = vec_mergeh( v1, v8 );
7422 v5 = vec_mergeh( v6, zeroVector );
7423 v5 = vec_mergeh( v1, v5 );
7424 v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
7425
7426 // remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
7427
7428 //vecD1 now holds d0, d1, d2, d3
7429 vecD1 = vec_sub( vecVertB, vecVertA );
7430 vecD4 = vec_sub( vecVertB2, vecVertA2 );
7431 vecD7 = vec_sub( vecVertB3, vecVertA3 );
7432 vecD10 = vec_sub( vecVertB4, vecVertA4 );
7433
7434 // vecD2 now holds d5, d6, d7, d8
7435 vecD2 = vec_sub( vecVertC, vecVertA );
7436 vecD5 = vec_sub( vecVertC2, vecVertA2 );
7437 vecD8 = vec_sub( vecVertC3, vecVertA3 );
7438 vecD11 = vec_sub( vecVertC4, vecVertA4 );
7439
7440 // vecD3 now holds d4, crap, d9, crap
7441 vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
7442 vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
7443 vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
7444 vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
7445
7446 // get permute vectors for loading from dt
7447 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
7448 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
7449 vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
7450 vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
7451
7452 // load S values from dominantTris
7453 v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
7454 v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
7455 v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
7456 v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
7457 v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
7458 v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
7459 v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
7460 v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
7461
7462 v0 = vec_perm( v0, v1, vecPerm1 );
7463 v2 = vec_perm( v2, v3, vecPerm2 );
7464 v4 = vec_perm( v4, v5, vecPerm3 );
7465 v6 = vec_perm( v6, v7, vecPerm4 );
7466
7467 vecS0 = vec_splat( v0, 0 );
7468 vecS1 = vec_splat( v0, 1 );
7469 vecS2 = vec_splat( v0, 2 );
7470
7471 vecS0_2 = vec_splat( v2, 0);
7472 vecS1_2 = vec_splat( v2, 1 );
7473 vecS2_2 = vec_splat( v2, 2 );
7474
7475 vecS0_3 = vec_splat( v4, 0 );
7476 vecS1_3 = vec_splat( v4, 1 );
7477 vecS2_3 = vec_splat( v4, 2 );
7478
7479 vecS0_4 = vec_splat( v6, 0 );
7480 vecS1_4 = vec_splat( v6, 1 );
7481 vecS2_4 = vec_splat( v6, 2 );
7482
7483 // do calculation
7484 vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
7485 vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
7486 vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
7487 vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
7488 vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
7489 vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
7490 vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
7491 vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
7492
7493 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7494 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7495 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7496 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7497
7498 vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
7499 vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
7500 vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
7501 vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
7502 vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
7503 vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
7504 vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
7505 vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
7506
7507 vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7508 vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7509 vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7510 vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7511
7512
7513 // calculate N values
7514 vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
7515 vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
7516 vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
7517 vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
7518
7519 // calculate both halves of the calculation for t
7520 vecWork1 = vecD1;
7521 vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
7522 vecWork3 = vecD4;
7523 vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
7524 vecWork5 = vecD7;
7525 vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
7526 vecWork7 = vecD10;
7527 vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
7528
7529 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7530 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7531 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7532 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7533
7534 vecWork1 = vecD2;
7535 vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
7536 vecWork3 = vecD5;
7537 vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
7538 vecWork5 = vecD8;
7539 vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
7540 vecWork7 = vecD11;
7541 vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
7542
7543 vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
7544 vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
7545 vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
7546 vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
7547
7548 // calculate T values
7549 vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
7550 vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
7551 vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
7552 vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
7553
7554 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7555 vecWork1 = vecD1;
7556 vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
7557 vecWork3 = vecD4;
7558 vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
7559 vecWork5 = vecD7;
7560 vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
7561 vecWork7 = vecD10;
7562 vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
7563
7564 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7565 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7566 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7567 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7568
7569 vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
7570 vecWork2 = vecD2;
7571 vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
7572 vecWork4 = vecD5;
7573 vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
7574 vecWork6 = vecD8;
7575 vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
7576 vecWork8 = vecD11;
7577
7578 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7579 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7580 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7581 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7582
7583 #else
7584 vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
7585 vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
7586 vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
7587 vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
7588 vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
7589 vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
7590 vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
7591 vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
7592
7593 vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7594 vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7595 vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7596 vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7597
7598 vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
7599 vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
7600 vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
7601 vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
7602 vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
7603 vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
7604 vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
7605 vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
7606
7607 vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
7608 vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
7609 vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
7610 vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
7611 #endif
7612 // finish the calculation
7613 vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
7614 vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
7615 vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
7616 vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
7617
7618 vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
7619 vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
7620 vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
7621 vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
7622
7623 // Store results
7624
7625 // read values that we need to preserve
7626 vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
7627 vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
7628
7629 //generate vectors to store
7630 vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
7631 vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
7632 vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
7633
7634 // store out results
7635 ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7636
7637 // read values that we need to preserve
7638 vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
7639
7640 // generate vectors to store
7641 vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
7642 vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
7643 vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
7644
7645 // instead of doing permute, shift it where it needs to be and use vec_ste
7646 // store out vectors
7647 ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7648
7649 // read values that we need to preserve
7650 vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
7651
7652 // generate vectors to store
7653 vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
7654 vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
7655 vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
7656
7657 // store out vectors
7658 ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
7659
7660 // read values that we need to preserve
7661 vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7662 vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
7663
7664 // generate vectors to store
7665 vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
7666 vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
7667 vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
7668
7669 // store out vectors
7670 ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
7671 }
7672
7673 // cleanup
7674 for ( ; i < numVerts; i++ ) {
7675 idDrawVert *a, *b, *c;
7676 float d0, d1, d2, d3, d4;
7677 float d5, d6, d7, d8, d9;
7678 float s0, s1, s2;
7679 float n0, n1, n2;
7680 float t0, t1, t2;
7681 float t3, t4, t5;
7682
7683 const dominantTri_s &dt = dominantTris[i];
7684
7685 a = verts + i;
7686 b = verts + dt.v2;
7687 c = verts + dt.v3;
7688
7689 d0 = b->xyz[0] - a->xyz[0];
7690 d1 = b->xyz[1] - a->xyz[1];
7691 d2 = b->xyz[2] - a->xyz[2];
7692 d3 = b->st[0] - a->st[0];
7693
7694 d4 = b->st[1] - a->st[1];
7695
7696 d5 = c->xyz[0] - a->xyz[0];
7697 d6 = c->xyz[1] - a->xyz[1];
7698 d7 = c->xyz[2] - a->xyz[2];
7699 d8 = c->st[0] - a->st[0];
7700
7701 d9 = c->st[1] - a->st[1];
7702
7703 s0 = dt.normalizationScale[0];
7704 s1 = dt.normalizationScale[1];
7705 s2 = dt.normalizationScale[2];
7706
7707 n0 = s2 * ( d6 * d2 - d7 * d1 );
7708 n1 = s2 * ( d7 * d0 - d5 * d2 );
7709 n2 = s2 * ( d5 * d1 - d6 * d0 );
7710
7711 t0 = s0 * ( d0 * d9 - d4 * d5 );
7712 t1 = s0 * ( d1 * d9 - d4 * d6 );
7713 t2 = s0 * ( d2 * d9 - d4 * d7 );
7714
7715 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7716 t3 = s1 * ( d3 * d5 - d0 * d8 );
7717 t4 = s1 * ( d3 * d6 - d1 * d8 );
7718 t5 = s1 * ( d3 * d7 - d2 * d8 );
7719 #else
7720 t3 = s1 * ( n2 * t1 - n1 * t2 );
7721 t4 = s1 * ( n0 * t2 - n2 * t0 );
7722 t5 = s1 * ( n1 * t0 - n0 * t1 );
7723 #endif
7724
7725 a->normal[0] = n0;
7726 a->normal[1] = n1;
7727 a->normal[2] = n2;
7728
7729 a->tangents[0][0] = t0;
7730 a->tangents[0][1] = t1;
7731 a->tangents[0][2] = t2;
7732
7733 a->tangents[1][0] = t3;
7734 a->tangents[1][1] = t4;
7735 a->tangents[1][2] = t5;
7736 }
7737 }
7738
7739 #else
7740 /*
7741 ============
7742 idSIMD_AltiVec::DeriveUnsmoothedTangents
7743
7744 Derives the normal and orthogonal tangent vectors for the triangle vertices.
7745 For each vertex the normal and tangent vectors are derived from a single dominant triangle.
7746 ============
7747 */
7748 #define DERIVE_UNSMOOTHED_BITANGENT
7749
7750 void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
7751 int i;
7752
7753 for ( i = 0; i < numVerts; i++ ) {
7754 idDrawVert *a, *b, *c;
7755 float d0, d1, d2, d3, d4;
7756 float d5, d6, d7, d8, d9;
7757 float s0, s1, s2;
7758 float n0, n1, n2;
7759 float t0, t1, t2;
7760 float t3, t4, t5;
7761
7762 const dominantTri_s &dt = dominantTris[i];
7763
7764 a = verts + i;
7765 b = verts + dt.v2;
7766 c = verts + dt.v3;
7767
7768 d0 = b->xyz[0] - a->xyz[0];
7769 d1 = b->xyz[1] - a->xyz[1];
7770 d2 = b->xyz[2] - a->xyz[2];
7771 d3 = b->st[0] - a->st[0];
7772
7773 d4 = b->st[1] - a->st[1];
7774
7775 d5 = c->xyz[0] - a->xyz[0];
7776 d6 = c->xyz[1] - a->xyz[1];
7777 d7 = c->xyz[2] - a->xyz[2];
7778 d8 = c->st[0] - a->st[0];
7779
7780 d9 = c->st[1] - a->st[1];
7781
7782 s0 = dt.normalizationScale[0];
7783 s1 = dt.normalizationScale[1];
7784 s2 = dt.normalizationScale[2];
7785
7786 n0 = s2 * ( d6 * d2 - d7 * d1 );
7787 n1 = s2 * ( d7 * d0 - d5 * d2 );
7788 n2 = s2 * ( d5 * d1 - d6 * d0 );
7789
7790 t0 = s0 * ( d0 * d9 - d4 * d5 );
7791 t1 = s0 * ( d1 * d9 - d4 * d6 );
7792 t2 = s0 * ( d2 * d9 - d4 * d7 );
7793
7794 #ifndef DERIVE_UNSMOOTHED_BITANGENT
7795 t3 = s1 * ( d3 * d5 - d0 * d8 );
7796 t4 = s1 * ( d3 * d6 - d1 * d8 );
7797 t5 = s1 * ( d3 * d7 - d2 * d8 );
7798 #else
7799 t3 = s1 * ( n2 * t1 - n1 * t2 );
7800 t4 = s1 * ( n0 * t2 - n2 * t0 );
7801 t5 = s1 * ( n1 * t0 - n0 * t1 );
7802 #endif
7803
7804 a->normal[0] = n0;
7805 a->normal[1] = n1;
7806 a->normal[2] = n2;
7807
7808 a->tangents[0][0] = t0;
7809 a->tangents[0][1] = t1;
7810 a->tangents[0][2] = t2;
7811
7812 a->tangents[1][0] = t3;
7813 a->tangents[1][1] = t4;
7814 a->tangents[1][2] = t5;
7815 }
7816
7817 }
7818 #endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
7819
7820 /*
7821 ============
7822 idSIMD_AltiVec::NormalizeTangents
7823
7824 Normalizes each vertex normal and projects and normalizes the
7825 tangent vectors onto the plane orthogonal to the vertex normal.
7826 ============
7827 */
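// Roughly, for each vertex: n = normalize( n ), then for each tangent t:
// t = normalize( t - ( t . n ) * n ), processed four vertices at a time below.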
7828 void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
7829
7830 // idDrawVert size
7831 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
7832
7833 float *addr = verts[0].normal.ToFloatPtr();
7834 float *tAddr = verts[0].tangents[0].ToFloatPtr();
7835
7836 // v0 through v3 maintain originally loaded values so we don't take
7837 // as much hit for unaligned stores
7838 vector float v0, v1, v2, v3;
7839 // v5 through v8 are the "working" values of the vectors
7840 vector float v5, v6, v7, v8;
7841 // working values
7842 vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
7843 vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
7844 vector float vecF, vecF2;
7845 vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
7846
7847 register vector float zeroVector = (vector float)(0.0);
7848
7849 vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
7850 vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
7851 vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
7852 vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
7853 vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
7854
7855 vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
7856 vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
7857
7858 vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
7859 vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
7860 vector unsigned char storeT41, storeT42;
7861
7862 int i = 0;
7863
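// The normal and tangent fields do not sit on 16-byte boundaries inside idDrawVert, so
// the loop uses the usual AltiVec misaligned-access idiom: vec_lvsl builds permute
// vectors for the loads, and vec_lvsr builds the rotate permutes used before the
// element-wise vec_ste stores further down.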
7864 if ( i+3 < numVerts ) {
7865 // for loading normal from idDrawVert
7866 vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
7867 vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7868 vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7869 vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7870
7871 // for loading tangents from idDrawVert
7872 vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7873 vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7874 vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7875 vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7876 vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7877 vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7878 vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
7879 vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
7880
7881 // generate permute vectors to store normals
7882 storePerm0 = vec_lvsr( 0, addr );
7883 storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
7884 storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
7885 storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
7886
7887 // generate permute vectors to store tangents
7888 storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7889 storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
7890
7891 storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7892 storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
7893
7894 storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7895 storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
7896
7897 storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7898 storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
7899 }
7900
7901 for ( ; i+3 < numVerts; i+=4 ) {
7902
7903 // load normals
7904 vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
7905 vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
7906 v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
7907
7908 vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
7909 vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
7910 v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
7911
7912 vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7913 vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
7914 v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
7915
7916 vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
7917 vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
7918 v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
7919
7920 // zero out the unused last element of each vector
7921 v0 = vec_perm( v0, zeroVector, vecPermLast );
7922 v1 = vec_perm( v1, zeroVector, vecPermLast );
7923 v2 = vec_perm( v2, zeroVector, vecPermLast );
7924 v3 = vec_perm( v3, zeroVector, vecPermLast );
7925
7926 // got 4 vectors in v0 through v3, sum them each across
7927 // and put into one vector
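// horizontal add: rotating a vector by 8 and then 4 bytes with vec_sld and adding
// leaves the total of all four elements in element 0 (the fourth element is zero here)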
7928 vecTemp = vec_madd( v0, v0, zeroVector );
7929
7930 vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
7931 vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
7932 // element 0 of vecSum now holds the squared length of v0
7933
7934 vecTemp2 = vec_madd( v1, v1, zeroVector );
7935 tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
7936 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7937 // put this into vecSum
7938 vecSum = vec_mergeh( vecSum, tempSum );
7939
7940 vecTemp3 = vec_madd( v2, v2, zeroVector );
7941 tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
7942 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7943 // put this into vecSum
7944 vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
7945
7946 vecTemp4 = vec_madd( v3, v3, zeroVector );
7947 tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
7948 tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
7949 // put this into vecSum
7950 vecSum = vec_perm( vecSum, tempSum, vecPermLast );
7951
7952 // take reciprocal square roots of these
7953 vecF = ReciprocalSquareRoot( vecSum );
7954
7955 // multiply each vector by f
7956 v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
7957 v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
7958 v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
7959 v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
7960
7961 // load tangents as unaligned
7962 vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
7963 vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
7964 vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
7965
7966 vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7967 vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7968 vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
7969
7970 vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7971 vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7972 vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
7973
7974 vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7975 vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7976 vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
7977
7978 vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
7979 vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
7980 vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
7981 vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
7982 vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
7983 vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
7984 vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
7985 vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
7986
7987 //zero out last element of tangents
7988 vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
7989 vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
7990 vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
7991 vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
7992 vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
7993 vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
7994 vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
7995 vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
7996
7997 // all tangents[0]
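// Gram-Schmidt step: subtract from each tangent its projection onto the vertex normal
// (already normalized above), t -= ( t . n ) * n, leaving the part of t orthogonal to n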
7998 tempSum = zeroVector;
7999 tempSum = vec_madd( vec1T0, v5, tempSum );
8000 // sum across tempSum
8001 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8002 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8003 // put the summed value splatted across vecTSum1
8004 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8005 vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8006
8007 //vec1T0 now contains what needs to be rsqrt'd and multiplied by f
8008 vec1T0 = vec_sub( vec1T0, vecTSum1 );
8009
8010 tempSum = zeroVector;
8011 tempSum = vec_madd( vec2T0, v6, tempSum );
8012
8013 // sum across tempSum
8014 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8015 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8016 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8017 vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8018 vec2T0 = vec_sub( vec2T0, vecTSum1 );
8019
8020 tempSum = zeroVector;
8021 tempSum = vec_madd( vec3T0, v7, tempSum );
8022
8023 // sum across tempSum
8024 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8025 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8026 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8027 vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8028 vec3T0 = vec_sub( vec3T0, vecTSum1 );
8029
8030 tempSum = zeroVector;
8031 tempSum = vec_madd( vec4T0, v8, tempSum );
8032
8033 // sum across tempSum
8034 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8035 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8036 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8037 vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8038 vec4T0 = vec_sub( vec4T0, vecTSum1 );
8039
8040 // all tangents[1]
8041 tempSum = zeroVector;
8042 tempSum = vec_madd( vec1T1, v5, tempSum );
8043
8044 // sum across tempSum
8045 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8046 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8047 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8048 vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
8049
8050 //vec1T1 now contains what needs to be rsqrt'd and multiplied by f
8051 vec1T1 = vec_sub( vec1T1, vecTSum1 );
8052
8053 tempSum = zeroVector;
8054 tempSum = vec_madd( vec2T1, v6, tempSum );
8055
8056 // sum across tempSum
8057 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8058 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8059 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8060 vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
8061 vec2T1 = vec_sub( vec2T1, vecTSum1 );
8062
8063 tempSum = zeroVector;
8064 tempSum = vec_madd( vec3T1, v7, tempSum );
8065
8066 // sum across tempSum
8067 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8068 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8069 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8070 vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
8071 vec3T1 = vec_sub( vec3T1, vecTSum1 );
8072
8073 tempSum = zeroVector;
8074 tempSum = vec_madd( vec4T1, v8, tempSum );
8075
8076 // sum across tempSum
8077 vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8078 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8079 vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
8080 vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
8081 vec4T1 = vec_sub( vec4T1, vecTSum1 );
8082
8083
8084 // sum across vectors and put into one vector
8085 vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
8086 vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8087 vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
8088
8089 // element 0 of vecTSum1 now has the squared length of vec1T0
8090 vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
8091 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8092 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8093 // put this into vecSum
8094 vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
8095 vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
8096 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8097 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8098 // put this into vecSum
8099 vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
8100 vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
8101 tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8102 tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
8103 // put this into vecSum
8104 vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
8105
8106 vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
8107 vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8108 vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
8109 // element 0 of vecTSum2 now has the squared length of vec1T1
8110 vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
8111 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8112 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8113 // put this into vecSum
8114 vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
8115 vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
8116 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8117 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8118 // put this into vecSum
8119 vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
8120 vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
8121 tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
8122 tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
8123 // put this into vecSum
8124 vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
8125
8126 // tangents[0]
8127 vecF = ReciprocalSquareRoot( vecTSum1 );
8128 // tangents[1]
8129 vecF2 = ReciprocalSquareRoot( vecTSum2 );
8130
8131 // multiply each tangent vector by f
8132
8133 vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
8134 vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
8135 vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
8136 vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
8137
8138 vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
8139 vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
8140 vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
8141 vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
8142
8143 // rotate input data
8144 v5 = vec_perm( v5, v5, storePerm0 );
8145 v6 = vec_perm( v6, v6, storePerm1 );
8146 v7 = vec_perm( v7, v7, storePerm2 );
8147 v8 = vec_perm( v8, v8, storePerm3 );
8148
8149 vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8150 vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8151 vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
8152
8153 vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8154 vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8155 vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
8156
8157 vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8158 vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8159 vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
8160
8161 vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8162 vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8163 vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
8164
8165 // store tangents[0] and tangents[1]
8166 vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
8167 vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
8168
8169 vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8170 vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8171 vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8172 vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8173 vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8174 vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
8175
8176 // store second tangents[0] and tangents[1]
8177 vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
8178 vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
8179
8180 vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8181 vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8182 vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8183 vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8184 vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8185 vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
8186
8187 // store third tangents[0] and tangents[1]
8188 vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
8189 vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
8190
8191 vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8192 vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8193 vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8194 vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8195 vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8196 vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
8197
8198 // store fourth tangents[0] and tangents[1]
8199 vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
8200 vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
8201
8202 vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8203 vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8204 vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8205 vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8206 vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8207 vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
8208 }
8209
8210 // cleanup
8211 for ( ; i < numVerts; i++ ) {
8212 idVec3 &v = verts[i].normal;
8213 float f;
8214
8215 //f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8216 f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
8217 v.x *= f; v.y *= f; v.z *= f;
8218
8219 for ( int j = 0; j < 2; j++ ) {
8220 idVec3 &t = verts[i].tangents[j];
8221
8222 t -= ( t * v ) * v;
8223 // f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8224 f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
8225 t.x *= f; t.y *= f; t.z *= f;
8226 }
8227 }
8228 }
8229 #endif /* ENABLE_DERIVE */
8230
8231 #ifdef ENABLE_CREATE
8232
8233 /*
8234 ============
8235 idSIMD_AltiVec::CreateTextureSpaceLightVectors
8236
8237 Calculates light vectors in texture space for the given triangle vertices.
8238 For each vertex the direction towards the light origin is projected onto texture space.
8239 The light vectors are only calculated for the vertices referenced by the indexes.
8240 ============
8241 */
8242
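// Scalar sketch of the per-vertex work done below: the direction to the light is
// expressed in the vertex's texture-space basis with three dot products,
//   lightDir        = lightOrigin - v->xyz;
//   lightVectors[i] = ( lightDir . tangents[0], lightDir . tangents[1], lightDir . normal );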
8243 void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8244
8245 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8246 memset( used, 0, numVerts * sizeof( used[0] ) );
8247
8248 int i;
8249 for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8250 used[indexes[i]] = true;
8251 used[indexes[i+1]] = true;
8252 used[indexes[i+2]] = true;
8253 used[indexes[i+3]] = true;
8254 used[indexes[i+4]] = true;
8255 used[indexes[i+5]] = true;
8256 used[indexes[i+6]] = true;
8257 used[indexes[i+7]] = true;
8258 }
8259
8260 for ( ; i < numIndexes; i++ ) {
8261 used[indexes[i]] = true;
8262 }
8263
8264 for ( i = 0; i+1 < numVerts; i+=2 ) {
8265
8266 const idDrawVert *v = &verts[i];
8267 const idDrawVert *v2 = &verts[i+1];
8268
8269 float x, y, z;
8270 float x2, y2, z2;
8271 idVec3 lightDir, lightDir2;
8272
8273 lightDir[0] = lightOrigin[0] - v->xyz[0];
8274 lightDir[1] = lightOrigin[1] - v->xyz[1];
8275 lightDir[2] = lightOrigin[2] - v->xyz[2];
8276
8277 lightDir2[0] = lightOrigin[0] - v2->xyz[0];
8278 lightDir2[1] = lightOrigin[1] - v2->xyz[1];
8279 lightDir2[2] = lightOrigin[2] - v2->xyz[2];
8280
8281 x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8282 y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8283 z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8284
8285 x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
8286 y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
8287 z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
8288
8289 if ( used[i] ) {
8290 lightVectors[i][0] = x;
8291 lightVectors[i][1] = y;
8292 lightVectors[i][2] = z;
8293 }
8294
8295 if ( used[i+1] ) {
8296 lightVectors[i+1][0] = x2;
8297 lightVectors[i+1][1] = y2;
8298 lightVectors[i+1][2] = z2;
8299 }
8300 }
8301
8302 // cleanup
8303 for ( ; i < numVerts; i++ ) {
8304 if ( !used[i] ) {
8305 continue;
8306 }
8307
8308 const idDrawVert *v = &verts[i];
8309 idVec3 lightDir;
8310
8311 lightDir[0] = lightOrigin[0] - v->xyz[0];
8312 lightDir[1] = lightOrigin[1] - v->xyz[1];
8313 lightDir[2] = lightOrigin[2] - v->xyz[2];
8314
8315 lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
8316 lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
8317 lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
8318 }
8319 }
8320
8321 #if 1
8322 /*
8323 ============
8324 idSIMD_AltiVec::CreateSpecularTextureCoords
8325
8326 Calculates specular texture coordinates for the given triangle vertices.
8327 For each vertex the normalized direction towards the light origin is added to the
8328 normalized direction towards the view origin and the result is projected onto texture space.
8329 The texture coordinates are only calculated for the vertices referenced by the indexes.
8330 ============
8331 */
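// Scalar sketch of the per-vertex work done below: the normalized directions to the
// light and to the viewer are summed into an (unnormalized) half-angle vector, which is
// then projected onto the tangents[0] / tangents[1] / normal basis and stored with w = 1:
//   h = normalize( lightOrigin - xyz ) + normalize( viewOrigin - xyz );
//   texCoords[i] = ( h . tangents[0], h . tangents[1], h . normal, 1.0f );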
8332 void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
8333
8334 bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
8335 memset( used, 0, numVerts * sizeof( used[0] ) );
8336
8337 int i;
8338 for ( i = 0; i+7 < numIndexes; i+= 8 ) {
8339 used[indexes[i]] = true;
8340 used[indexes[i+1]] = true;
8341 used[indexes[i+2]] = true;
8342 used[indexes[i+3]] = true;
8343 used[indexes[i+4]] = true;
8344 used[indexes[i+5]] = true;
8345 used[indexes[i+6]] = true;
8346 used[indexes[i+7]] = true;
8347 }
8348
8349 for ( ; i < numIndexes; i++ ) {
8350 used[indexes[i]] = true;
8351 }
8352
8353 // load lightOrigin and viewOrigin into vectors
8354 const float *lightOriginPtr = lightOrigin.ToFloatPtr();
8355 const float *viewOriginPtr = viewOrigin.ToFloatPtr();
8356 vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
8357 vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
8358 vector float v0 = vec_ld( 0, lightOriginPtr );
8359 vector float v1 = vec_ld( 15, lightOriginPtr );
8360 vector float v2 = vec_ld( 0, viewOriginPtr );
8361 vector float v3 = vec_ld( 15, viewOriginPtr );
8362 vector float vecLightOrigin = vec_perm( v0, v1, permVec );
8363 vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
8364 const vector float zeroVector = (vector float)(0);
8365 int index;
8366
8367 for ( index = 0; index+1 < numVerts; index+=2 ) {
8368 const float *vertPtr = verts[index].xyz.ToFloatPtr();
8369 const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
8370
8371 permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8372 permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
8373
8374 v0 = vec_ld( 0, vertPtr );
8375 v1 = vec_ld( 15, vertPtr );
8376 vector float v2 = vec_ld( 31, vertPtr );
8377 vector float v3 = vec_ld( 47, vertPtr );
8378 vector float v4 = vec_ld( 63, vertPtr );
8379
8380 vector float v5 = vec_ld( 0, vertPtr2 );
8381 vector float v6 = vec_ld( 15, vertPtr2 );
8382 vector float v7 = vec_ld( 31, vertPtr2 );
8383 vector float v8 = vec_ld( 47, vertPtr2 );
8384 vector float v9 = vec_ld( 63, vertPtr2 );
8385
8386 // figure out what values go where
8387 vector float vecXYZ = vec_perm( v0, v1, permVec );
8388 vector float vecNormal = vec_perm( v1, v2, permVec );
8389 vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8390 const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8391 permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8392 const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8393
8394 vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
8395 vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
8396 vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
8397 const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
8398 permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
8399 const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
8400
8401 // calculate lightDir
8402 vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8403 vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8404
8405 vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
8406 vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
8407
8408 // calculate distance
8409 vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8410 vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8411
8412 vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
8413 vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
8414
8415 // sum across first 3 elements of vector
8416 vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8417 vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8418 vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8419 vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8420
8421 vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
8422 vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
8423 vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
8424 vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
8425
8426 // splat sum across the whole vector
8427 vecTempLight = vec_splat( vecTempLight, 0 );
8428 vecTempView = vec_splat( vecTempView, 0 );
8429
8430 vecTempLight2 = vec_splat( vecTempLight2, 0 );
8431 vecTempView2 = vec_splat( vecTempView2, 0 );
8432
8433 vecTempLight = ReciprocalSquareRoot( vecTempLight );
8434 vecTempView = ReciprocalSquareRoot( vecTempView );
8435
8436 vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
8437 vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
8438
8439 // modify light and view vectors based on ilength
8440 vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8441 vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8442
8443 vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
8444 vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
8445
8446 // calculate what to store in each texture coord
8447 vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8448 vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8449 vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8450
8451 vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
8452 vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
8453 vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
8454
8455 // sum across first 3 elements of vector
8456 vector float tempSum3;
8457 tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8458 vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8459 tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8460 vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8461 tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8462 vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8463
8464 tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
8465 vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
8466 tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
8467 vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
8468 vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
8469 vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
8470
8471 vecTC0 = vec_splat( vecTC0, 0 );
8472 vecTC1 = vec_splat( vecTC1, 0 );
8473 vecTC2 = vec_splat( vecTC2, 0 );
8474
8475 vecTC3 = vec_splat( vecTC3, 0 );
8476 vecTC4 = vec_splat( vecTC4, 0 );
8477 vecTC5 = vec_splat( vecTC5, 0 );
8478
8479 if ( used[index] ) {
8480 // store out results
8481 vec_ste( vecTC0, 0, &texCoords[index][0] );
8482 vec_ste( vecTC1, 0, &texCoords[index][1] );
8483 vec_ste( vecTC2, 0, &texCoords[index][2] );
8484 vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8485 }
8486
8487 if ( used[index+1] ) {
8488 vec_ste( vecTC3, 0, &texCoords[index+1][0] );
8489 vec_ste( vecTC4, 0, &texCoords[index+1][1] );
8490 vec_ste( vecTC5, 0, &texCoords[index+1][2] );
8491 vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
8492 }
8493 }
8494
8495 // cleanup
8496 for ( ; index < numVerts; index++ ) {
8497 if ( !used[index] ) {
8498 continue;
8499 }
8500
8501 const float *vertPtr = verts[index].xyz.ToFloatPtr();
8502
8503 permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
8504
8505 v0 = vec_ld( 0, vertPtr );
8506 v1 = vec_ld( 15, vertPtr );
8507 vector float v2 = vec_ld( 31, vertPtr );
8508 vector float v3 = vec_ld( 47, vertPtr );
8509 vector float v4 = vec_ld( 63, vertPtr );
8510
8511 // figure out what values go where
8512 vector float vecXYZ = vec_perm( v0, v1, permVec );
8513 vector float vecNormal = vec_perm( v1, v2, permVec );
8514 vecNormal = vec_sld( vecNormal, vecNormal, 4 );
8515 const vector float vecTangent0 = vec_perm( v2, v3, permVec );
8516 permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
8517 const vector float vecTangent1 = vec_perm( v3, v4, permVec );
8518
8519 // calculate lightDir
8520 vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
8521 vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
8522
8523 // calculate distance
8524 vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
8525 vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
8526
8527 // sum across first 3 elements of vector
8528 vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
8529 vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
8530 vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
8531 vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
8532
8533 // splat sum across the whole vector
8534 vecTempLight = vec_splat( vecTempLight, 0 );
8535 vecTempView = vec_splat( vecTempView, 0 );
8536
8537 vecTempLight = ReciprocalSquareRoot( vecTempLight );
8538 vecTempView = ReciprocalSquareRoot( vecTempView );
8539
8540 // modify light and view vectors based on ilength
8541 vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
8542 vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
8543
8544 // calculate what to store in each texture coord
8545 vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
8546 vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
8547 vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
8548
8549 // sum across first 3 elements of vector
8550 vector float tempSum3;
8551 tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
8552 vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
8553 tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
8554 vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
8555 tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
8556 vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
8557
8558 vecTC0 = vec_splat( vecTC0, 0 );
8559 vecTC1 = vec_splat( vecTC1, 0 );
8560 vecTC2 = vec_splat( vecTC2, 0 );
8561
8562 // store out results
8563 vec_ste( vecTC0, 0, &texCoords[index][0] );
8564 vec_ste( vecTC1, 0, &texCoords[index][1] );
8565 vec_ste( vecTC2, 0, &texCoords[index][2] );
8566 vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
8567
8568 }
8569 }
8570 #endif /* 0 to disable spec coord */
8571
8572 #if 1
8573
8574 #ifdef VERTEXCACHE_ALIGNED
8575 /*
8576 ============
8577 idSIMD_AltiVec::CreateShadowCache
8578 ============
8579 */
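// Each vertex that is not yet remapped is expanded into two idVec4 cache entries:
// ( x, y, z, 1 ) for the unprojected vertex and ( x - lightOrigin.x, ..., 0 ) for the
// copy that the w = 0 division projects away from the light; vertRemap[i] records the
// index of the first entry of the pair.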
8580 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8581 int outVerts = 0;
8582 int i = 0;
8583
8584 assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8585
8586 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8587 register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8588 register vector float zeroVector = (vector float)(0.0);
8589 register vector float oneVector = (vector float)(1);
8590 register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8591
8592 const float *lPtr = lightOrigin.ToFloatPtr();
8593 const float *vPtr;
8594 const float *vPtr2;
8595 const float *vPtr3;
8596 const float *vPtr4;
8597
8598 // put values into a vector
8599 vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8600 v0 = vec_ld( 0, lPtr );
8601 v1 = vec_ld( 15, lPtr );
8602 v0 = vec_perm( v0, v1, vecPerm );
8603 v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8604
8605 //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8606 for ( ; i+3 < numVerts; i+= 4 ) {
8607 if ( ! vertRemap[i] ) {
8608 vPtr = verts[i].xyz.ToFloatPtr();
8609
8610 #ifndef DRAWVERT_PADDED
8611 vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8612 v2 = vec_ld( 0, vPtr );
8613 v3 = vec_ld( 15, vPtr );
8614 v7 = vec_perm( v2, v3, vecPerm2 );
8615 #else
8616 v7 = vec_ld( 0, vPtr );
8617 #endif
8618 v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8619 v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8620 v1 = vec_sub( v2, v0 );
8621
8622 vec_st( v3, 0, &vertexCache[outVerts][0] );
8623 vec_st( v1, 0, &vertexCache[outVerts+1][0] );
8624
8625 vertRemap[i] = outVerts;
8626 outVerts += 2;
8627 }
8628
8629 if ( ! vertRemap[i+1] ) {
8630 vPtr2 = verts[i+1].xyz.ToFloatPtr();
8631
8632 #ifndef DRAWVERT_PADDED
8633 vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8634 v4 = vec_ld( 0, vPtr2 );
8635 v5 = vec_ld( 15, vPtr2 );
8636 v6 = vec_perm( v4, v5, vecPerm3 );
8637 #else
8638 v6 = vec_ld( 0, vPtr2 );
8639 #endif
8640 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8641 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8642 v6 = vec_sub( v4, v0 );
8643
8644 vec_st( v5, 0, &vertexCache[outVerts][0] );
8645 vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8646
8647 vertRemap[i+1] = outVerts;
8648 outVerts += 2;
8649 }
8650
8651 if ( ! vertRemap[i+2] ) {
8652 vPtr3 = verts[i+2].xyz.ToFloatPtr();
8653
8654 #ifndef DRAWVERT_PADDED
8655 vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8656 v1 = vec_ld( 0, vPtr3 );
8657 v2 = vec_ld( 15, vPtr3 );
8658 v3 = vec_perm( v1, v2, vecPerm4 );
8659 #else
8660 v3 = vec_ld( 0, vPtr3 );
8661 #endif
8662 v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8663 v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8664 v3 = vec_sub( v1, v0 );
8665
8666 vec_st( v2, 0, &vertexCache[outVerts][0] );
8667 vec_st( v3, 0, &vertexCache[outVerts+1][0] );
8668
8669 vertRemap[i+2] = outVerts;
8670 outVerts += 2;
8671 }
8672
8673 if ( ! vertRemap[i+3] ) {
8674 vPtr4 = verts[i+3].xyz.ToFloatPtr();
8675 #ifndef DRAWVERT_PADDED
8676 vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8677 v4 = vec_ld( 0, vPtr4 );
8678 v5 = vec_ld( 16, vPtr4 );
8679 v6 = vec_perm( v4, v5, vecPerm5 );
8680 #else
8681 v6 = vec_ld( 0, vPtr4 );
8682 #endif
8683 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8684 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8685 v6 = vec_sub( v4, v0 );
8686
8687 vec_st( v5, 0, &vertexCache[outVerts][0] );
8688 vec_st( v6, 0, &vertexCache[outVerts+1][0] );
8689
8690 vertRemap[i+3] = outVerts;
8691 outVerts += 2;
8692 }
8693 }
8694
8695 // cleanup
8696 for (; i < numVerts; i++ ) {
8697 if ( vertRemap[i] ) {
8698 continue;
8699 }
8700 const float *v = verts[i].xyz.ToFloatPtr();
8701 vertexCache[outVerts+0][0] = v[0];
8702 vertexCache[outVerts+0][1] = v[1];
8703 vertexCache[outVerts+0][2] = v[2];
8704 vertexCache[outVerts+0][3] = 1.0f;
8705
8706 // R_SetupProjection() builds the projection matrix with a slight crunch
8707 // for depth, which keeps this w=0 division from rasterizing right at the
8708 // wrap around point and causing depth fighting with the rear caps
8709 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8710 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8711 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8712 vertexCache[outVerts+1][3] = 0.0f;
8713 vertRemap[i] = outVerts;
8714 outVerts += 2;
8715 }
8716 return outVerts;
8717 }
8718
8719 #else
8720
8721 /*
8722 ============
8723 idSIMD_AltiVec::CreateShadowCache
8724 ============
8725 */
8726 int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
8727 int outVerts = 0;
8728 int i = 0;
8729
8730 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8731 register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
8732 register vector float zeroVector = (vector float)(0.0);
8733 register vector float oneVector = (vector float)(1);
8734 register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8735
8736 const float *lPtr = lightOrigin.ToFloatPtr();
8737 const float *vPtr;
8738 const float *vPtr2;
8739 const float *vPtr3;
8740 const float *vPtr4;
8741
8742 // put values into a vector
8743 vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
8744 v0 = vec_ld( 0, lPtr );
8745 v1 = vec_ld( 15, lPtr );
8746 v0 = vec_perm( v0, v1, vecPerm );
8747 v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
8748
8749 //v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
8750 for ( ; i+3 < numVerts; i+= 4 ) {
8751 if ( ! vertRemap[i] ) {
8752 vPtr = verts[i].xyz.ToFloatPtr();
8753 #ifndef DRAWVERT_PADDED
8754 vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
8755 v2 = vec_ld( 0, vPtr );
8756 v3 = vec_ld( 15, vPtr );
8757 v7 = vec_perm( v2, v3, vecPerm2 );
8758 #else
8759 v7 = vec_ld( 0, vPtr );
8760 #endif
8761 v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
8762 v3 = vec_perm( v7, oneVector, vecPermZeroLast );
8763 v1 = vec_sub( v2, v0 );
8764
8765 // store results
8766 UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
8767
8768 vertRemap[i] = outVerts;
8769 outVerts += 2;
8770 }
8771
8772 if ( ! vertRemap[i+1] ) {
8773 vPtr2 = verts[i+1].xyz.ToFloatPtr();
8774 #ifndef DRAWVERT_PADDED
8775 vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
8776 v4 = vec_ld( 0, vPtr2 );
8777 v5 = vec_ld( 15, vPtr2 );
8778 v6 = vec_perm( v4, v5, vecPerm3 );
8779 #else
8780 v6 = vec_ld( 0, vPtr2 );
8781 #endif
8782 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8783 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8784 v6 = vec_sub( v4, v0 );
8785
8786 // store results
8787 UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8788
8789 vertRemap[i+1] = outVerts;
8790 outVerts += 2;
8791 }
8792
8793 if ( ! vertRemap[i+2] ) {
8794 vPtr3 = verts[i+2].xyz.ToFloatPtr();
8795 #ifndef DRAWVERT_PADDED
8796 vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
8797 v1 = vec_ld( 0, vPtr3 );
8798 v2 = vec_ld( 15, vPtr3 );
8799 v3 = vec_perm( v1, v2, vecPerm4 );
8800 #else
8801 v3 = vec_ld( 0, vPtr3 );
8802 #endif
8803 v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
8804 v2 = vec_perm( v3, oneVector, vecPermZeroLast );
8805 v3 = vec_sub( v1, v0 );
8806
8807 // store results
8808 UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
8809
8810 vertRemap[i+2] = outVerts;
8811 outVerts += 2;
8812 }
8813 if ( ! vertRemap[i+3] ) {
8814 vPtr4 = verts[i+3].xyz.ToFloatPtr();
8815 #ifndef DRAWVERT_PADDED
8816 vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
8817 v4 = vec_ld( 0, vPtr4 );
8818 v5 = vec_ld( 16, vPtr4 );
8819 v6 = vec_perm( v4, v5, vecPerm5 );
8820 #else
8821 v6 = vec_ld( 0, vPtr4 );
8822 #endif
8823
8824 v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
8825 v5 = vec_perm( v6, oneVector, vecPermZeroLast );
8826 v6 = vec_sub( v4, v0 );
8827
8828 // store results
8829 UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
8830
8831
8832 vertRemap[i+3] = outVerts;
8833 outVerts += 2;
8834 }
8835 }
8836
8837 // cleanup
8838 for (; i < numVerts; i++ ) {
8839 if ( vertRemap[i] ) {
8840 continue;
8841 }
8842 const float *v = verts[i].xyz.ToFloatPtr();
8843 vertexCache[outVerts+0][0] = v[0];
8844 vertexCache[outVerts+0][1] = v[1];
8845 vertexCache[outVerts+0][2] = v[2];
8846 vertexCache[outVerts+0][3] = 1.0f;
8847
8848 // R_SetupProjection() builds the projection matrix with a slight crunch
8849 // for depth, which keeps this w=0 division from rasterizing right at the
8850 // wrap around point and causing depth fighting with the rear caps
8851 vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
8852 vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
8853 vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
8854 vertexCache[outVerts+1][3] = 0.0f;
8855 vertRemap[i] = outVerts;
8856 outVerts += 2;
8857 }
8858 return outVerts;
8859 }
8860 #endif /* VERTEXCACHE_ALIGNED */
8861
8862 #endif /* 0 to disable shadow cache */
8863
8864 #if 1
8865
8866 #ifdef VERTEXCACHE_ALIGNED
8867 /*
8868 ============
8869 idSIMD_AltiVec::CreateVertexProgramShadowCache
8870 ============
8871 */
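// Unlike CreateShadowCache above, every vertex is emitted unconditionally: the pair
// ( x, y, z, 1 ) and ( x, y, z, 0 ) is written for vertex i at cache indexes 2i and 2i+1,
// leaving the actual projection away from the light to the vertex program.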
8872 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8873
8874 // vertexCache aligned
8875 assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
8876 // idDrawVert size
8877 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8878 // idVec4 size
8879 assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8880
8881 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8882 register vector float zeroVector = (vector float)(0.0);
8883 register vector float oneVector = (vector float)(1);
8884 register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8885 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8886 int i = 0;
8887
8888 #ifndef DRAWVERT_PADDED
8889 // every fourth one will have the same alignment. Make sure we've got enough here
8890 if ( i+3 < numVerts ) {
8891 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8892 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8893 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8894 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8895 }
8896 #endif
8897
8898 for ( ; i+3 < numVerts; i+=4 ) {
8899 const float *vertPtr = verts[i].xyz.ToFloatPtr();
8900 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8901 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8902 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8903
8904 #ifndef DRAWVERT_PADDED
8905 v0 = vec_ld( 0, vertPtr );
8906 v1 = vec_ld( 15, vertPtr );
8907 v2 = vec_ld( 0, vertPtr2 );
8908 v3 = vec_ld( 15, vertPtr2 );
8909 v4 = vec_ld( 0, vertPtr3 );
8910 v5 = vec_ld( 15, vertPtr3 );
8911 v6 = vec_ld( 0, vertPtr4 );
8912 v7 = vec_ld( 15, vertPtr4 );
8913
8914 v0 = vec_perm( v0, v1, vertPerm1 );
8915 v1 = vec_perm( v2, v3, vertPerm2 );
8916 v2 = vec_perm( v4, v5, vertPerm3 );
8917 v3 = vec_perm( v6, v7, vertPerm4 );
8918 #else
8919 v0 = vec_ld( 0, vertPtr );
8920 v1 = vec_ld( 0, vertPtr2 );
8921 v2 = vec_ld( 0, vertPtr3 );
8922 v3 = vec_ld( 0, vertPtr4 );
8923 #endif
8924
8925 v0 = vec_perm( v0, oneVector, vecPermThreeOne );
8926 v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
8927
8928 v1 = vec_perm( v1, oneVector, vecPermThreeOne );
8929 v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
8930
8931 v2 = vec_perm( v2, oneVector, vecPermThreeOne );
8932 v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
8933
8934 v3 = vec_perm( v3, oneVector, vecPermThreeOne );
8935 v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
8936
8937 // store results
8938 ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
8939 ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
8940
8941 }
8942
8943 // cleanup
8944 for ( ; i < numVerts; i++ ) {
8945 const float *v = verts[i].xyz.ToFloatPtr();
8946 vertexCache[i*2+0][0] = v[0];
8947 vertexCache[i*2+1][0] = v[0];
8948 vertexCache[i*2+0][1] = v[1];
8949 vertexCache[i*2+1][1] = v[1];
8950 vertexCache[i*2+0][2] = v[2];
8951 vertexCache[i*2+1][2] = v[2];
8952 vertexCache[i*2+0][3] = 1.0f;
8953 vertexCache[i*2+1][3] = 0.0f;
8954 }
8955 return numVerts * 2;
8956 }
8957
8958 #else
8959 /*
8960 ============
8961 idSIMD_AltiVec::CreateVertexProgramShadowCache
8962 ============
8963 */
8964 int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
8965
8966 // idDrawVert size
8967 assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
8968 // idVec4 size
8969 assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
8970
8971 register vector float v0, v1, v2, v3, v4, v5, v6, v7;
8972 register vector float zeroVector = (vector float)(0.0);
8973 register vector float oneVector = (vector float)(1);
8974 register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
8975 vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
8976 int i = 0;
8977
8978 #ifndef DRAWVERT_PADDED
8979 // every fourth one will have the same alignment. Make sure we've got enough here
8980 if ( i+3 < numVerts ) {
8981 vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8982 vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8983 vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8984 vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
8985 }
8986 #endif
8987
8988 for ( ; i+3 < numVerts; i+=4 ) {
8989 const float *vertPtr = verts[i].xyz.ToFloatPtr();
8990 const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
8991 const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
8992 const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
8993
8994 #ifndef DRAWVERT_PADDED
8995 v0 = vec_ld( 0, vertPtr );
8996 v1 = vec_ld( 15, vertPtr );
8997 v2 = vec_ld( 0, vertPtr2 );
8998 v3 = vec_ld( 15, vertPtr2 );
8999 v4 = vec_ld( 0, vertPtr3 );
9000 v5 = vec_ld( 15, vertPtr3 );
9001 v6 = vec_ld( 0, vertPtr4 );
9002 v7 = vec_ld( 15, vertPtr4 );
9003
9004 v0 = vec_perm( v0, v1, vertPerm1 );
9005 v1 = vec_perm( v2, v3, vertPerm2 );
9006 v2 = vec_perm( v4, v5, vertPerm3 );
9007 v3 = vec_perm( v6, v7, vertPerm4 );
9008 #else
9009 v0 = vec_ld( 0, vertPtr );
9010 v1 = vec_ld( 0, vertPtr2 );
9011 v2 = vec_ld( 0, vertPtr3 );
9012 v3 = vec_ld( 0, vertPtr4 );
9013 #endif
9014
9015 v0 = vec_perm( v0, oneVector, vecPermThreeOne );
9016 v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
9017
9018 v1 = vec_perm( v1, oneVector, vecPermThreeOne );
9019 v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
9020
9021 v2 = vec_perm( v2, oneVector, vecPermThreeOne );
9022 v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
9023
9024 v3 = vec_perm( v3, oneVector, vecPermThreeOne );
9025 v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
9026
9027 // store results as unaligned
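// misaligned store idiom: vec_lvsr gives a rotate permute for the destination address,
// the data vectors are rotated into place, and vec_sel with the edge mask merges them
// with the previously loaded vc1/vc2 so bytes outside the 128-byte span are preserved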
9028 vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
9029 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9030 vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
9031 vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
9032
9033 // right rotate input data
9034 v0 = vec_perm( v0, v0, storePerm );
9035 v4 = vec_perm( v4, v4, storePerm );
9036 v1 = vec_perm( v1, v1, storePerm );
9037 v5 = vec_perm( v5, v5, storePerm );
9038 v2 = vec_perm( v2, v2, storePerm );
9039 v6 = vec_perm( v6, v6, storePerm );
9040 v3 = vec_perm( v3, v3, storePerm );
9041 v7 = vec_perm( v7, v7, storePerm );
9042
9043 vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
9044 vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
9045 vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
9046 vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
9047 vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
9048 vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
9049 vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
9050 vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
9051 vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
9052 }
9053
9054 // cleanup
9055 for ( ; i < numVerts; i++ ) {
9056 const float *v = verts[i].xyz.ToFloatPtr();
9057 vertexCache[i*2+0][0] = v[0];
9058 vertexCache[i*2+1][0] = v[0];
9059 vertexCache[i*2+0][1] = v[1];
9060 vertexCache[i*2+1][1] = v[1];
9061 vertexCache[i*2+0][2] = v[2];
9062 vertexCache[i*2+1][2] = v[2];
9063 vertexCache[i*2+0][3] = 1.0f;
9064 vertexCache[i*2+1][3] = 0.0f;
9065 }
9066 return numVerts * 2;
9067 }
9068
9069 #endif /* VERTEXCACHE_ALIGNED */
9070
9071 #endif /* 0 to kill VP shader cache */
9072
9073 #endif /* ENABLE_CREATE */
9074
9075 #ifdef ENABLE_SOUND_ROUTINES
9076
9077 #ifdef SOUND_DEST_ALIGNED
9078 /*
9079 ============
9080 idSIMD_AltiVec::UpSamplePCMTo44kHz
9081
9082 Duplicate samples for 44kHz output.
9083
9084 Assumptions:
9085 Assumes that dest starts at aligned address
9086 ============
9087 */
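// The 16-bit samples are widened with vec_unpackh/vec_unpackl, converted to float with
// vec_ctf, and replicated for the output rate: each sample 4x for 11025 Hz input, 2x for
// 22050 Hz, and copied straight through for 44100 Hz; mono and stereo take separate
// paths at the lower rates so that left/right pairs stay interleaved.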
9088 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9089
9090 // dest is aligned
9091 assert( IS_16BYTE_ALIGNED( dest[0] ) );
9092
9093 vector signed short vs0, vs1;
9094 register vector signed int vi0, vi1;
9095 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9096 // permute vectors
9097 register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9098 register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9099
9100 register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9101 register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9102
9103 // If this can be assumed true, we can eliminate another conditional that checks to see if we can
9104 // load up a vector before the loop
9105 assert( numSamples >= 12 );
9106
9107 if ( kHz == 11025 ) {
9108 if ( numChannels == 1 ) {
9109 // 8 at a time
9110 int i = 0;
9111
9112 vector signed short vsOld = vec_ld( 0, &src[i] );
9113 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9114
9115 for ( ; i+7 < numSamples; i+= 8 ) {
9116 // load src
9117 vs1 = vec_ld( 15, &src[i] );
9118 vs0 = vec_perm( vsOld, vs1, permVec );
9119 vsOld = vs1;
9120
9121 // unpack shorts to ints
9122 vi0 = vec_unpackh( vs0 );
9123 vi1 = vec_unpackl( vs0 );
9124 // convert ints to floats
9125 v0 = vec_ctf( vi0, 0 );
9126 v1 = vec_ctf( vi1, 0 );
9127 // permute into vectors in the order to store
9128
9129 v2 = vec_splat( v0, 0 );
9130 v3 = vec_splat( v0, 1 );
9131 v4 = vec_splat( v0, 2 );
9132 v5 = vec_splat( v0, 3 );
9133 v6 = vec_splat( v1, 0 );
9134 v7 = vec_splat( v1, 1 );
9135 v8 = vec_splat( v1, 2 );
9136 v9 = vec_splat( v1, 3 );
9137
9138 // store results
9139 ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9140 }
9141 // cleanup
9142 for (; i < numSamples; i++ ) {
9143 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9144 }
9145 } else {
9146 int i = 0;
9147
9148 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9149 vector signed short vsOld = vec_ld( 0, &src[0] );
9150
9151 for ( ; i+7 < numSamples; i += 8 ) {
9152 // load src
9153 vs1 = vec_ld( 15, &src[i] );
9154 vs0 = vec_perm( vsOld, vs1, permVec );
9155 vsOld = vs1;
9156
9157 // unpack shorts to ints
9158 vi0 = vec_unpackh( vs0 );
9159 vi1 = vec_unpackl( vs0 );
9160 // convert ints to floats
9161 v0 = vec_ctf( vi0, 0 );
9162 v1 = vec_ctf( vi1, 0 );
9163 // put into vectors in order to store
9164 v2 = vec_perm( v0, v0, vecFirstHalf );
9165 v3 = v2;
9166 v4 = vec_perm( v0, v0, vecSecondHalf );
9167 v5 = v4;
9168 v6 = vec_perm( v1, v1, vecFirstHalf );
9169 v7 = v6;
9170 v8 = vec_perm (v1, v1, vecSecondHalf );
9171 v9 = v8;
9172
9173 // store results
9174 ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
9175 }
9176
9177 for ( ; i < numSamples; i += 2 ) {
9178 dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9179 dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9180 }
9181 }
9182 } else if ( kHz == 22050 ) {
9183 if ( numChannels == 1 ) {
9184 int i;
9185 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9186 vector signed short vsOld = vec_ld( 0, &src[0] );
9187
9188 for ( i = 0; i+7 < numSamples; i += 8 ) {
9189 // load src
9190 vs1 = vec_ld( 0, &src[i] );
9191 vs0 = vec_perm( vsOld, vs1, permVec );
9192 vsOld = vs1;
9193
9194 // unpack shorts to ints
9195 vi0 = vec_unpackh( vs0 );
9196 vi1 = vec_unpackl( vs0 );
9197 // convert ints to floats
9198 v0 = vec_ctf( vi0, 0 );
9199 v1 = vec_ctf( vi1, 0 );
9200 // put into vectors in order to store
9201 v2 = vec_perm( v0, v0, vecBottom );
9202 v3 = vec_perm( v0, v0, vecTop );
9203 v4 = vec_perm( v1, v1, vecBottom );
9204 v5 = vec_perm (v1, v1, vecTop );
9205
9206 // store results
9207 ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9208 }
9209 // cleanup
9210 for ( ; i < numSamples; i++ ) {
9211 dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9212 }
9213 } else {
9214 int i;
9215 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9216 vector signed short vsOld = vec_ld( 0, &src[0] );
9217
9218 for ( i = 0; i+7 < numSamples; i += 8 ) {
9219 // load src
9220 vs1 = vec_ld( 15, &src[i] );
9221 vs0 = vec_perm( vsOld, vs1, permVec );
9222 vsOld = vs1;
9223
9224 // unpack shorts to ints
9225 vi0 = vec_unpackh( vs0 );
9226 vi1 = vec_unpackl( vs0 );
9227 // convert ints to floats
9228 v0 = vec_ctf( vi0, 0 );
9229 v1 = vec_ctf( vi1, 0 );
9230 // put into vectors in order to store
9231 v2 = vec_perm( v0, v0, vecFirstHalf );
9232 v3 = vec_perm( v0, v0, vecSecondHalf );
9233 v4 = vec_perm( v1, v1, vecFirstHalf );
9234 v5 = vec_perm (v1, v1, vecSecondHalf );
9235
9236 // store results
9237 ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
9238 }
9239 // cleanup
9240 for ( ; i < numSamples; i += 2 ) {
9241 dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9242 dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9243 }
9244 }
9245 } else if ( kHz == 44100 ) {
9246 int i;
9247 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9248 vector signed short vsOld = vec_ld( 0, &src[0] );
9249
9250 for ( i = 0; i+7 < numSamples; i += 8 ) {
9251 vs1 = vec_ld( 15, &src[i] );
9252 vs0 = vec_perm( vsOld, vs1, permVec );
9253 vsOld = vs1;
9254
9255 //unpack shorts to ints
9256 vi0 = vec_unpackh( vs0 );
9257 vi1 = vec_unpackl( vs0 );
9258
9259 //convert ints to floats
9260 v0 = vec_ctf( vi0, 0 );
9261 v1 = vec_ctf( vi1, 0 );
9262
9263 //store results
9264 ALIGNED_STORE2( &dest[i], v0, v1 );
9265 }
9266 // cleanup
9267 for ( ; i < numSamples; i++ ) {
9268 dest[i] = (float) src[i];
9269 }
9270 } else {
9271 assert( 0 );
9272 }
9273 }
9274
9275 #else
9276
9277 /*
9278 ============
9279 idSIMD_AltiVec::UpSamplePCMTo44kHz
9280
9281 Duplicate samples for 44kHz output.
9282
9283 Assumptions:
9284 No assumptions
9285 ============
9286 */
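// For reference, a minimal scalar sketch of what this routine computes (the
// AltiVec body below is an optimization of this; kHz selects the duplication
// factor and numChannels selects mono vs. interleaved stereo):
//
//	// 11025 Hz mono: each 16-bit sample becomes four consecutive floats
//	for ( int i = 0; i < numSamples; i++ ) {
//		dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i];
//	}
//
// The 22050 Hz cases duplicate each frame twice and the 44100 Hz case is a
// straight short-to-float conversion, as the cleanup loops below show.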
9287 void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
9288
9289 vector signed short vs0, vs1;
9290 register vector signed int vi0, vi1;
9291 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
9292 // permute vectors
9293 register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
9294 register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
9295
9296 register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9297 register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
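	// Byte-permute constants for duplicating the converted samples:
	//   vecFirstHalf / vecSecondHalf repeat the low / high pair of floats,
	//     e.g. {a,b,c,d} -> {a,b,a,b} and {c,d,c,d} (stereo frames duplicated);
	//   vecBottom / vecTop repeat each float of a pair,
	//     e.g. {a,b,c,d} -> {a,a,b,b} and {c,c,d,d} (mono samples duplicated).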
9298
9299 // calculate perm vector and masks for stores
9300 vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9301 // original values of dest
9302 vector float vecDest = vec_ld( 0, &dest[0] );
9303 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
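	// Misaligned-store idiom: storePerm rotates each result vector to the
	// destination's alignment, mask marks which bytes belong to the current
	// 16-byte block, and vec_sel splices adjacent results (plus the original
	// contents of dest at both ends) so only bytes inside the output range are
	// modified.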
9304
9305 if ( kHz == 11025 ) {
9306 if ( numChannels == 1 ) {
9307 // 8 at a time
9308 int i = 0;
9309
9310 vector signed short vsOld = vec_ld( 0, &src[i] );
9311 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
9312
9313 for ( ; i+7 < numSamples; i+= 8 ) {
9314 // load src
9315 vs1 = vec_ld( 15, &src[i] );
9316 vs0 = vec_perm( vsOld, vs1, permVec );
9317 vsOld = vs1;
9318 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9319
9320 // unpack shorts to ints
9321 vi0 = vec_unpackh( vs0 );
9322 vi1 = vec_unpackl( vs0 );
9323 // convert ints to floats
9324 v0 = vec_ctf( vi0, 0 );
9325 v1 = vec_ctf( vi1, 0 );
9326 			// permute into vectors in the order they will be stored
9327
9328 v2 = vec_splat( v0, 0 );
9329 v3 = vec_splat( v0, 1 );
9330 v4 = vec_splat( v0, 2 );
9331 v5 = vec_splat( v0, 3 );
9332 v6 = vec_splat( v1, 0 );
9333 v7 = vec_splat( v1, 1 );
9334 v8 = vec_splat( v1, 2 );
9335 v9 = vec_splat( v1, 3 );
9336
9337 v2 = vec_perm( v2, v2, storePerm );
9338 v3 = vec_perm( v3, v3, storePerm );
9339 v4 = vec_perm( v4, v4, storePerm );
9340 v5 = vec_perm( v5, v5, storePerm );
9341 v6 = vec_perm( v6, v6, storePerm );
9342 v7 = vec_perm( v7, v7, storePerm );
9343 v8 = vec_perm( v8, v8, storePerm );
9344 v9 = vec_perm( v9, v9, storePerm );
9345
9346 // store results
9347 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9348 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9349 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9350 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9351 vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9352 vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9353 vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9354 vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9355 vecDest = vec_sel( v9, vecDestEnd, mask );
9356 vec_st( vecDest, 127, &dest[i*4] );
9357 }
9358 // cleanup
9359 for (; i < numSamples; i++ ) {
9360 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
9361 }
9362 } else {
9363 int i = 0;
9364
9365 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9366 vector signed short vsOld = vec_ld( 0, &src[0] );
9367
9368 for ( ; i+7 < numSamples; i += 8 ) {
9369 // load src
9370 vs1 = vec_ld( 15, &src[i] );
9371 vs0 = vec_perm( vsOld, vs1, permVec );
9372 vsOld = vs1;
9373 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9374
9375 // unpack shorts to ints
9376 vi0 = vec_unpackh( vs0 );
9377 vi1 = vec_unpackl( vs0 );
9378 // convert ints to floats
9379 v0 = vec_ctf( vi0, 0 );
9380 v1 = vec_ctf( vi1, 0 );
9381 // put into vectors in order to store
9382 v2 = vec_perm( v0, v0, vecFirstHalf );
9383 v3 = v2;
9384 v4 = vec_perm( v0, v0, vecSecondHalf );
9385 v5 = v4;
9386 v6 = vec_perm( v1, v1, vecFirstHalf );
9387 v7 = v6;
9388 v8 = vec_perm (v1, v1, vecSecondHalf );
9389 v9 = v8;
9390
9391 v2 = vec_perm( v2, v2, storePerm );
9392 v3 = vec_perm( v3, v3, storePerm );
9393 v4 = vec_perm( v4, v4, storePerm );
9394 v5 = vec_perm( v5, v5, storePerm );
9395 v6 = vec_perm( v6, v6, storePerm );
9396 v7 = vec_perm( v7, v7, storePerm );
9397 v8 = vec_perm( v8, v8, storePerm );
9398 v9 = vec_perm( v9, v9, storePerm );
9399
9400 // store results
9401 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
9402 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
9403 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
9404 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
9405 vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
9406 vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
9407 vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
9408 vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
9409 vecDest = vec_sel( v9, vecDestEnd, mask );
9410 vec_st( vecDest, 127, &dest[i*4] );
9411 }
9412
9413 for ( ; i < numSamples; i += 2 ) {
9414 dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
9415 dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
9416 }
9417 }
9418 } else if ( kHz == 22050 ) {
9419 if ( numChannels == 1 ) {
9420 int i;
9421 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9422 vector signed short vsOld = vec_ld( 0, &src[0] );
9423
9424 for ( i = 0; i+7 < numSamples; i += 8 ) {
9425 // load src
9426 vs1 = vec_ld( 0, &src[i] );
9427 vs0 = vec_perm( vsOld, vs1, permVec );
9428 vsOld = vs1;
9429 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9430
9431 // unpack shorts to ints
9432 vi0 = vec_unpackh( vs0 );
9433 vi1 = vec_unpackl( vs0 );
9434 // convert ints to floats
9435 v0 = vec_ctf( vi0, 0 );
9436 v1 = vec_ctf( vi1, 0 );
9437 // put into vectors in order to store
9438 v2 = vec_perm( v0, v0, vecBottom );
9439 v3 = vec_perm( v0, v0, vecTop );
9440 v4 = vec_perm( v1, v1, vecBottom );
9441 v5 = vec_perm (v1, v1, vecTop );
9442
9443 v2 = vec_perm( v2, v2, storePerm );
9444 v3 = vec_perm( v3, v3, storePerm );
9445 v4 = vec_perm( v4, v4, storePerm );
9446 v5 = vec_perm( v5, v5, storePerm );
9447
9448 // store results
9449 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9450 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9451 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9452 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9453 vecDest = vec_sel( v5, vecDestEnd, mask );
9454 vec_st( vecDest, 63, &dest[i*2] );
9455
9456 }
9457 // cleanup
9458 for ( ; i < numSamples; i++ ) {
9459 dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
9460 }
9461 } else {
9462 int i;
9463 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9464 vector signed short vsOld = vec_ld( 0, &src[0] );
9465
9466 for ( i = 0; i+7 < numSamples; i += 8 ) {
9467 // load src
9468 vs1 = vec_ld( 15, &src[i] );
9469 vs0 = vec_perm( vsOld, vs1, permVec );
9470 vsOld = vs1;
9471 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
9472
9473 // unpack shorts to ints
9474 vi0 = vec_unpackh( vs0 );
9475 vi1 = vec_unpackl( vs0 );
9476 // convert ints to floats
9477 v0 = vec_ctf( vi0, 0 );
9478 v1 = vec_ctf( vi1, 0 );
9479 // put into vectors in order to store
9480 v2 = vec_perm( v0, v0, vecFirstHalf );
9481 v3 = vec_perm( v0, v0, vecSecondHalf );
9482 v4 = vec_perm( v1, v1, vecFirstHalf );
9483 v5 = vec_perm (v1, v1, vecSecondHalf );
9484
9485 v2 = vec_perm( v2, v2, storePerm );
9486 v3 = vec_perm( v3, v3, storePerm );
9487 v4 = vec_perm( v4, v4, storePerm );
9488 v5 = vec_perm( v5, v5, storePerm );
9489
9490 // store results
9491 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
9492 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
9493 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
9494 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
9495 vecDest = vec_sel( v5, vecDestEnd, mask );
9496 vec_st( vecDest, 63, &dest[i*2] );
9497 }
9498 // cleanup
9499 for ( ; i < numSamples; i += 2 ) {
9500 dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
9501 dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
9502 }
9503 }
9504 } else if ( kHz == 44100 ) {
9505 int i;
9506 vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
9507 vector signed short vsOld = vec_ld( 0, &src[0] );
9508
9509 for ( i = 0; i+7 < numSamples; i += 8 ) {
9510 //vs0 = vec_ld( 0, &src[i] );
9511 vs1 = vec_ld( 15, &src[i] );
9512 vs0 = vec_perm( vsOld, vs1, permVec );
9513 vsOld = vs1;
9514 vector float vecDestEnd = vec_ld( 31, &dest[i] );
9515
9516 //unpack shorts to ints
9517 vi0 = vec_unpackh( vs0 );
9518 vi1 = vec_unpackl( vs0 );
9519
9520 //convert ints to floats
9521 v0 = vec_ctf( vi0, 0 );
9522 v1 = vec_ctf( vi1, 0 );
9523
9524 v0 = vec_perm( v0, v0, storePerm );
9525 v1 = vec_perm( v1, v1, storePerm );
9526
9527 // store results
9528 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
9529 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
9530 vecDest = vec_sel( v1, vecDestEnd, mask );
9531 vec_st( vecDest, 31, &dest[i] );
9532 }
9533 // cleanup
9534 for ( ; i < numSamples; i++ ) {
9535 dest[i] = (float) src[i];
9536 }
9537 } else {
9538 assert( 0 );
9539 }
9540 }
9541
9542 #endif
9543
9544 #ifdef SOUND_DEST_ALIGNED
9545 /*
9546 ============
9547 idSIMD_AltiVec::UpSampleOGGTo44kHz
9548
9549 Duplicate samples for 44kHz output.
9550
9551 Assumptions:
9552 Assumes that dest starts at aligned address
9553 ============
9554 */
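// For reference, a minimal scalar sketch of the mono 11025 Hz case (the vector
// code below computes the same thing; see the cleanup loops for the exact
// per-rate and per-channel addressing):
//
//	for ( int i = 0; i < numSamples; i++ ) {
//		float s = ogg[0][i] * 32768.0f;		// decoder output scaled to the 16-bit PCM range
//		dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = s;
//	}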
9555 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9556 // dest is aligned
9557 assert( IS_16BYTE_ALIGNED( dest[0] ) );
9558
9559 register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9560 register vector float constVec, zeroVector;
9561 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9562 vector unsigned char vecPerm1;
9563 vector unsigned char vecPerm2;
9564
9565 vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9566 vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9567 vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9568 vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9569 vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9570 vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
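	// vecFirst..vecFourth build interleaved stereo frames from two source
	// vectors: element k of the first operand (ogg[0]) is paired with element k
	// of the second operand (ogg[1]) and the pair is written twice, e.g.
	// {a0,a1,a2,a3} x {b0,b1,b2,b3} -> {a0,b0,a0,b0} for vecFirst.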
9571
9572 constVec = (vector float)(32768.0f);
9573 zeroVector = (vector float)(0.0);
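	// Vorbis decode output is float in roughly [-1, 1); multiplying by 32768
	// rescales it to the 16-bit PCM range the mixer works in.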
9574
9575 if ( kHz == 11025 ) {
9576 if ( numChannels == 1 ) {
9577 // calculate perm vector and do first load
9578 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9579 v10 = vec_ld( 0, &ogg[0][0] );
9580
9581 int i;
9582 for ( i = 0; i+7 < numSamples; i += 8 ) {
9583 // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
9584 v8 = v10;
9585 v9 = vec_ld( 15, &ogg[0][i] );
9586 v10 = vec_ld( 31, &ogg[0][i] );
9587 v0 = vec_perm( v8, v9, vecPerm1 );
9588 v1 = vec_perm( v9, v10, vecPerm1 );
9589
9590 // now we have the elements in a vector, we want
9591 			// to splat each of them across its own vector
9592 oggVec1 = vec_splat( v0, 0 );
9593 oggVec2 = vec_splat( v0, 1 );
9594 oggVec3 = vec_splat( v0, 2 );
9595 oggVec4 = vec_splat( v0, 3 );
9596 oggVec5 = vec_splat( v1, 0 );
9597 oggVec6 = vec_splat( v1, 1 );
9598 oggVec7 = vec_splat( v1, 2 );
9599 oggVec8 = vec_splat( v1, 3 );
9600
9601 v0 = vec_madd( oggVec1, constVec, zeroVector );
9602 v1 = vec_madd( oggVec2, constVec, zeroVector );
9603 v2 = vec_madd( oggVec3, constVec, zeroVector );
9604 v3 = vec_madd( oggVec4, constVec, zeroVector );
9605 v4 = vec_madd( oggVec5, constVec, zeroVector );
9606 v5 = vec_madd( oggVec6, constVec, zeroVector );
9607 v6 = vec_madd( oggVec7, constVec, zeroVector );
9608 v7 = vec_madd( oggVec8, constVec, zeroVector );
9609
9610 //store results
9611 ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
9612
9613 }
9614
9615 //cleanup
9616 for ( ; i < numSamples; i++ ) {
9617 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9618 }
9619
9620 } else {
9621
9622 // calculate perm vec for ogg
9623 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9624 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9625 v7 = vec_ld( 0, &ogg[1][0] );
9626 v9 = vec_ld( 0, &ogg[0][0] );
9627 int i;
9628
9629 for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
9630 // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9631 v8 = v9;
9632 v9 = vec_ld( 15, &ogg[0][i] );
9633 v0 = vec_perm( v8, v9, vecPerm1 );
9634
9635 // now we have the elements in a vector, we want
9636 			// to splat each of them across its own vector
9637 oggVec1 = vec_splat( v0, 0 );
9638 oggVec2 = vec_splat( v0, 1 );
9639 oggVec3 = vec_splat( v0, 2 );
9640 oggVec4 = vec_splat( v0, 3 );
9641
9642 // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9643 v6 = v7;
9644 v7 = vec_ld( 15, &ogg[1][i] );
9645 v1 = vec_perm( v6, v7, vecPerm2 );
9646
9647 // now we have the elements in a vector, we want
9648 			// to splat each of them across its own vector
9649 oggVec5 = vec_splat( v1, 0 );
9650 oggVec6 = vec_splat( v1, 1 );
9651 oggVec7 = vec_splat( v1, 2 );
9652 oggVec8 = vec_splat( v1, 3 );
9653
9654 oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9655 oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9656 oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9657 oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9658 oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9659 oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9660 oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9661 oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9662
9663 //merge generates the interleaved pattern that we want and it
9664 //doesn't require a permute vector, so use that instead
9665 v0 = vec_mergeh( oggVec1, oggVec5 );
9666 v1 = vec_mergel( oggVec1, oggVec5 );
9667 v2 = vec_mergeh( oggVec2, oggVec6 );
9668 v3 = vec_mergel( oggVec2, oggVec6 );
9669
9670 v4 = vec_mergeh( oggVec3, oggVec7 );
9671 v5 = vec_mergel( oggVec3, oggVec7 );
9672 v6 = vec_mergeh( oggVec4, oggVec8 );
9673 v10 = vec_mergel( oggVec4, oggVec8 );
9674
9675 //store results
9676 ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
9677 }
9678
9679 //cleanup
9680 for ( ; i < numSamples >> 1; i++ ) {
9681 dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
9682 dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
9683 }
9684 }
9685 } else if ( kHz == 22050 ) {
9686 if ( numChannels == 1 ) {
9687
9688 // calculate perm vector and do first load
9689 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9690 v10 = vec_ld( 0, &ogg[0][0] );
9691
9692 int i;
9693
9694 for ( i = 0; i+7 < numSamples; i += 8 ) {
9695 // load values from ogg
9696 v8 = v10;
9697 v9 = vec_ld( 15, &ogg[0][i] );
9698 v10 = vec_ld( 31, &ogg[0][i] );
9699 v0 = vec_perm( v8, v9, vecPerm1 );
9700 v1 = vec_perm( v9, v10, vecPerm1 );
9701
9702 // multiply
9703 v0 = vec_madd( v0, constVec, zeroVector );
9704 v1 = vec_madd( v1, constVec, zeroVector );
9705
9706 			// permute into result vectors to store
9707 v5 = vec_perm( v0, v0, vecOneTwo );
9708 v6 = vec_perm( v0, v0, vecThreeFour);
9709 v7 = vec_perm( v1, v1, vecOneTwo );
9710 v8 = vec_perm( v1, v1, vecThreeFour );
9711
9712 //store results
9713 ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
9714 }
9715 // cleanup
9716 for ( ; i < numSamples; i++ ) {
9717 dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
9718 }
9719 } else {
9720
9721 // calculate perm vector and do first load
9722 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9723 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9724 v7 = vec_ld( 0, &ogg[1][0] );
9725 v9 = vec_ld( 0, &ogg[0][0] );
9726
9727 int i;
9728 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9729 				// load ogg[0][i] to ogg[0][i+3]
9730 v8 = v9;
9731 v9 = vec_ld( 15, &ogg[0][i] );
9732 v0 = vec_perm( v8, v9, vecPerm1 );
9733
9734 // load ogg[1][i] to ogg[1][i+3]
9735 v6 = v7;
9736 v7 = vec_ld( 15, &ogg[1][i] );
9737 v1 = vec_perm( v6, v7, vecPerm2 );
9738
9739 // multiply
9740 v0 = vec_madd( v0, constVec, zeroVector );
9741 v1 = vec_madd( v1, constVec, zeroVector );
9742
9743 // generate result vectors to store
9744 v2 = vec_perm( v0, v1, vecFirst );
9745 v3 = vec_perm( v0, v1, vecSecond );
9746 v4 = vec_perm( v0, v1, vecThird );
9747 v5 = vec_perm( v0, v1, vecFourth );
9748
9749 // store results
9750 ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
9751 }
9752 // cleanup
9753 for ( ; i < numSamples >> 1; i++ ) {
9754 dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
9755 dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
9756 }
9757 }
9758 } else if ( kHz == 44100 ) {
9759 if ( numChannels == 1 ) {
9760 // calculate perm vector and do first load
9761 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9762
9763 v9 = vec_ld( 0, &ogg[0][0] );
9764 int i;
9765
9766 for ( i = 0; i+7 < numSamples; i += 8 ) {
9767 // load values from ogg
9768 v8 = v9;
9769 v7 = vec_ld( 15, &ogg[0][i] );
9770 v6 = v7;
9771 v9 = vec_ld( 31, &ogg[0][i] );
9772
9773 v0 = vec_perm( v8, v7, vecPerm1 );
9774 v1 = vec_perm( v6, v9, vecPerm1 );
9775
9776 // multiply
9777 v0 = vec_madd( v0, constVec, zeroVector );
9778 v1 = vec_madd( v1, constVec, zeroVector );
9779
9780 ALIGNED_STORE2( &dest[i], v0, v1 );
9781 }
9782
9783 // cleanup
9784 for ( ; i < numSamples; i++ ) {
9785 dest[i*1+0] = ogg[0][i] * 32768.0f;
9786 }
9787 } else {
9788
9789 // calculate perm vector and do first load
9790 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9791 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9792 v7 = vec_ld( 0, &ogg[1][0] );
9793 v9 = vec_ld( 0, &ogg[0][0] );
9794 int i;
9795
9796 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
9797 v8 = v9;
9798 v9 = vec_ld( 15, &ogg[0][i] );
9799 v0 = vec_perm( v8, v9, vecPerm1 );
9800
9801 // load ogg[1][i] to ogg[1][i+3]
9802 v6 = v7;
9803 v7 = vec_ld( 15, &ogg[1][i] );
9804 v1 = vec_perm( v6, v7, vecPerm2 );
9805
9806 // multiply
9807 v0 = vec_madd( v0, constVec, zeroVector );
9808 v1 = vec_madd( v1, constVec, zeroVector );
9809
9810 // generate result vectors
9811 v2 = vec_mergeh( v0, v1 );
9812 v3 = vec_mergel( v0, v1 );
9813
9814 // store results
9815 ALIGNED_STORE2( &dest[i*2], v2, v3 );
9816 }
9817 // cleanup
9818 for ( ; i < numSamples >> 1; i++ ) {
9819 dest[i*2+0] = ogg[0][i] * 32768.0f;
9820 dest[i*2+1] = ogg[1][i] * 32768.0f;
9821 }
9822 }
9823 } else {
9824 assert( 0 );
9825 }
9826 }
9827
9828 #else
9829
9830 /*
9831 ============
9832 idSIMD_AltiVec::UpSampleOGGTo44kHz
9833
9834 Duplicate samples for 44kHz output.
9835
9836 Assumptions:
9837 No assumptions
9838 ============
9839 */
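// This variant makes no alignment assumption about dest: each result vector is
// rotated with storePerm and merged into memory through vec_sel with a byte
// mask, the same read-modify-write store idiom used by the unaligned
// UpSamplePCMTo44kHz above.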
9840 void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
9841
9842 register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
9843 register vector float constVec, zeroVector;
9844 register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
9845 vector unsigned char vecPerm1;
9846 vector unsigned char vecPerm2;
9847
9848 vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
9849 vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
9850 vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
9851 vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
9852 vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
9853 vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
9854
9855 vector unsigned char storePerm;
9856
9857 constVec = (vector float)(32768.0f);
9858 zeroVector = (vector float)(0.0);
9859
9860 // calculate perm vector and masks for stores
9861 storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
9862 // original values of dest
9863 vector float vecDest = vec_ld( 0, &dest[0] );
9864 vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
9865
9866 if ( kHz == 11025 ) {
9867 if ( numChannels == 1 ) {
9868 // calculate perm vector and do first load
9869 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9870 v10 = vec_ld( 0, &ogg[0][0] );
9871
9872 int i;
9873 for ( i = 0; i+7 < numSamples; i += 8 ) {
9874 // as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
9875 v8 = v10;
9876 v9 = vec_ld( 15, &ogg[0][i] );
9877 v10 = vec_ld( 31, &ogg[0][i] );
9878 vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
9879 v0 = vec_perm( v8, v9, vecPerm1 );
9880 v1 = vec_perm( v9, v10, vecPerm1 );
9881
9882 // now we have the elements in a vector, we want
9883 			// to splat each of them across its own vector
9884 oggVec1 = vec_splat( v0, 0 );
9885 oggVec2 = vec_splat( v0, 1 );
9886 oggVec3 = vec_splat( v0, 2 );
9887 oggVec4 = vec_splat( v0, 3 );
9888 oggVec5 = vec_splat( v1, 0 );
9889 oggVec6 = vec_splat( v1, 1 );
9890 oggVec7 = vec_splat( v1, 2 );
9891 oggVec8 = vec_splat( v1, 3 );
9892
9893 v0 = vec_madd( oggVec1, constVec, zeroVector );
9894 v1 = vec_madd( oggVec2, constVec, zeroVector );
9895 v2 = vec_madd( oggVec3, constVec, zeroVector );
9896 v3 = vec_madd( oggVec4, constVec, zeroVector );
9897 v4 = vec_madd( oggVec5, constVec, zeroVector );
9898 v5 = vec_madd( oggVec6, constVec, zeroVector );
9899 v6 = vec_madd( oggVec7, constVec, zeroVector );
9900 v7 = vec_madd( oggVec8, constVec, zeroVector );
9901
9902 // rotate input data
9903 v0 = vec_perm( v0, v0, storePerm );
9904 v1 = vec_perm( v1, v1, storePerm );
9905 v2 = vec_perm( v2, v2, storePerm );
9906 v3 = vec_perm( v3, v3, storePerm );
9907 v4 = vec_perm( v4, v4, storePerm );
9908 v5 = vec_perm( v5, v5, storePerm );
9909 v6 = vec_perm( v6, v6, storePerm );
9910 v7 = vec_perm( v7, v7, storePerm );
9911
9912 // store results
9913 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
9914 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
9915 vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
9916 vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
9917 vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
9918 vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
9919 vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
9920 vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
9921 vecDest = vec_sel( v7, vecDestEnd, mask );
9922 vec_st( vecDest, 127, &dest[i*4] );
9923 }
9924
9925 //cleanup
9926 for ( ; i < numSamples; i++ ) {
9927 dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
9928 }
9929
9930 } else {
9931
9932 // calculate perm vec for ogg
9933 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
9934 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
9935 v7 = vec_ld( 0, &ogg[1][0] );
9936 v9 = vec_ld( 0, &ogg[0][0] );
9937 int i;
9938
9939 for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
9940 // load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
9941 v8 = v9;
9942 v9 = vec_ld( 15, &ogg[0][i] );
9943 vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
9944 v0 = vec_perm( v8, v9, vecPerm1 );
9945
9946 // now we have the elements in a vector, we want
9947 			// to splat each of them across its own vector
9948 oggVec1 = vec_splat( v0, 0 );
9949 oggVec2 = vec_splat( v0, 1 );
9950 oggVec3 = vec_splat( v0, 2 );
9951 oggVec4 = vec_splat( v0, 3 );
9952
9953 // load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
9954 v6 = v7;
9955 v7 = vec_ld( 15, &ogg[1][i] );
9956 v1 = vec_perm( v6, v7, vecPerm2 );
9957
9958 // now we have the elements in a vector, we want
9959 			// to splat each of them across its own vector
9960 oggVec5 = vec_splat( v1, 0 );
9961 oggVec6 = vec_splat( v1, 1 );
9962 oggVec7 = vec_splat( v1, 2 );
9963 oggVec8 = vec_splat( v1, 3 );
9964
9965 oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
9966 oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
9967 oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
9968 oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
9969 oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
9970 oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
9971 oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
9972 oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
9973
9974 //merge generates the interleaved pattern that we want and it
9975 //doesn't require a permute vector, so use that instead
9976 v0 = vec_mergeh( oggVec1, oggVec5 );
9977 v1 = vec_mergel( oggVec1, oggVec5 );
9978 v2 = vec_mergeh( oggVec2, oggVec6 );
9979 v3 = vec_mergel( oggVec2, oggVec6 );
9980
9981 v4 = vec_mergeh( oggVec3, oggVec7 );
9982 v5 = vec_mergel( oggVec3, oggVec7 );
9983 v6 = vec_mergeh( oggVec4, oggVec8 );
9984 v10 = vec_mergel( oggVec4, oggVec8 );
9985
9986 // rotate input data
9987 v0 = vec_perm( v0, v0, storePerm );
9988 v1 = vec_perm( v1, v1, storePerm );
9989 v2 = vec_perm( v2, v2, storePerm );
9990 v3 = vec_perm( v3, v3, storePerm );
9991 v4 = vec_perm( v4, v4, storePerm );
9992 v5 = vec_perm( v5, v5, storePerm );
9993 v6 = vec_perm( v6, v6, storePerm );
9994 v10 = vec_perm( v10, v10, storePerm );
9995
9996 // store results
9997 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
9998 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
9999 vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
10000 vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
10001 vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
10002 vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
10003 vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
10004 vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
10005 vecDest = vec_sel( v10, vecDestEnd, mask );
10006 vec_st( vecDest, 127, &dest[i*8] );
10007 }
10008
10009 //cleanup
10010 for ( ; i < numSamples >> 1; i++ ) {
10011 dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
10012 dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
10013 }
10014 }
10015 } else if ( kHz == 22050 ) {
10016 if ( numChannels == 1 ) {
10017
10018 // calculate perm vector and do first load
10019 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10020 v10 = vec_ld( 0, &ogg[0][0] );
10021
10022 int i;
10023
10024 for ( i = 0; i+7 < numSamples; i += 8 ) {
10025
10026 // load values from ogg
10027 v8 = v10;
10028 v9 = vec_ld( 15, &ogg[0][i] );
10029 v10 = vec_ld( 31, &ogg[0][i] );
10030 vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
10031 v0 = vec_perm( v8, v9, vecPerm1 );
10032 v1 = vec_perm( v9, v10, vecPerm1 );
10033
10034 // multiply
10035 v0 = vec_madd( v0, constVec, zeroVector );
10036 v1 = vec_madd( v1, constVec, zeroVector );
10037
10038 			// permute into result vectors to store
10039 v5 = vec_perm( v0, v0, vecOneTwo );
10040 v6 = vec_perm( v0, v0, vecThreeFour);
10041 v7 = vec_perm( v1, v1, vecOneTwo );
10042 v8 = vec_perm( v1, v1, vecThreeFour );
10043
10044 // rotate input data
10045 v5 = vec_perm( v5, v5, storePerm );
10046 v6 = vec_perm( v6, v6, storePerm );
10047 v7 = vec_perm( v7, v7, storePerm );
10048 v8 = vec_perm( v8, v8, storePerm );
10049
10050 // store results
10051 vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
10052 vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
10053 vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
10054 vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
10055 vecDest = vec_sel( v8, vecDestEnd, mask );
10056 vec_st( vecDest, 63, &dest[i*2] );
10057 }
10058
10059 // cleanup
10060 for ( ; i < numSamples; i++ ) {
10061 dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
10062 }
10063 } else {
10064
10065 // calculate perm vector and do first load
10066 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10067 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10068 v7 = vec_ld( 0, &ogg[1][0] );
10069 v9 = vec_ld( 0, &ogg[0][0] );
10070
10071 int i;
10072 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10073 				// load ogg[0][i] to ogg[0][i+3]
10074 v8 = v9;
10075 v9 = vec_ld( 15, &ogg[0][i] );
10076 vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
10077 v0 = vec_perm( v8, v9, vecPerm1 );
10078
10079 // load ogg[1][i] to ogg[1][i+3]
10080 v6 = v7;
10081 v7 = vec_ld( 15, &ogg[1][i] );
10082 v1 = vec_perm( v6, v7, vecPerm2 );
10083
10084 // multiply
10085 v0 = vec_madd( v0, constVec, zeroVector );
10086 v1 = vec_madd( v1, constVec, zeroVector );
10087
10088 // generate result vectors to store
10089 v2 = vec_perm( v0, v1, vecFirst );
10090 v3 = vec_perm( v0, v1, vecSecond );
10091 v4 = vec_perm( v0, v1, vecThird );
10092 v5 = vec_perm( v0, v1, vecFourth );
10093
10094 // rotate input data
10095 v2 = vec_perm( v2, v2, storePerm );
10096 v3 = vec_perm( v3, v3, storePerm );
10097 v4 = vec_perm( v4, v4, storePerm );
10098 v5 = vec_perm( v5, v5, storePerm );
10099
10100 // store results
10101 vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
10102 vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
10103 vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
10104 vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
10105 vecDest = vec_sel( v5, vecDestEnd, mask );
10106 vec_st( vecDest, 63, &dest[i*4] );
10107 }
10108
10109 // cleanup
10110 for ( ; i < numSamples >> 1; i++ ) {
10111 dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
10112 dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
10113 }
10114 }
10115 } else if ( kHz == 44100 ) {
10116 if ( numChannels == 1 ) {
10117 // calculate perm vector and do first load
10118 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10119
10120 v9 = vec_ld( 0, &ogg[0][0] );
10121 int i;
10122
10123 for ( i = 0; i+7 < numSamples; i += 8 ) {
10124 // load values from ogg
10125 v8 = v9;
10126 v7 = vec_ld( 15, &ogg[0][i] );
10127 v6 = v7;
10128 v9 = vec_ld( 31, &ogg[0][i] );
10129 vector float vecDestEnd = vec_ld( 31, &dest[i] );
10130
10131 v0 = vec_perm( v8, v7, vecPerm1 );
10132 v1 = vec_perm( v6, v9, vecPerm1 );
10133
10134 // multiply
10135 v0 = vec_madd( v0, constVec, zeroVector );
10136 v1 = vec_madd( v1, constVec, zeroVector );
10137
10138 // rotate data
10139 v0 = vec_perm( v0, v0, storePerm );
10140 v1 = vec_perm( v1, v1, storePerm );
10141
10142 // store results
10143 vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
10144 vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
10145 vecDest = vec_sel( v1, vecDestEnd, mask );
10146 vec_st( vecDest, 31, &dest[i] );
10147 }
10148
10149 // cleanup
10150 for ( ; i < numSamples; i++ ) {
10151 dest[i*1+0] = ogg[0][i] * 32768.0f;
10152 }
10153 } else {
10154
10155 // calculate perm vector and do first load
10156 vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
10157 vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
10158 v7 = vec_ld( 0, &ogg[1][0] );
10159 v9 = vec_ld( 0, &ogg[0][0] );
10160 int i;
10161
10162 for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
10163 v8 = v9;
10164 v9 = vec_ld( 15, &ogg[0][i] );
10165 v0 = vec_perm( v8, v9, vecPerm1 );
10166
10167 // load ogg[1][i] to ogg[1][i+3]
10168 v6 = v7;
10169 v7 = vec_ld( 15, &ogg[1][i] );
10170 v1 = vec_perm( v6, v7, vecPerm2 );
10171
10172 // multiply
10173 v0 = vec_madd( v0, constVec, zeroVector );
10174 v1 = vec_madd( v1, constVec, zeroVector );
10175
10176 // generate result vectors
10177 v2 = vec_mergeh( v0, v1 );
10178 v3 = vec_mergel( v0, v1 );
10179
10180 // store results
10181 UNALIGNED_STORE2( &dest[i*2], v2, v3 );
10182 }
10183 // cleanup
10184 for ( ; i < numSamples >> 1; i++ ) {
10185 dest[i*2+0] = ogg[0][i] * 32768.0f;
10186 dest[i*2+1] = ogg[1][i] * 32768.0f;
10187 }
10188 }
10189 } else {
10190 assert( 0 );
10191 }
10192 }
10193 #endif /* SOUND_DEST_ALIGNED */
10194
10195 #ifdef SOUND_DEST_ALIGNED
10196 /*
10197 ============
10198 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10199
10200 Assumptions:
10201 Assumes that mixBuffer starts at aligned address
10202 ============
10203 */
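// For reference, a scalar sketch of the mix this routine performs (the two
// speaker gains ramp linearly from lastV to currentV across the buffer):
//
//	float sL0 = lastV[0], sL1 = lastV[1];
//	float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
//	float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
//	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
//		mixBuffer[i*2+0] += samples[i] * sL0;
//		mixBuffer[i*2+1] += samples[i] * sL1;
//		sL0 += incL0;
//		sL1 += incL1;
//	}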
10204 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10205
10206 // mixBuffer is aligned
10207 assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10208
10209 int i;
10210 float inc[2];
10211 float spkr[4];
10212
10213 register vector float vecInc;
10214 register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10215 register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10216 register vector float vecSamplesLd1, vecSamplesLd2;
10217 register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10218
10219 register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10220 register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10221 register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10222 register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10223
10224 //constants
10225 vector float fourVec = (vector float)(4.0);
10226 vector float zeroVec = (vector float)(0.0);
10227
10228 inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10229 inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10230
10231 spkr[0] = lastV[0];
10232 spkr[1] = lastV[1];
10233 spkr[2] = lastV[0] + inc[0];
10234 spkr[3] = lastV[1] + inc[1];
10235
10236 assert( numSamples == MIXBUFFER_SAMPLES );
10237
10238 inc[0] *= 2;
10239 inc[1] *= 2;
10240
10241 //load data into registers
10242 vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10243 vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10244 vecInc = vec_mergeh( v0, v1 );
10245
10246 vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10247 vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10248 vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10249 vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10250
10251 // load spkr array
10252 v0 = vec_mergeh( v2, v4 );
10253 v1 = vec_mergeh( v3, v5 );
10254 vecSpeaker1 = vec_mergeh( v0, v1 );
10255
10256 vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10257 vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10258 vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10259 vecInc = vec_madd( vecInc, fourVec, zeroVec );
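	// vecSpeaker1..4 each hold the {left,right} gains for two consecutive output
	// frames; one loop iteration consumes 8 mono samples (16 mixBuffer floats),
	// so the per-vector step is vecInc (already doubled above) scaled by four.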
10260
10261 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10262 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10263
10264 //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10265 //need a cleanup loop
10266 for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10267
10268 //load samples and mix buffers
10269 vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
10270 vecSamplesLd2 = vec_ld( 15, &samples[i] );
10271 vecSamplesLast = vec_ld( 31, &samples[i] );
10272
10273 vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10274 vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10275
10276 vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10277 vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10278 vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10279 vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10280
10281 vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10282 vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10283 vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10284 vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10285
10286 vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10287 vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10288 vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10289 vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10290
10291 // store results
10292 ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10293
10294 //add for next iteration
10295 vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10296 vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10297 vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10298 vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10299 }
10300 }
10301
10302 #else
10303
10304 /*
10305 ============
10306 idSIMD_AltiVec::MixSoundTwoSpeakerMono
10307
10308 Assumptions:
10309 No assumptions
10310 ============
10311 */
10312 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10313
10314 int i;
10315 float inc[2];
10316 float spkr[4];
10317
10318 register vector float vecInc;
10319 register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10320 register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10321 register vector float vecSamplesLd1, vecSamplesLd2;
10322 register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10323
10324 register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
10325 register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
10326 register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
10327 register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
10328
10329 //constants
10330 vector float fourVec = (vector float)(4.0);
10331 vector float zeroVec = (vector float)(0.0);
10332
10333 inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10334 inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10335
10336 spkr[0] = lastV[0];
10337 spkr[1] = lastV[1];
10338 spkr[2] = lastV[0] + inc[0];
10339 spkr[3] = lastV[1] + inc[1];
10340
10341 assert( numSamples == MIXBUFFER_SAMPLES );
10342
10343 inc[0] *= 2;
10344 inc[1] *= 2;
10345
10346 //load data into registers
10347 vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10348 vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10349 vecInc = vec_mergeh( v0, v1 );
10350
10351 vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10352 vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10353 vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10354 vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10355
10356 // load spkr array
10357 v0 = vec_mergeh( v2, v4 );
10358 v1 = vec_mergeh( v3, v5 );
10359 vecSpeaker1 = vec_mergeh( v0, v1 );
10360
10361 vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10362 vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10363 vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10364 vecInc = vec_madd( vecInc, fourVec, zeroVec );
10365
10366 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10367 vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
10368 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10369 vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10370
10371 //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10372 //need a cleanup loop
10373 for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10374
10375 //load samples and mix buffers
10376 vecSamplesLd1 = vecSamplesLast;
10377 vecSamplesLd2 = vec_ld( 15, &samples[i] );
10378 vecSamplesLast = vec_ld( 31, &samples[i] );
10379
10380 vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
10381 vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
10382
10383 vecMixBuffer1 = vecDest;
10384 vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10385 vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10386 vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10387 vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10388
10389 vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10390 vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10391 vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10392 vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10393
10394 vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
10395 vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
10396 vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
10397 vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
10398
10399 vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10400 vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10401 vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10402 vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10403
10404 // store results
10405 UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10406
10407 //add for next iteration
10408 vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10409 vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10410 vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10411 vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10412 }
10413 }
10414
10415 #endif /* SOUND_DEST_ALIGNED */
10416
10417 #ifdef SOUND_DEST_ALIGNED
10418 /*
10419 ============
10420 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10421
10422 Assumptions:
10423 Assumes that mixBuffer starts at aligned address
10424 ============
10425 */
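// For reference, a scalar sketch of this mix (interleaved stereo input, the
// per-channel gains ramping from lastV to currentV across the buffer):
//
//	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
//		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
//		mixBuffer[i*2+1] += samples[i*2+1] * sL1;
//		sL0 += incL0;
//		sL1 += incL1;
//	}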
10426 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10427 // mixBuffer is aligned
10428 assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10429
10430 int i, k;
10431 float inc[2];
10432 float spkr[4];
10433
10434 // loading buffers
10435 register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10436 // loading buffers
10437 register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10438 register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10439 register vector float vecInc;
10440 vector float fourVec = (vector float)(4.0);
10441 vector float zeroVec = (vector float)(0.0);
10442
10443 assert( numSamples == MIXBUFFER_SAMPLES );
10444
10445 inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10446 inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10447
10448 spkr[0] = lastV[0];
10449 spkr[1] = lastV[1];
10450 spkr[2] = lastV[0] + inc[0];
10451 spkr[3] = lastV[1] + inc[1];
10452
10453 for ( k = 0; k < 2; k++ ) {
10454 inc[k] *= 2;
10455 }
10456
10457 // load data in vectors
10458 vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10459 vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10460 vecInc = vec_mergeh( v0, v1 );
10461
10462 vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10463 vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10464 vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10465 vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10466
10467 // load spkr array
10468 v0 = vec_mergeh( v2, v4 );
10469 v1 = vec_mergeh( v3, v5 );
10470 vecSpeaker1 = vec_mergeh( v0, v1 );
10471
10472 vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10473 vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10474 vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10475 vecInc = vec_madd( vecInc, fourVec, zeroVec );
10476
10477 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10478 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10479
10480 //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10481 //need a cleanup loop
10482 for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10483 // load mix buffers and samples
10484 vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
10485 vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
10486 vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
10487 vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
10488
10489 vecSamples1 = vecSamplesLast;
10490 vecSamples2 = vec_ld( 15, &samples[i*2] );
10491 vecSamples3 = vec_ld( 31, &samples[i*2] );
10492 vecSamples4 = vec_ld( 47, &samples[i*2] );
10493 vecSamplesLast = vec_ld( 63, &samples[i*2] );
10494
10495 vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10496 vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10497 vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10498 vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10499
10500 vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10501 vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10502 vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10503 vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10504
10505 vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10506 vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10507 vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10508 vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10509
10510 //store results
10511 ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10512 }
10513 }
10514 #else
10515
10516 /*
10517 ============
10518 idSIMD_AltiVec::MixSoundTwoSpeakerStereo
10519
10520 Assumptions:
10521 No assumptions
10522 ============
10523 */
10524 void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
10525
10526 int i, k;
10527 float inc[2];
10528 float spkr[4];
10529 // loading buffers
10530 register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
10531 // loading buffers
10532 register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
10533 register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
10534 register vector float vecInc;
10535 vector float fourVec = (vector float)(4.0);
10536 vector float zeroVec = (vector float)(0.0);
10537
10538 assert( numSamples == MIXBUFFER_SAMPLES );
10539
10540 inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10541 inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10542
10543 spkr[0] = lastV[0];
10544 spkr[1] = lastV[1];
10545 spkr[2] = lastV[0] + inc[0];
10546 spkr[3] = lastV[1] + inc[1];
10547
10548 for ( k = 0; k < 2; k++ ) {
10549 inc[k] *= 2;
10550 }
10551
10552 // load data in vectors
10553 vector float v0 = loadSplatUnalignedScalar( &inc[0] );
10554 vector float v1 = loadSplatUnalignedScalar( &inc[1] );
10555 vecInc = vec_mergeh( v0, v1 );
10556
10557 vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
10558 vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
10559 vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
10560 vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
10561
10562 // load spkr array
10563 v0 = vec_mergeh( v2, v4 );
10564 v1 = vec_mergeh( v3, v5 );
10565 vecSpeaker1 = vec_mergeh( v0, v1 );
10566
10567 vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
10568 vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
10569 vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
10570 vecInc = vec_madd( vecInc, fourVec, zeroVec );
10571
10572 vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
10573 vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
10574 vector float vecSamplesLast = vec_ld( 0, &samples[0] );
10575 vector float vecDest = vec_ld( 0, &mixBuffer[0] );
10576
10577 //since MIXBUFFER_SAMPLES is a multiple of 8, we don't
10578 //need a cleanup loop
10579 for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
10580 // load mix buffers and samples
10581 vecMixBuffer1 = vecDest;
10582 vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
10583 vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
10584 vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
10585 vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
10586
10587 vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
10588 vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
10589 vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
10590 vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
10591
10592 vecSamples1 = vecSamplesLast;
10593 vecSamples2 = vec_ld( 15, &samples[i*2] );
10594 vecSamples3 = vec_ld( 31, &samples[i*2] );
10595 vecSamples4 = vec_ld( 47, &samples[i*2] );
10596 vecSamplesLast = vec_ld( 63, &samples[i*2] );
10597
10598 vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
10599 vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
10600 vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
10601 vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
10602
10603 vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
10604 vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
10605 vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
10606 vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
10607
10608 vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
10609 vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
10610 vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
10611 vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
10612
10613 // store results
10614 UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
10615 }
10616 }
10617
10618 #endif /* SOUND_DEST_ALIGNED */
10619
10620 #ifdef SOUND_DEST_ALIGNED
10621 /*
10622 ============
10623 idSIMD_AltiVec::MixSoundSixSpeakerMono
10624
10625 Assumptions:
10626 Assumes that mixBuffer starts at aligned address
10627 ============
10628 */
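// For reference, a scalar sketch of this mix: each mono input sample is scaled
// by six independent speaker gains, each gain ramping from lastV[j] toward
// currentV[j] across the buffer:
//
//	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
//		for ( int j = 0; j < 6; j++ ) {
//			mixBuffer[i*6+j] += samples[i] * sL[j];
//			sL[j] += incL[j];
//		}
//	}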
10629 void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
10630
10631 // mixBuffer is aligned
10632 assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
10633
10634 float incL[24];
10635 float sL[24];
10636 int i, k;
10637
10638 vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
10639 vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
10640 vector float vecSamplesLd;
10641 vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
10642 vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
10643 // permute vectors for sample
10644 vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
10645 vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
10646
10647 assert( numSamples == MIXBUFFER_SAMPLES );
10648 assert( SPEAKER_RIGHT == 1 );
10649 assert( SPEAKER_BACKRIGHT == 5 );
10650
10651 // incL array, 6 elements repeated
10652 incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
10653 incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
10654 incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
10655 incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
10656 incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
10657 incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
10658
10659 // sL array repeated
10660 for ( k = 0; k < 6; k++ ) {
10661 sL[k] = lastV[k];
10662 }
10663 for ( k = 6; k < 12; k++ ) {
10664 sL[k] = lastV[k-6] + incL[k];
10665 }
10666 for ( k = 12; k < 18; k++ ) {
10667 sL[k] = lastV[k-12] + incL[k] + incL[k];
10668 }
10669 for ( k = 18; k < 24; k++ ) {
10670 sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
10671 }
10672
10673 	 // multiply by 4 since four samples (24 gain values) are advanced per iteration
10674 for ( k = 0; k < 24; k++ ) {
10675 incL[k] *= 4;
10676 }
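	 // incL/sL hold the six speaker gains for four consecutive samples laid out
	 // back to back (24 floats = 6 vectors of 4), so one loop iteration below can
	 // advance all of them at once; hence the increments are scaled by 4 above.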
10677
10678 //load the data
10679 vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
10680 vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );


	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );

	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
	//need a cleanup loop
	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		//load mix buffer into vectors, assume aligned
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
		vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
		vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );

		//store out results
		ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
	}
}
#else

/*
============
idSIMD_AltiVec::MixSoundSixSpeakerMono

Assumptions:
	No assumptions
============
*/
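// For orientation, a minimal scalar sketch of what this routine computes (an assumption read
// off the vector code below, not a drop-in replacement): each mono sample is scaled by a
// per-speaker volume that ramps linearly from lastV[] to currentV[] across the buffer:
//
//	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
//		for ( int j = 0; j < 6; j++ ) {
//			mixBuffer[i*6+j] += samples[i] * ( lastV[j] + i * ( currentV[j] - lastV[j] ) / MIXBUFFER_SAMPLES );
//		}
//	}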
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {

	float incL[24];
	float sL[24];
	int i, k;

	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
	vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
	// permute vectors for sample
	register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
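	// Each permute mask is a list of 16 byte indices into the source vector, so every group of
	// four indices selects one float: samplePerm2 yields ( s0, s0, s1, s1 ) and samplePerm5
	// yields ( s2, s2, s3, s3 ) from a loaded sample vector ( s0, s1, s2, s3 ).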

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// incL array, 6 elements repeated
	incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	// sL array repeated
	for ( k = 0; k < 6; k++ ) {
		sL[k] = lastV[k];
	}
	for ( k = 6; k < 12; k++ ) {
		sL[k] = lastV[k-6] + incL[k];
	}
	for ( k = 12; k < 18; k++ ) {
		sL[k] = lastV[k-12] + incL[k] + incL[k];
	}
	for ( k = 18; k < 24; k++ ) {
		sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
	}

	// multiply the increments by 4 since the loop below mixes 4 samples (24 floats) per iteration
	for ( k = 0; k < 24; k++ ) {
		incL[k] *= 4;
	}

	// load the data
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );

	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );
	vecIncl5 = vec_ld( 63, &incL[0] );
	vecIncl6 = vec_ld( 79, &incL[0] );
	vecIncl7 = vec_ld( 95, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
	vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
	vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
	vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );
	vecSL5 = vec_ld( 63, &sL[0] );
	vecSL6 = vec_ld( 79, &sL[0] );
	vecSL7 = vec_ld( 95, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
	vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
	vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
	vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );

	//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
	//need a cleanup loop
	for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		//load mix buffer into vectors
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
		vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
		vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
		vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
		vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
		vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want
		vecSamples1 = vec_splat( vecSamplesLd, 0 );
		vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
		vecSamples3 = vec_splat( vecSamplesLd, 1 );
		vecSamples4 = vec_splat( vecSamplesLd, 2 );
		vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
		vecSamples6 = vec_splat( vecSamplesLd, 3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
		vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
		vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );

		// store results
		UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
		vecSL4 = vec_add( vecSL4, vecIncl4 );
		vecSL5 = vec_add( vecSL5, vecIncl5 );
		vecSL6 = vec_add( vecSL6, vecIncl6 );
	}
}

#endif /* SOUND_DEST_ALIGNED */

#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo

Assumptions:
	Assumes that mixBuffer starts at aligned address
============
*/
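// Channel mapping note (inferred from the permute masks below, assuming the usual left-first
// interleaving of the stereo sample stream): per frame, the left sample feeds speakers 0, 2, 3
// and 4, while the right sample feeds speakers 1 (SPEAKER_RIGHT) and 5 (SPEAKER_BACKRIGHT).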

void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {

	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );

	float incL[12];
	float sL[12];
	int i;
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
	// permute vectors for sample
	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// incL array, 6 elements repeated
	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	// sL array repeated
	sL[0] = lastV[0];
	sL[1] = lastV[1];
	sL[2] = lastV[2];
	sL[3] = lastV[3];
	sL[4] = lastV[4];
	sL[5] = lastV[5];
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];

	// multiply by 2 since doing 12 at a time
	incL[0] *= 2;
	incL[1] *= 2;
	incL[2] *= 2;
	incL[3] *= 2;
	incL[4] *= 2;
	incL[5] *= 2;
	incL[6] *= 2;
	incL[7] *= 2;
	incL[8] *= 2;
	incL[9] *= 2;
	incL[10] *= 2;
	incL[11] *= 2;

	// sL and incL live on the stack and may not be 16-byte aligned, so load them with the permute trick
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {

		//load mix buffer into vectors, assume aligned
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want. For the 2nd vector,
		//the order happens to be the same as the order we loaded them
		//in, so there's no need to permute that one
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		//store out results
		ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
}
#else

/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo

Assumptions:
	No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {

	float incL[12];
	float sL[12];

	int i;
	vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
	vector float vecSL1, vecSL2, vecSL3, vecSL4;
	vector float vecSamplesLd;
	vector float vecSamples1, vecSamples2, vecSamples3;
	vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
	// permute vectors for sample
	vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
	vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	// incL array, 6 elements repeated
	incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	// sL array repeated
	sL[0] = lastV[0];
	sL[1] = lastV[1];
	sL[2] = lastV[2];
	sL[3] = lastV[3];
	sL[4] = lastV[4];
	sL[5] = lastV[5];
	sL[6] = lastV[0] + incL[0];
	sL[7] = lastV[1] + incL[1];
	sL[8] = lastV[2] + incL[2];
	sL[9] = lastV[3] + incL[3];
	sL[10] = lastV[4] + incL[4];
	sL[11] = lastV[5] + incL[5];

	// multiply by 2 since doing 12 at a time
	incL[0] *= 2;
	incL[1] *= 2;
	incL[2] *= 2;
	incL[3] *= 2;
	incL[4] *= 2;
	incL[5] *= 2;
	incL[6] *= 2;
	incL[7] *= 2;
	incL[8] *= 2;
	incL[9] *= 2;
	incL[10] *= 2;
	incL[11] *= 2;

	// load the data
	vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
	vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
	vecIncl1 = vec_ld( 0, &incL[0] );
	vecIncl2 = vec_ld( 15, &incL[0] );
	vecIncl3 = vec_ld( 31, &incL[0] );
	vecIncl4 = vec_ld( 47, &incL[0] );

	vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
	vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
	vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );

	vecSL1 = vec_ld( 0, &sL[0] );
	vecSL2 = vec_ld( 15, &sL[0] );
	vecSL3 = vec_ld( 31, &sL[0] );
	vecSL4 = vec_ld( 47, &sL[0] );

	vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
	vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
	vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );

	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	vector float vecDest = vec_ld( 0, &mixBuffer[0] );

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {

		//load mix buffer into vectors
		vecMixBuffer1 = vecDest;
		vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
		vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
		vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );

		vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
		vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
		vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );

		//load samples into vector
		vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
		vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
		vecSamplesLast = vecSamplesLd2;

		//permute to get them ordered how we want. For the 2nd vector,
		//the order happens to be the same as the order we loaded them
		//in, so there's no need to permute that one
		vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
		vecSamples2 = vecSamplesLd;
		vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );

		//do calculation
		vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );

		// store results
		UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );

		// add for next iteration
		vecSL1 = vec_add( vecSL1, vecIncl1 );
		vecSL2 = vec_add( vecSL2, vecIncl2 );
		vecSL3 = vec_add( vecSL3, vecIncl3 );
	}
}

#endif

/*
============
idSIMD_AltiVec::MixedSoundToSamples
============
*/
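// The vector loop below clamps each float to the signed 16-bit range, converts with vec_cts
// (float -> signed int), then narrows with vec_pack (int -> short), writing 16 output samples
// per iteration; the scalar loops handle the unaligned prologue and any remaining tail samples.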
void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
	//this is basically a clamp for sound mixing
	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector signed int vi0, vi1, vi2, vi3;
	register vector signed short vs0, vs1;
	register vector float minVec, maxVec, constVec;
	int i = 0;

	//unaligned at start, since samples is not 16-byte aligned
	for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}

	constVec = (vector float)(65536.0f);

	//splat min/max into a vector
	minVec = (vector float)(-32768.0f);
	maxVec = (vector float)(32767.0f);

	vector float vecOld = vec_ld( 0, &mixBuffer[i] );
	vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );

	//vectorize!
	for ( ; i+15 < numSamples; i += 16 ) {
		//load source
		v0 = vecOld;
		v1 = vec_ld( 15, &mixBuffer[i] );
		v2 = vec_ld( 31, &mixBuffer[i] );
		v3 = vec_ld( 47, &mixBuffer[i] );
		vecOld = vec_ld( 63, &mixBuffer[i] );

		v0 = vec_perm( v0, v1, permVec );
		v1 = vec_perm( v1, v2, permVec );
		v2 = vec_perm( v2, v3, permVec );
		v3 = vec_perm( v3, vecOld, permVec );

		//apply minimum
		v4 = vec_max( v0, minVec );
		v5 = vec_max( v1, minVec );
		v6 = vec_max( v2, minVec );
		v7 = vec_max( v3, minVec );

		//apply maximum
		v4 = vec_min( v4, maxVec );
		v5 = vec_min( v5, maxVec );
		v6 = vec_min( v6, maxVec );
		v7 = vec_min( v7, maxVec );

		// convert floats to ints
		vi0 = vec_cts( v4, 0 );
		vi1 = vec_cts( v5, 0 );
		vi2 = vec_cts( v6, 0 );
		vi3 = vec_cts( v7, 0 );

		// pack ints into shorts
		vs0 = vec_pack( vi0, vi1 );
		vs1 = vec_pack( vi2, vi3 );
		ALIGNED_STORE2( &samples[i], vs0, vs1 );
	}

	//handle cleanup
	for ( ; i < numSamples ; i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}
}
#endif /* ENABLE_SOUND_ROUTINES */

#endif /* MACOS_X */
