1 /*
2 ===========================================================================
3 
4 Doom 3 GPL Source Code
5 Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
6 
7 This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
8 
9 Doom 3 Source Code is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Doom 3 Source Code is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.
21 
22 In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.
23 
24 If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
25 
26 ===========================================================================
27 */
28 
29 #ifndef __MATH_SIMD_ALTIVEC_H__
30 #define __MATH_SIMD_ALTIVEC_H__
31 
32 #include "idlib/math/Simd_Generic.h"
33 
34 /*
35 ===============================================================================
36 
37 	AltiVec implementation of idSIMDProcessor
38 
39 ===============================================================================
40 */
41 
42 // Defines for enabling parts of the library
43 
44 // Turns on/off the simple math routines (add, sub, div, etc)
45 #define ENABLE_SIMPLE_MATH
46 
47 // Turns on/off the dot routines
48 #define ENABLE_DOT
49 
50 // Turns on/off the compare routines
51 #define ENABLE_COMPARES
52 
53 // The MinMax routines introduce a couple of bugs. In the bathroom of the alphalabs2 map, the
// wrong surface appears in the mirror at times. It also introduces a noticeable delay when map
55 // data is loaded such as going through doors.
56 // Turns on/off MinMax routines
57 //#define ENABLE_MINMAX
58 
59 // Turns on/off Clamp routines
60 #define ENABLE_CLAMP
61 
62 // Turns on/off XXX16 routines
63 #define ENABLE_16ROUTINES
64 
65 // Turns on/off LowerTriangularSolve, LowerTriangularSolveTranspose, and MatX_LDLTFactor
66 #define ENABLE_LOWER_TRIANGULAR
67 
68 // Turns on/off TracePointCull, DecalPointCull, and OverlayPoint
// The Enable_Cull routines break the g_decals functionality; DecalPointCull is
70 // the likely suspect. Bullet holes do not appear on the walls when this optimization
71 // is enabled.
72 //#define ENABLE_CULL
73 
74 // Turns on/off DeriveTriPlanes, DeriveTangents, DeriveUnsmoothedTangents, NormalizeTangents
75 #define ENABLE_DERIVE
76 
77 // Turns on/off CreateTextureSpaceLightVectors, CreateShadowCache, CreateVertexProgramShadowCache
78 #define ENABLE_CREATE
79 
80 // Turns on/off the sound routines
81 #define ENABLE_SOUND_ROUTINES
82 
83 // Turns on/off the stuff that isn't on elsewhere
84 // Currently: BlendJoints, TransformJoints, UntransformJoints, ConvertJointQuatsToJointMats, and
85 // ConvertJointMatsToJointQuats
86 #define LIVE_VICARIOUSLY
87 
88 // This assumes that the dest (and mixBuffer) array to the sound functions is aligned. If this is not true, we take a large
89 // performance hit from having to do unaligned stores
90 //#define SOUND_DEST_ALIGNED
91 
92 // This assumes that the vertexCache array to CreateShadowCache and CreateVertexProgramShadowCache is aligned. If it's not,
93 // then we take a big performance hit from unaligned stores.
94 //#define VERTEXCACHE_ALIGNED
95 
96 // This turns on support for PPC intrinsics in the SIMD_AltiVec.cpp file. Right now it's only used for frsqrte. GCC
97 // supports these intrinsics but XLC does not.
98 #if defined(__GNUC__) && defined(__ALTIVEC__)
99     #define PPC_INTRINSICS
100 #endif
101 
// This assumes that the idDrawVert array that is used in DeriveUnsmoothedTangents is aligned. If it's not aligned,
103 // then we don't get any speedup
104 //#define DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
105 
106 // Disable DRAWVERT_PADDED since we disabled the ENABLE_CULL optimizations and the default
107 // implementation does not allow for the extra padding.
108 // This assumes that idDrawVert has been padded by 4 bytes so that xyz always starts at an aligned
109 // address
110 //#define DRAWVERT_PADDED
111 
// AltiVec-accelerated implementation of the idSIMDProcessor interface.
// The entire override set is compiled only for Mac OS X builds using GCC with
// AltiVec enabled (see the #if below); on every other configuration this class
// declares nothing of its own and behaves exactly like idSIMD_Generic.
// Which groups of routines are overridden is controlled by the ENABLE_* /
// LIVE_VICARIOUSLY defines earlier in this header.
class idSIMD_AltiVec : public idSIMD_Generic {
#if defined(MACOS_X) && defined(__GNUC__) && defined(__ALTIVEC__)
public:

	// Name of this SIMD implementation, e.g. for the "set com_forceGenericSIMD" / startup log.
	virtual const char * VPCALL GetName( void ) const;

#ifdef ENABLE_SIMPLE_MATH
	// Basic math, works for both aligned and unaligned data
	virtual void VPCALL Add( float *dst, const float constant, const float *src, const int count );
	virtual void VPCALL Add( float *dst, const float *src0, const float *src1, const int count );
	virtual void VPCALL Sub( float *dst, const float constant, const float *src, const int count );
	virtual void VPCALL Sub( float *dst, const float *src0, const float *src1, const int count );
	virtual void VPCALL Mul( float *dst, const float constant, const float *src, const int count);
	virtual void VPCALL Mul( float *dst, const float *src0, const float *src1, const int count );
	virtual void VPCALL Div( float *dst, const float constant, const float *divisor, const int count );
	virtual void VPCALL Div( float *dst, const float *src0, const float *src1, const int count );
	virtual void VPCALL MulAdd( float *dst, const float constant, const float *src, const int count );
	virtual void VPCALL MulAdd( float *dst, const float *src0, const float *src1, const int count );
	virtual void VPCALL MulSub( float *dst, const float constant, const float *src, const int count );
	virtual void VPCALL MulSub( float *dst, const float *src0, const float *src1, const int count );
#endif

#ifdef ENABLE_DOT
	// Dot products, expects data structures in contiguous memory
	virtual void VPCALL Dot( float *dst,			const idVec3 &constant,	const idVec3 *src,		const int count );
	virtual void VPCALL Dot( float *dst,			const idVec3 &constant,	const idPlane *src,		const int count );
	virtual void VPCALL Dot( float *dst,			const idVec3 &constant,	const idDrawVert *src,	const int count );
	virtual void VPCALL Dot( float *dst,			const idPlane &constant,const idVec3 *src,		const int count );
	virtual void VPCALL Dot( float *dst,			const idPlane &constant,const idPlane *src,		const int count );
	virtual void VPCALL Dot( float *dst,			const idPlane &constant,const idDrawVert *src,	const int count );
	virtual void VPCALL Dot( float *dst,			const idVec3 *src0,		const idVec3 *src1,		const int count );
	virtual void VPCALL Dot( float &dot,			const float *src1,		const float *src2,		const int count );
#endif

#ifdef ENABLE_COMPARES
	// Comparisons, works for both aligned and unaligned data.
	// The bitNum variants OR the comparison result into bit bitNum of each dst byte.
	virtual void VPCALL CmpGT( byte *dst,			const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpGT( byte *dst,			const byte bitNum,		const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpGE( byte *dst,			const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpGE( byte *dst,			const byte bitNum,		const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpLT( byte *dst,			const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpLT( byte *dst,			const byte bitNum,		const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpLE( byte *dst,			const float *src0,		const float constant,	const int count );
	virtual void VPCALL CmpLE( byte *dst,			const byte bitNum,		const float *src0,		const float constant,	const int count );
#endif

#ifdef ENABLE_MINMAX
	// Min/Max. Expects data structures in contiguous memory.
	// NOTE: disabled by default — see the ENABLE_MINMAX comment at the top of this file
	// (known rendering bug in alphalabs2 and load-time delays).
	virtual void VPCALL MinMax( float &min,			float &max,				const float *src,		const int count );
	virtual	void VPCALL MinMax( idVec2 &min,		idVec2 &max,			const idVec2 *src,		const int count );
	virtual void VPCALL MinMax( idVec3 &min,		idVec3 &max,			const idVec3 *src,		const int count );
	virtual	void VPCALL MinMax( idVec3 &min,		idVec3 &max,			const idDrawVert *src,	const int count );
	virtual	void VPCALL MinMax( idVec3 &min,		idVec3 &max,			const idDrawVert *src,	const int *indexes,		const int count );
#endif

#ifdef ENABLE_CLAMP
	// Clamp operations. Works for both aligned and unaligned data
	virtual void VPCALL Clamp( float *dst,			const float *src,		const float min,		const float max,		const int count );
	virtual void VPCALL ClampMin( float *dst,		const float *src,		const float min,		const int count );
	virtual void VPCALL ClampMax( float *dst,		const float *src,		const float max,		const int count );
#endif

	// These are already using memcpy and memset functions. Leaving default implementation
//	virtual void VPCALL Memcpy( void *dst,			const void *src,		const int count );
//	virtual void VPCALL Memset( void *dst,			const int val,			const int count );

#ifdef ENABLE_16ROUTINES
	// Operations that expect 16-byte aligned data and 16-byte padded memory (with zeros), generally faster
	virtual void VPCALL Zero16( float *dst,			const int count );
	virtual void VPCALL Negate16( float *dst,		const int count );
	virtual void VPCALL Copy16( float *dst,			const float *src,		const int count );
	virtual void VPCALL Add16( float *dst,			const float *src1,		const float *src2,		const int count );
	virtual void VPCALL Sub16( float *dst,			const float *src1,		const float *src2,		const int count );
	virtual void VPCALL Mul16( float *dst,			const float *src1,		const float constant,	const int count );
	virtual void VPCALL AddAssign16( float *dst,	const float *src,		const int count );
	virtual void VPCALL SubAssign16( float *dst,	const float *src,		const int count );
	virtual void VPCALL MulAssign16( float *dst,	const float constant,	const int count );
#endif

//  Most of these deal with tiny matrices or vectors, generally not worth altivec'ing since
//  the scalar code is already really fast

//	virtual void VPCALL MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec );
//	virtual void VPCALL MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 );
//	virtual void VPCALL MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 );

#ifdef ENABLE_LOWER_TRIANGULAR
	// Triangular solvers and LDLT factorization used by the physics/constraint code
	virtual void VPCALL MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip = 0 );
	virtual void VPCALL MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n );
	virtual bool VPCALL MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n );
#endif
#ifdef LIVE_VICARIOUSLY
	// Joint blending and joint quat <-> matrix conversion (skeletal animation)
	virtual void VPCALL BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints );
	virtual void VPCALL ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints );
	virtual void VPCALL ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints );
#endif

#ifdef LIVE_VICARIOUSLY
	// Joint hierarchy transforms and skinning of vertices by joint matrices
	virtual void VPCALL TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint );
	virtual void VPCALL UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint );
	virtual void VPCALL TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights );
#endif

#ifdef ENABLE_CULL
	// Point-vs-plane culling.
	// NOTE: disabled by default — see the ENABLE_CULL comment at the top of this file
	// (breaks g_decals; bullet holes fail to appear).
	virtual void VPCALL TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts );
	virtual void VPCALL DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts );
	virtual void VPCALL OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts );
#endif

#ifdef ENABLE_DERIVE
	// Derivation of per-triangle planes and per-vertex tangent spaces for rendering
	virtual void VPCALL DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes );
	virtual void VPCALL DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes );
	virtual void VPCALL DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts );
	virtual void VPCALL NormalizeTangents( idDrawVert *verts, const int numVerts );
#endif

#ifdef ENABLE_CREATE
	// Construction of per-vertex light vectors, specular texcoords, and shadow volume caches
	virtual void VPCALL CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes );
	virtual void VPCALL CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes );
	virtual int  VPCALL CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts );
	virtual int  VPCALL CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts );
#endif

#ifdef ENABLE_SOUND_ROUTINES
	// Sound upsampling and mixing routines, works for aligned and unaligned data
	// (aligned dest/mixBuffer is faster — see SOUND_DEST_ALIGNED above)
	virtual void VPCALL UpSamplePCMTo44kHz( float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels );
	virtual void VPCALL UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels );
	virtual void VPCALL MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] );
	virtual void VPCALL MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] );
	virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
	virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
	virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples );
#endif
#endif	// MACOS_X && __GNUC__ && __ALTIVEC__

};
253 
254 #endif /* !__MATH_SIMD_ALTIVEC_H__ */
255