/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#ifndef __MATH_SIMD_ALTIVEC_H__
#define __MATH_SIMD_ALTIVEC_H__

#include "idlib/math/Simd_Generic.h"

/*
===============================================================================

	AltiVec implementation of idSIMDProcessor

===============================================================================
*/

// Defines for enabling parts of the library

// Turns on/off the simple math routines (add, sub, div, etc)
#define ENABLE_SIMPLE_MATH

// Turns on/off the dot routines
#define ENABLE_DOT

// Turns on/off the compare routines
#define ENABLE_COMPARES

// The MinMax routines introduce a couple of bugs. In the bathroom of the alphalabs2 map, the
// wrong surface appears in the mirror at times. It also introduces a noticeable delay when map
// data is loaded such as going through doors.
// Turns on/off MinMax routines
//#define ENABLE_MINMAX

// Turns on/off Clamp routines
#define ENABLE_CLAMP

// Turns on/off XXX16 routines
#define ENABLE_16ROUTINES

// Turns on/off LowerTriangularSolve, LowerTriangularSolveTranspose, and MatX_LDLTFactor
#define ENABLE_LOWER_TRIANGULAR

// Turns on/off TracePointCull, DecalPointCull, and OverlayPointCull
// The Enable_Cull routines break the g_decals functionality; DecalPointCull is
// the likely suspect. Bullet holes do not appear on the walls when this optimization
// is enabled.
//#define ENABLE_CULL

// Turns on/off DeriveTriPlanes, DeriveTangents, DeriveUnsmoothedTangents, NormalizeTangents
#define ENABLE_DERIVE

// Turns on/off CreateTextureSpaceLightVectors, CreateShadowCache, CreateVertexProgramShadowCache
#define ENABLE_CREATE

// Turns on/off the sound routines
#define ENABLE_SOUND_ROUTINES

// Turns on/off the stuff that isn't on elsewhere
// Currently: BlendJoints, TransformJoints, UntransformJoints, ConvertJointQuatsToJointMats, and
// ConvertJointMatsToJointQuats
#define LIVE_VICARIOUSLY

// This assumes that the dest (and mixBuffer) array to the sound functions is aligned. If this is not true, we take a large
// performance hit from having to do unaligned stores
//#define SOUND_DEST_ALIGNED

// This assumes that the vertexCache array to CreateShadowCache and CreateVertexProgramShadowCache is aligned. If it's not,
// then we take a big performance hit from unaligned stores.
//#define VERTEXCACHE_ALIGNED

// This turns on support for PPC intrinsics in the SIMD_AltiVec.cpp file. Right now it's only used for frsqrte. GCC
// supports these intrinsics but XLC does not.
#if defined(__GNUC__) && defined(__ALTIVEC__)
#define PPC_INTRINSICS
#endif

// This assumes that the idDrawVert array that is used in DeriveUnsmoothedTangents is aligned. If it's not aligned,
// then we don't get any speedup
//#define DERIVE_UNSMOOTH_DRAWVERT_ALIGNED

// Disable DRAWVERT_PADDED since we disabled the ENABLE_CULL optimizations and the default
// implementation does not allow for the extra padding.
108 // This assumes that idDrawVert has been padded by 4 bytes so that xyz always starts at an aligned 109 // address 110 //#define DRAWVERT_PADDED 111 112 class idSIMD_AltiVec : public idSIMD_Generic { 113 #if defined(MACOS_X) && defined(__GNUC__) && defined(__ALTIVEC__) 114 public: 115 116 virtual const char * VPCALL GetName( void ) const; 117 118 #ifdef ENABLE_SIMPLE_MATH 119 // Basic math, works for both aligned and unaligned data 120 virtual void VPCALL Add( float *dst, const float constant, const float *src, const int count ); 121 virtual void VPCALL Add( float *dst, const float *src0, const float *src1, const int count ); 122 virtual void VPCALL Sub( float *dst, const float constant, const float *src, const int count ); 123 virtual void VPCALL Sub( float *dst, const float *src0, const float *src1, const int count ); 124 virtual void VPCALL Mul( float *dst, const float constant, const float *src, const int count); 125 virtual void VPCALL Mul( float *dst, const float *src0, const float *src1, const int count ); 126 virtual void VPCALL Div( float *dst, const float constant, const float *divisor, const int count ); 127 virtual void VPCALL Div( float *dst, const float *src0, const float *src1, const int count ); 128 virtual void VPCALL MulAdd( float *dst, const float constant, const float *src, const int count ); 129 virtual void VPCALL MulAdd( float *dst, const float *src0, const float *src1, const int count ); 130 virtual void VPCALL MulSub( float *dst, const float constant, const float *src, const int count ); 131 virtual void VPCALL MulSub( float *dst, const float *src0, const float *src1, const int count ); 132 #endif 133 134 #ifdef ENABLE_DOT 135 // Dot products, expects data structures in contiguous memory 136 virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ); 137 virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ); 138 virtual void VPCALL Dot( float *dst, const 
idVec3 &constant, const idDrawVert *src, const int count ); 139 virtual void VPCALL Dot( float *dst, const idPlane &constant,const idVec3 *src, const int count ); 140 virtual void VPCALL Dot( float *dst, const idPlane &constant,const idPlane *src, const int count ); 141 virtual void VPCALL Dot( float *dst, const idPlane &constant,const idDrawVert *src, const int count ); 142 virtual void VPCALL Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ); 143 virtual void VPCALL Dot( float &dot, const float *src1, const float *src2, const int count ); 144 #endif 145 146 #ifdef ENABLE_COMPARES 147 // Comparisons, works for both aligned and unaligned data 148 virtual void VPCALL CmpGT( byte *dst, const float *src0, const float constant, const int count ); 149 virtual void VPCALL CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ); 150 virtual void VPCALL CmpGE( byte *dst, const float *src0, const float constant, const int count ); 151 virtual void VPCALL CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ); 152 virtual void VPCALL CmpLT( byte *dst, const float *src0, const float constant, const int count ); 153 virtual void VPCALL CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ); 154 virtual void VPCALL CmpLE( byte *dst, const float *src0, const float constant, const int count ); 155 virtual void VPCALL CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ); 156 #endif 157 158 #ifdef ENABLE_MINMAX 159 // Min/Max. 
Expects data structures in contiguous memory 160 virtual void VPCALL MinMax( float &min, float &max, const float *src, const int count ); 161 virtual void VPCALL MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ); 162 virtual void VPCALL MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ); 163 virtual void VPCALL MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ); 164 virtual void VPCALL MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ); 165 #endif 166 167 #ifdef ENABLE_CLAMP 168 // Clamp operations. Works for both aligned and unaligned data 169 virtual void VPCALL Clamp( float *dst, const float *src, const float min, const float max, const int count ); 170 virtual void VPCALL ClampMin( float *dst, const float *src, const float min, const int count ); 171 virtual void VPCALL ClampMax( float *dst, const float *src, const float max, const int count ); 172 #endif 173 174 // These are already using memcpy and memset functions. 
Leaving default implementation 175 // virtual void VPCALL Memcpy( void *dst, const void *src, const int count ); 176 // virtual void VPCALL Memset( void *dst, const int val, const int count ); 177 178 #ifdef ENABLE_16ROUTINES 179 // Operations that expect 16-byte aligned data and 16-byte padded memory (with zeros), generally faster 180 virtual void VPCALL Zero16( float *dst, const int count ); 181 virtual void VPCALL Negate16( float *dst, const int count ); 182 virtual void VPCALL Copy16( float *dst, const float *src, const int count ); 183 virtual void VPCALL Add16( float *dst, const float *src1, const float *src2, const int count ); 184 virtual void VPCALL Sub16( float *dst, const float *src1, const float *src2, const int count ); 185 virtual void VPCALL Mul16( float *dst, const float *src1, const float constant, const int count ); 186 virtual void VPCALL AddAssign16( float *dst, const float *src, const int count ); 187 virtual void VPCALL SubAssign16( float *dst, const float *src, const int count ); 188 virtual void VPCALL MulAssign16( float *dst, const float constant, const int count ); 189 #endif 190 191 // Most of these deal with tiny matrices or vectors, generally not worth altivec'ing since 192 // the scalar code is already really fast 193 194 // virtual void VPCALL MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 195 // virtual void VPCALL MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 196 // virtual void VPCALL MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 197 // virtual void VPCALL MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 198 // virtual void VPCALL MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 199 // virtual void VPCALL MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ); 200 // virtual void VPCALL MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ); 
201 // virtual void VPCALL MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ); 202 203 #ifdef ENABLE_LOWER_TRIANGULAR 204 virtual void VPCALL MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip = 0 ); 205 virtual void VPCALL MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ); 206 virtual bool VPCALL MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ); 207 #endif 208 #ifdef LIVE_VICARIOUSLY 209 virtual void VPCALL BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ); 210 virtual void VPCALL ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ); 211 virtual void VPCALL ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ); 212 #endif 213 214 #ifdef LIVE_VICARIOUSLY 215 virtual void VPCALL TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ); 216 virtual void VPCALL UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ); 217 virtual void VPCALL TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ); 218 #endif 219 220 #ifdef ENABLE_CULL 221 virtual void VPCALL TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ); 222 virtual void VPCALL DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ); 223 virtual void VPCALL OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ); 224 #endif 225 226 #ifdef ENABLE_DERIVE 227 virtual void VPCALL DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int 
numVerts, const int *indexes, const int numIndexes ); 228 virtual void VPCALL DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ); 229 virtual void VPCALL DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ); 230 virtual void VPCALL NormalizeTangents( idDrawVert *verts, const int numVerts ); 231 #endif 232 233 #ifdef ENABLE_CREATE 234 virtual void VPCALL CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ); 235 virtual void VPCALL CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ); 236 virtual int VPCALL CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ); 237 virtual int VPCALL CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ); 238 #endif 239 240 #ifdef ENABLE_SOUND_ROUTINES 241 // Sound upsampling and mixing routines, works for aligned and unaligned data 242 virtual void VPCALL UpSamplePCMTo44kHz( float *dest, const short *pcm, const int numSamples, const int kHz, const int numChannels ); 243 virtual void VPCALL UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ); 244 virtual void VPCALL MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ); 245 virtual void VPCALL MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ); 246 virtual void VPCALL MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const 
float currentV[6] ); 247 virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ); 248 virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ); 249 #endif 250 #endif 251 252 }; 253 254 #endif /* !__MATH_SIMD_ALTIVEC_H__ */ 255