1/* 2Copyright (c) 2012 Advanced Micro Devices, Inc. 3 4This software is provided 'as-is', without any express or implied warranty. 5In no event will the authors be held liable for any damages arising from the use of this software. 6Permission is granted to anyone to use this software for any purpose, 7including commercial applications, and to alter it and redistribute it freely, 8subject to the following restrictions: 9 101. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 112. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 123. This notice may not be removed or altered from any source distribution. 13*/ 14//Originally written by Takahiro Harada 15 16 17//#pragma OPENCL EXTENSION cl_amd_printf : enable 18#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable 19#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable 20#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable 21#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable 22 23 24#ifdef cl_ext_atomic_counters_32 25#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable 26#else 27#define counter32_t volatile global int* 28#endif 29 30typedef unsigned int u32; 31typedef unsigned short u16; 32typedef unsigned char u8; 33 34#define GET_GROUP_IDX get_group_id(0) 35#define GET_LOCAL_IDX get_local_id(0) 36#define GET_GLOBAL_IDX get_global_id(0) 37#define GET_GROUP_SIZE get_local_size(0) 38#define GET_NUM_GROUPS get_num_groups(0) 39#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) 40#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) 41#define AtomInc(x) atom_inc(&(x)) 42#define AtomInc1(x, out) out = atom_inc(&(x)) 43#define AppendInc(x, out) out = atomic_inc(x) 44#define AtomAdd(x, value) atom_add(&(x), value) 45#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) 46#define AtomXhg(x, value) atom_xchg ( &(x), value ) 47 48 49#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) 50 51#define mymake_float4 (float4) 52//#define make_float2 (float2) 53//#define make_uint4 (uint4) 54//#define make_int4 (int4) 55//#define make_uint2 (uint2) 56//#define make_int2 (int2) 57 58 59#define max2 max 60#define min2 min 61 62 63/////////////////////////////////////// 64// Vector 65/////////////////////////////////////// 66 67 68 69 70__inline 71float4 fastNormalize4(float4 v) 72{ 73 return fast_normalize(v); 74} 75 76 77 78__inline 79float4 cross3(float4 a, float4 b) 80{ 81 return cross(a,b); 82} 83 84__inline 85float dot3F4(float4 a, float4 b) 86{ 87 float4 a1 = mymake_float4(a.xyz,0.f); 88 float4 b1 = mymake_float4(b.xyz,0.f); 89 return dot(a1, b1); 90} 91 92 93 94 95__inline 96float4 normalize3(const float4 a) 97{ 98 float4 n = mymake_float4(a.x, a.y, a.z, 0.f); 99 return fastNormalize4( n ); 100// float length = sqrtf(dot3F4(a, a)); 101// return 1.f/length * a; 102} 103 104 105 106 107/////////////////////////////////////// 108// Matrix3x3 109/////////////////////////////////////// 110 111typedef struct 112{ 113 float4 m_row[3]; 114}Matrix3x3; 115 116 117 118 119 120 121__inline 122float4 mtMul1(Matrix3x3 a, float4 b); 123 124__inline 125float4 mtMul3(float4 a, Matrix3x3 b); 126 127 128 129 130__inline 131float4 mtMul1(Matrix3x3 a, float4 b) 132{ 133 float4 ans; 134 ans.x = dot3F4( a.m_row[0], b ); 135 ans.y = dot3F4( a.m_row[1], b ); 136 ans.z = dot3F4( a.m_row[2], b ); 137 ans.w = 0.f; 138 return ans; 139} 140 141__inline 142float4 mtMul3(float4 a, Matrix3x3 b) 143{ 144 float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); 145 float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); 146 float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); 147 148 float4 ans; 149 ans.x = dot3F4( a, colx ); 150 ans.y = dot3F4( a, coly ); 151 ans.z = dot3F4( a, colz ); 152 return ans; 153} 154 155/////////////////////////////////////// 156// Quaternion 157/////////////////////////////////////// 158 159typedef float4 Quaternion; 160 161 162 163 164 165 166 167#define WG_SIZE 64 168 169typedef struct 170{ 171 float4 m_pos; 172 Quaternion m_quat; 173 float4 m_linVel; 174 float4 m_angVel; 175 176 u32 m_shapeIdx; 177 float m_invMass; 178 float m_restituitionCoeff; 179 float m_frictionCoeff; 180} Body; 181 182typedef struct 183{ 184 Matrix3x3 m_invInertia; 185 Matrix3x3 m_initInvInertia; 186} Shape; 187 188typedef struct 189{ 190 float4 m_linear; 191 float4 m_worldPos[4]; 192 float4 m_center; 193 float m_jacCoeffInv[4]; 194 float m_b[4]; 195 float m_appliedRambdaDt[4]; 196 197 float m_fJacCoeffInv[2]; 198 float m_fAppliedRambdaDt[2]; 199 200 u32 m_bodyA; 201 u32 m_bodyB; 202 203 int m_batchIdx; 204 u32 m_paddings[1]; 205} Constraint4; 206 207 208 209typedef struct 210{ 211 int m_nConstraints; 212 int m_start; 213 int m_batchIdx; 214 int m_nSplit; 215// int m_paddings[1]; 216} ConstBuffer; 217 218typedef struct 219{ 220 int m_solveFriction; 221 int m_maxBatch; // long batch really kills the performance 222 int m_batchIdx; 223 int m_nSplit; 224// int m_paddings[1]; 225} ConstBufferBatchSolve; 226 227void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1); 228 229void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) 230{ 231 *linear = mymake_float4(-n.xyz,0.f); 232 *angular0 = -cross3(r0, n); 233 *angular1 = cross3(r1, n); 234} 235 236float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ); 237 238float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) 239{ 240 return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); 241} 242 243 244float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, 245 float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1); 246 247float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, 248 float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1) 249{ 250 // linear0,1 are normlized 251 float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; 252 float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); 253 float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; 254 float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); 255 return -1.f/(jmj0+jmj1+jmj2+jmj3); 256} 257 258 259void solveContact(__global Constraint4* cs, 260 float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, 261 float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB); 262 263void solveContact(__global Constraint4* cs, 264 float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, 265 float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB) 266{ 267 float minRambdaDt = 0; 268 float maxRambdaDt = FLT_MAX; 269 270 for(int ic=0; ic<4; ic++) 271 { 272 if( cs->m_jacCoeffInv[ic] == 0.f ) continue; 273 274 float4 angular0, angular1, linear; 275 float4 r0 = cs->m_worldPos[ic] - posA; 276 float4 r1 = cs->m_worldPos[ic] - posB; 277 setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); 278 279 float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, 280 *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic]; 281 rambdaDt *= cs->m_jacCoeffInv[ic]; 282 283 { 284 float prevSum = cs->m_appliedRambdaDt[ic]; 285 float updated = prevSum; 286 updated += rambdaDt; 287 updated = max2( updated, minRambdaDt ); 288 updated = min2( updated, maxRambdaDt ); 289 rambdaDt = updated - prevSum; 290 cs->m_appliedRambdaDt[ic] = updated; 291 } 292 293 float4 linImp0 = invMassA*linear*rambdaDt; 294 float4 linImp1 = invMassB*(-linear)*rambdaDt; 295 float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; 296 float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; 297 298 *linVelA += linImp0; 299 *angVelA += angImp0; 300 *linVelB += linImp1; 301 *angVelB += angImp1; 302 } 303} 304 305void btPlaneSpace1 (const float4* n, float4* p, float4* q); 306 void btPlaneSpace1 (const float4* n, float4* p, float4* q) 307{ 308 if (fabs(n[0].z) > 0.70710678f) { 309 // choose p in y-z plane 310 float a = n[0].y*n[0].y + n[0].z*n[0].z; 311 float k = 1.f/sqrt(a); 312 p[0].x = 0; 313 p[0].y = -n[0].z*k; 314 p[0].z = n[0].y*k; 315 // set q = n x p 316 q[0].x = a*k; 317 q[0].y = -n[0].x*p[0].z; 318 q[0].z = n[0].x*p[0].y; 319 } 320 else { 321 // choose p in x-y plane 322 float a = n[0].x*n[0].x + n[0].y*n[0].y; 323 float k = 1.f/sqrt(a); 324 p[0].x = -n[0].y*k; 325 p[0].y = n[0].x*k; 326 p[0].z = 0; 327 // set q = n x p 328 q[0].x = -n[0].z*p[0].y; 329 q[0].y = n[0].z*p[0].x; 330 q[0].z = a*k; 331 } 332} 333 334void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); 335void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) 336{ 337 //float frictionCoeff = ldsCs[0].m_linear.w; 338 int aIdx = ldsCs[0].m_bodyA; 339 int bIdx = ldsCs[0].m_bodyB; 340 341 float4 posA = gBodies[aIdx].m_pos; 342 float4 linVelA = gBodies[aIdx].m_linVel; 343 float4 angVelA = gBodies[aIdx].m_angVel; 344 float invMassA = gBodies[aIdx].m_invMass; 345 Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; 346 347 float4 posB = gBodies[bIdx].m_pos; 348 float4 linVelB = gBodies[bIdx].m_linVel; 349 float4 angVelB = gBodies[bIdx].m_angVel; 350 float invMassB = gBodies[bIdx].m_invMass; 351 Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; 352 353 solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, 354 posB, &linVelB, &angVelB, invMassB, invInertiaB ); 355 356 if (gBodies[aIdx].m_invMass) 357 { 358 gBodies[aIdx].m_linVel = linVelA; 359 gBodies[aIdx].m_angVel = angVelA; 360 } else 361 { 362 gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); 363 gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); 364 365 } 366 if (gBodies[bIdx].m_invMass) 367 { 368 gBodies[bIdx].m_linVel = linVelB; 369 gBodies[bIdx].m_angVel = angVelB; 370 } else 371 { 372 gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); 373 gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); 374 375 } 376 377} 378 379 380 381typedef struct 382{ 383 int m_valInt0; 384 int m_valInt1; 385 int m_valInt2; 386 int m_valInt3; 387 388 float m_val0; 389 float m_val1; 390 float m_val2; 391 float m_val3; 392} SolverDebugInfo; 393 394 395 396 397__kernel 398__attribute__((reqd_work_group_size(WG_SIZE,1,1))) 399void BatchSolveKernelContact(__global Body* gBodies, 400 __global Shape* gShapes, 401 __global Constraint4* gConstraints, 402 __global int* gN, 403 __global int* gOffsets, 404 __global int* batchSizes, 405 int maxBatch1, 406 int cellBatch, 407 int4 nSplit 408 ) 409{ 410 //__local int ldsBatchIdx[WG_SIZE+1]; 411 __local int ldsCurBatch; 412 __local int ldsNextBatch; 413 __local int ldsStart; 414 415 int lIdx = GET_LOCAL_IDX; 416 int wgIdx = GET_GROUP_IDX; 417 418// int gIdx = GET_GLOBAL_IDX; 419// debugInfo[gIdx].m_valInt0 = gIdx; 420 //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; 421 422 423 424 425 int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); 426 int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); 427 int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); 428 int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); 429 int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); 430 431 //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); 432 //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); 433 //int cellIdx = xIdx+yIdx*nSplit; 434 435 if( gN[cellIdx] == 0 ) 436 return; 437 438 int maxBatch = batchSizes[cellIdx]; 439 440 441 const int start = gOffsets[cellIdx]; 442 const int end = start + gN[cellIdx]; 443 444 445 446 447 if( lIdx == 0 ) 448 { 449 ldsCurBatch = 0; 450 ldsNextBatch = 0; 451 ldsStart = start; 452 } 453 454 455 GROUP_LDS_BARRIER; 456 457 int idx=ldsStart+lIdx; 458 while (ldsCurBatch < maxBatch) 459 { 460 for(; idx<end; ) 461 { 462 if (gConstraints[idx].m_batchIdx == ldsCurBatch) 463 { 464 solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); 465 466 idx+=64; 467 } else 468 { 469 break; 470 } 471 } 472 GROUP_LDS_BARRIER; 473 474 if( lIdx == 0 ) 475 { 476 ldsCurBatch++; 477 } 478 GROUP_LDS_BARRIER; 479 } 480 481 482} 483 484 485 486__kernel void solveSingleContactKernel(__global Body* gBodies, 487 __global Shape* gShapes, 488 __global Constraint4* gConstraints, 489 int cellIdx, 490 int batchOffset, 491 int numConstraintsInBatch 492 ) 493{ 494 495 int index = get_global_id(0); 496 if (index < numConstraintsInBatch) 497 { 498 int idx=batchOffset+index; 499 solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); 500 } 501} 502