1/* 2Copyright (c) 2012 Advanced Micro Devices, Inc. 3 4This software is provided 'as-is', without any express or implied warranty. 5In no event will the authors be held liable for any damages arising from the use of this software. 6Permission is granted to anyone to use this software for any purpose, 7including commercial applications, and to alter it and redistribute it freely, 8subject to the following restrictions: 9 101. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 112. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 123. This notice may not be removed or altered from any source distribution. 13*/ 14//Originally written by Takahiro Harada 15 16 17#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" 18 19#pragma OPENCL EXTENSION cl_amd_printf : enable 20#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable 21#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable 22#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable 23#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable 24 25 26#ifdef cl_ext_atomic_counters_32 27#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable 28#else 29#define counter32_t volatile global int* 30#endif 31 32typedef unsigned int u32; 33typedef unsigned short u16; 34typedef unsigned char u8; 35 36#define GET_GROUP_IDX get_group_id(0) 37#define GET_LOCAL_IDX get_local_id(0) 38#define GET_GLOBAL_IDX get_global_id(0) 39#define GET_GROUP_SIZE get_local_size(0) 40#define GET_NUM_GROUPS get_num_groups(0) 41#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) 42#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) 43#define AtomInc(x) atom_inc(&(x)) 44#define AtomInc1(x, out) out = atom_inc(&(x)) 45#define AppendInc(x, out) out = atomic_inc(x) 46#define AtomAdd(x, value) atom_add(&(x), value) 47#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) 48#define AtomXhg(x, value) atom_xchg ( &(x), value ) 49 50 51#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) 52 53#define make_float4 (float4) 54#define make_float2 (float2) 55#define make_uint4 (uint4) 56#define make_int4 (int4) 57#define make_uint2 (uint2) 58#define make_int2 (int2) 59 60 61#define max2 max 62#define min2 min 63 64 65/////////////////////////////////////// 66// Vector 67/////////////////////////////////////// 68__inline 69float fastDiv(float numerator, float denominator) 70{ 71 return native_divide(numerator, denominator); 72// return numerator/denominator; 73} 74 75__inline 76float4 fastDiv4(float4 numerator, float4 denominator) 77{ 78 return native_divide(numerator, denominator); 79} 80 81__inline 82float fastSqrtf(float f2) 83{ 84 return native_sqrt(f2); 85// return sqrt(f2); 86} 87 88__inline 89float fastRSqrt(float f2) 90{ 91 return native_rsqrt(f2); 92} 93 94__inline 95float fastLength4(float4 v) 96{ 97 return fast_length(v); 98} 99 100__inline 101float4 fastNormalize4(float4 v) 102{ 103 return fast_normalize(v); 104} 105 106 107__inline 108float sqrtf(float a) 109{ 110// return sqrt(a); 111 return native_sqrt(a); 112} 113 114__inline 115float4 cross3(float4 a, float4 b) 116{ 117 return cross(a,b); 118} 119 120__inline 121float dot3F4(float4 a, float4 b) 122{ 123 float4 a1 = make_float4(a.xyz,0.f); 124 float4 b1 = make_float4(b.xyz,0.f); 125 return dot(a1, b1); 126} 127 128__inline 129float length3(const float4 a) 130{ 131 return sqrtf(dot3F4(a,a)); 132} 133 134__inline 135float dot4(const float4 a, const float4 b) 136{ 137 return dot( a, b ); 138} 139 140// for height 141__inline 142float dot3w1(const float4 point, const float4 eqn) 143{ 144 return dot3F4(point,eqn) + eqn.w; 145} 146 147__inline 148float4 normalize3(const float4 a) 149{ 150 float4 n = make_float4(a.x, a.y, a.z, 0.f); 151 return fastNormalize4( n ); 152// float length = sqrtf(dot3F4(a, a)); 153// return 1.f/length * a; 154} 155 156__inline 157float4 normalize4(const float4 a) 158{ 159 float length = sqrtf(dot4(a, a)); 160 return 1.f/length * a; 161} 162 163__inline 164float4 createEquation(const float4 a, const float4 b, const float4 c) 165{ 166 float4 eqn; 167 float4 ab = b-a; 168 float4 ac = c-a; 169 eqn = normalize3( cross3(ab, ac) ); 170 eqn.w = -dot3F4(eqn,a); 171 return eqn; 172} 173 174/////////////////////////////////////// 175// Matrix3x3 176/////////////////////////////////////// 177 178typedef struct 179{ 180 float4 m_row[3]; 181}Matrix3x3; 182 183__inline 184Matrix3x3 mtZero(); 185 186__inline 187Matrix3x3 mtIdentity(); 188 189__inline 190Matrix3x3 mtTranspose(Matrix3x3 m); 191 192__inline 193Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); 194 195__inline 196float4 mtMul1(Matrix3x3 a, float4 b); 197 198__inline 199float4 mtMul3(float4 a, Matrix3x3 b); 200 201__inline 202Matrix3x3 mtZero() 203{ 204 Matrix3x3 m; 205 m.m_row[0] = (float4)(0.f); 206 m.m_row[1] = (float4)(0.f); 207 m.m_row[2] = (float4)(0.f); 208 return m; 209} 210 211__inline 212Matrix3x3 mtIdentity() 213{ 214 Matrix3x3 m; 215 m.m_row[0] = (float4)(1,0,0,0); 216 m.m_row[1] = (float4)(0,1,0,0); 217 m.m_row[2] = (float4)(0,0,1,0); 218 return m; 219} 220 221__inline 222Matrix3x3 mtTranspose(Matrix3x3 m) 223{ 224 Matrix3x3 out; 225 out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); 226 out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); 227 out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); 228 return out; 229} 230 231__inline 232Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) 233{ 234 Matrix3x3 transB; 235 transB = mtTranspose( b ); 236 Matrix3x3 ans; 237 // why this doesn't run when 0ing in the for{} 238 a.m_row[0].w = 0.f; 239 a.m_row[1].w = 0.f; 240 a.m_row[2].w = 0.f; 241 for(int i=0; i<3; i++) 242 { 243// a.m_row[i].w = 0.f; 244 ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); 245 ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); 246 ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); 247 ans.m_row[i].w = 0.f; 248 } 249 return ans; 250} 251 252__inline 253float4 mtMul1(Matrix3x3 a, float4 b) 254{ 255 float4 ans; 256 ans.x = dot3F4( a.m_row[0], b ); 257 ans.y = dot3F4( a.m_row[1], b ); 258 ans.z = dot3F4( a.m_row[2], b ); 259 ans.w = 0.f; 260 return ans; 261} 262 263__inline 264float4 mtMul3(float4 a, Matrix3x3 b) 265{ 266 float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); 267 float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); 268 float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); 269 270 float4 ans; 271 ans.x = dot3F4( a, colx ); 272 ans.y = dot3F4( a, coly ); 273 ans.z = dot3F4( a, colz ); 274 return ans; 275} 276 277/////////////////////////////////////// 278// Quaternion 279/////////////////////////////////////// 280 281typedef float4 Quaternion; 282 283__inline 284Quaternion qtMul(Quaternion a, Quaternion b); 285 286__inline 287Quaternion qtNormalize(Quaternion in); 288 289__inline 290float4 qtRotate(Quaternion q, float4 vec); 291 292__inline 293Quaternion qtInvert(Quaternion q); 294 295 296 297 298 299__inline 300Quaternion qtMul(Quaternion a, Quaternion b) 301{ 302 Quaternion ans; 303 ans = cross3( a, b ); 304 ans += a.w*b+b.w*a; 305// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); 306 ans.w = a.w*b.w - dot3F4(a, b); 307 return ans; 308} 309 310__inline 311Quaternion qtNormalize(Quaternion in) 312{ 313 return fastNormalize4(in); 314// in /= length( in ); 315// return in; 316} 317__inline 318float4 qtRotate(Quaternion q, float4 vec) 319{ 320 Quaternion qInv = qtInvert( q ); 321 float4 vcpy = vec; 322 vcpy.w = 0.f; 323 float4 out = qtMul(qtMul(q,vcpy),qInv); 324 return out; 325} 326 327__inline 328Quaternion qtInvert(Quaternion q) 329{ 330 return (Quaternion)(-q.xyz, q.w); 331} 332 333__inline 334float4 qtInvRotate(const Quaternion q, float4 vec) 335{ 336 return qtRotate( qtInvert( q ), vec ); 337} 338 339 340 341 342#define WG_SIZE 64 343 344typedef struct 345{ 346 float4 m_pos; 347 Quaternion m_quat; 348 float4 m_linVel; 349 float4 m_angVel; 350 351 u32 m_shapeIdx; 352 float m_invMass; 353 float m_restituitionCoeff; 354 float m_frictionCoeff; 355} Body; 356 357typedef struct 358{ 359 Matrix3x3 m_invInertia; 360 Matrix3x3 m_initInvInertia; 361} Shape; 362 363typedef struct 364{ 365 float4 m_linear; 366 float4 m_worldPos[4]; 367 float4 m_center; 368 float m_jacCoeffInv[4]; 369 float m_b[4]; 370 float m_appliedRambdaDt[4]; 371 372 float m_fJacCoeffInv[2]; 373 float m_fAppliedRambdaDt[2]; 374 375 u32 m_bodyA; 376 u32 m_bodyB; 377 378 int m_batchIdx; 379 u32 m_paddings[1]; 380} Constraint4; 381 382 383 384typedef struct 385{ 386 int m_nConstraints; 387 int m_start; 388 int m_batchIdx; 389 int m_nSplit; 390// int m_paddings[1]; 391} ConstBuffer; 392 393typedef struct 394{ 395 int m_solveFriction; 396 int m_maxBatch; // long batch really kills the performance 397 int m_batchIdx; 398 int m_nSplit; 399// int m_paddings[1]; 400} ConstBufferBatchSolve; 401 402 403 404 405 406typedef struct 407{ 408 int m_valInt0; 409 int m_valInt1; 410 int m_valInt2; 411 int m_valInt3; 412 413 float m_val0; 414 float m_val1; 415 float m_val2; 416 float m_val3; 417} SolverDebugInfo; 418 419 420 421 422// others 423__kernel 424__attribute__((reqd_work_group_size(WG_SIZE,1,1))) 425void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb ) 426{ 427 int nContacts = cb.x; 428 int gIdx = GET_GLOBAL_IDX; 429 430 if( gIdx < nContacts ) 431 { 432 int srcIdx = sortData[gIdx].y; 433 out[gIdx] = in[srcIdx]; 434 } 435} 436 437__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) 438void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts) 439{ 440 int gIdx = GET_GLOBAL_IDX; 441 442 if( gIdx < nContacts ) 443 { 444 int2 sd; 445 sd.x = contactsIn[gIdx].m_childIndexB; 446 sd.y = gIdx; 447 sortDataOut[gIdx] = sd; 448 } 449} 450 451__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) 452void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) 453{ 454 int gIdx = GET_GLOBAL_IDX; 455 456 if( gIdx < nContacts ) 457 { 458 int2 sdIn; 459 sdIn = sortDataInOut[gIdx]; 460 int2 sdOut; 461 sdOut.x = contactsIn[sdIn.y].m_childIndexA; 462 sdOut.y = sdIn.y; 463 sortDataInOut[gIdx] = sdOut; 464 } 465} 466 467__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) 468void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) 469{ 470 int gIdx = GET_GLOBAL_IDX; 471 472 if( gIdx < nContacts ) 473 { 474 int2 sdIn; 475 sdIn = sortDataInOut[gIdx]; 476 int2 sdOut; 477 sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit; 478 sdOut.y = sdIn.y; 479 sortDataInOut[gIdx] = sdOut; 480 } 481} 482 483 484__kernel 485__attribute__((reqd_work_group_size(WG_SIZE,1,1))) 486void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) 487{ 488 int gIdx = GET_GLOBAL_IDX; 489 490 if( gIdx < nContacts ) 491 { 492 int2 sdIn; 493 sdIn = sortDataInOut[gIdx]; 494 int2 sdOut; 495 sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit; 496 sdOut.y = sdIn.y; 497 sortDataInOut[gIdx] = sdOut; 498 } 499} 500 501 502 503 504typedef struct 505{ 506 int m_nContacts; 507 int m_staticIdx; 508 float m_scale; 509 int m_nSplit; 510} ConstBufferSSD; 511 512 513__constant const int gridTable4x4[] = 514{ 515 0,1,17,16, 516 1,2,18,19, 517 17,18,32,3, 518 16,19,3,34 519}; 520 521__constant const int gridTable8x8[] = 522{ 523 0, 2, 3, 16, 17, 18, 19, 1, 524 66, 64, 80, 67, 82, 81, 65, 83, 525 131,144,128,130,147,129,145,146, 526 208,195,194,192,193,211,210,209, 527 21, 22, 23, 5, 4, 6, 7, 20, 528 86, 85, 69, 87, 70, 68, 84, 71, 529 151,133,149,150,135,148,132,134, 530 197,27,214,213,212,199,198,196 531 532}; 533 534 535 536 537#define USE_SPATIAL_BATCHING 1 538#define USE_4x4_GRID 1 539 540__kernel 541__attribute__((reqd_work_group_size(WG_SIZE,1,1))) 542void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, 543int nContacts,float scale,int4 nSplit,int staticIdx) 544 545{ 546 int gIdx = GET_GLOBAL_IDX; 547 548 if( gIdx < nContacts ) 549 { 550 int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; 551 int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; 552 553 int aIdx = abs(aPtrAndSignBit ); 554 int bIdx = abs(bPtrAndSignBit); 555 556 bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); 557 bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx); 558 559#if USE_SPATIAL_BATCHING 560 int idx = (aStatic)? bIdx: aIdx; 561 float4 p = gBodies[idx].m_pos; 562 int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); 563 int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); 564 int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); 565 int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); 566 567#else//USE_SPATIAL_BATCHING 568 #if USE_4x4_GRID 569 int aa = aIdx&3; 570 int bb = bIdx&3; 571 if (aStatic) 572 aa = bb; 573 if (bStatic) 574 bb = aa; 575 576 int gridIndex = aa + bb*4; 577 int newIndex = gridTable4x4[gridIndex]; 578 #else//USE_4x4_GRID 579 int aa = aIdx&7; 580 int bb = bIdx&7; 581 if (aStatic) 582 aa = bb; 583 if (bStatic) 584 bb = aa; 585 586 int gridIndex = aa + bb*8; 587 int newIndex = gridTable8x8[gridIndex]; 588 #endif//USE_4x4_GRID 589#endif//USE_SPATIAL_BATCHING 590 591 592 gSortDataOut[gIdx].x = newIndex; 593 gSortDataOut[gIdx].y = gIdx; 594 } 595 else 596 { 597 gSortDataOut[gIdx].x = 0xffffffff; 598 } 599} 600 601__kernel 602__attribute__((reqd_work_group_size(WG_SIZE,1,1))) 603void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb ) 604{ 605 int gIdx = GET_GLOBAL_IDX; 606 if( gIdx < cb.x ) 607 { 608 gOut[gIdx] = gIn[gIdx]; 609 } 610} 611 612 613 614