/*
Copyright (c) 2012 Advanced Micro Devices, Inc.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada

#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable


#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile global int*
#endif
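//	Without the cl_ext_atomic_counters_32 extension, counter32_t falls back to a plain
//	pointer to global int, so the AppendInc macro below still works via atomic_inc on
//	global memory.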

typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )


#define SELECT_UINT4( b, a, condition ) select( b,a,condition )

#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)


#define max2 max
#define min2 min

///////////////////////////////////////
//	Vector
///////////////////////////////////////
__inline
float fastDiv(float numerator, float denominator)
{
	return native_divide(numerator, denominator);
//	return numerator/denominator;
}

__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
	return native_divide(numerator, denominator);
}

__inline
float fastSqrtf(float f2)
{
	return native_sqrt(f2);
//	return sqrt(f2);
}

__inline
float fastRSqrt(float f2)
{
	return native_rsqrt(f2);
}

__inline
float fastLength4(float4 v)
{
	return fast_length(v);
}

__inline
float4 fastNormalize4(float4 v)
{
	return fast_normalize(v);
}


__inline
float sqrtf(float a)
{
//	return sqrt(a);
	return native_sqrt(a);
}

__inline
float4 cross3(float4 a, float4 b)
{
	return cross(a,b);
}

__inline
float dot3F4(float4 a, float4 b)
{
	float4 a1 = make_float4(a.xyz,0.f);
	float4 b1 = make_float4(b.xyz,0.f);
	return dot(a1, b1);
}

__inline
float length3(const float4 a)
{
	return sqrtf(dot3F4(a,a));
}

__inline
float dot4(const float4 a, const float4 b)
{
	return dot( a, b );
}

//	evaluates a plane equation at a point: dot(eqn.xyz, point) + eqn.w, i.e. the signed height of the point above the plane
__inline
float dot3w1(const float4 point, const float4 eqn)
{
	return dot3F4(point,eqn) + eqn.w;
}

__inline
float4 normalize3(const float4 a)
{
	float4 n = make_float4(a.x, a.y, a.z, 0.f);
	return fastNormalize4( n );
//	float length = sqrtf(dot3F4(a, a));
//	return 1.f/length * a;
}

__inline
float4 normalize4(const float4 a)
{
	float length = sqrtf(dot4(a, a));
	return 1.f/length * a;
}

__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
	float4 eqn;
	float4 ab = b-a;
	float4 ac = c-a;
	eqn = normalize3( cross3(ab, ac) );
	eqn.w = -dot3F4(eqn,a);
	return eqn;
}
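
//	Usage sketch (illustrative only, not part of the original solver): build a plane
//	from a triangle and evaluate a point's signed height above it with the helpers above.
__inline
float exampleSignedHeightAbovePlane(float4 a, float4 b, float4 c, float4 point)
{
	float4 plane = createEquation(a, b, c);	//	(unit normal in xyz, -dot(normal, a) in w)
	return dot3w1(point, plane);			//	dot(normal, point) + plane.w
}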

///////////////////////////////////////
//	Matrix3x3
///////////////////////////////////////

typedef struct
{
	float4 m_row[3];
}Matrix3x3;

__inline
Matrix3x3 mtZero();

__inline
Matrix3x3 mtIdentity();

__inline
Matrix3x3 mtTranspose(Matrix3x3 m);

__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);

__inline
float4 mtMul1(Matrix3x3 a, float4 b);

__inline
float4 mtMul3(float4 a, Matrix3x3 b);

__inline
Matrix3x3 mtZero()
{
	Matrix3x3 m;
	m.m_row[0] = (float4)(0.f);
	m.m_row[1] = (float4)(0.f);
	m.m_row[2] = (float4)(0.f);
	return m;
}

__inline
Matrix3x3 mtIdentity()
{
	Matrix3x3 m;
	m.m_row[0] = (float4)(1,0,0,0);
	m.m_row[1] = (float4)(0,1,0,0);
	m.m_row[2] = (float4)(0,0,1,0);
	return m;
}

__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
	Matrix3x3 out;
	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
	return out;
}

__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
	Matrix3x3 transB;
	transB = mtTranspose( b );
	Matrix3x3 ans;
	//	note: zeroing a.m_row[i].w inside the loop (see the commented-out line below) did not run correctly, so it is done up front instead
	a.m_row[0].w = 0.f;
	a.m_row[1].w = 0.f;
	a.m_row[2].w = 0.f;
	for(int i=0; i<3; i++)
	{
//		a.m_row[i].w = 0.f;
		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
		ans.m_row[i].w = 0.f;
	}
	return ans;
}

__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
	float4 ans;
	ans.x = dot3F4( a.m_row[0], b );
	ans.y = dot3F4( a.m_row[1], b );
	ans.z = dot3F4( a.m_row[2], b );
	ans.w = 0.f;
	return ans;
}

__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);

	float4 ans;
	ans.x = dot3F4( a, colx );
	ans.y = dot3F4( a, coly );
	ans.z = dot3F4( a, colz );
	return ans;
}
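
//	Usage sketch (illustrative only, not part of the original solver): mtMul1 multiplies
//	by the rows of a matrix (M * v), while mtMul3 multiplies by its columns (M^T * v).
//	For an orthonormal rotation matrix that makes mtMul3 the inverse transform of mtMul1.
__inline
float4 exampleRotateAndBack(Matrix3x3 R, float4 v)
{
	float4 rotated = mtMul1(R, v);	//	R * v
	return mtMul3(rotated, R);		//	R^T * (R * v) == v when R is orthonormal
}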

///////////////////////////////////////
//	Quaternion
///////////////////////////////////////

typedef float4 Quaternion;

__inline
Quaternion qtMul(Quaternion a, Quaternion b);

__inline
Quaternion qtNormalize(Quaternion in);

__inline
float4 qtRotate(Quaternion q, float4 vec);

__inline
Quaternion qtInvert(Quaternion q);




__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
	Quaternion ans;
	ans = cross3( a, b );
	ans += a.w*b+b.w*a;
//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
	ans.w = a.w*b.w - dot3F4(a, b);
	return ans;
}

__inline
Quaternion qtNormalize(Quaternion in)
{
	return fastNormalize4(in);
//	in /= length( in );
//	return in;
}

__inline
float4 qtRotate(Quaternion q, float4 vec)
{
	Quaternion qInv = qtInvert( q );
	float4 vcpy = vec;
	vcpy.w = 0.f;
	float4 out = qtMul(qtMul(q,vcpy),qInv);
	return out;
}

__inline
Quaternion qtInvert(Quaternion q)
{
	return (Quaternion)(-q.xyz, q.w);
}

__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
	return qtRotate( qtInvert( q ), vec );
}
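
//	Usage sketch (illustrative only, not part of the original solver): a point stored in
//	body-local coordinates can be mapped to world space with the body's orientation
//	quaternion and position, and mapped back with the inverse rotation.
__inline
float4 exampleLocalToWorld(Quaternion bodyQuat, float4 bodyPos, float4 localPoint)
{
	return bodyPos + qtRotate(bodyQuat, localPoint);
}

__inline
float4 exampleWorldToLocal(Quaternion bodyQuat, float4 bodyPos, float4 worldPoint)
{
	return qtInvRotate(bodyQuat, worldPoint - bodyPos);
}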



#define WG_SIZE 64

typedef struct
{
	float4 m_pos;
	Quaternion m_quat;
	float4 m_linVel;
	float4 m_angVel;

	u32 m_shapeIdx;
	float m_invMass;
	float m_restituitionCoeff;
	float m_frictionCoeff;
} Body;
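
//	Usage sketch (illustrative only, not used by the kernels below): world-space velocity
//	of a point rigidly attached to a body, the quantity a contact solver compares across
//	the two bodies of a contact.
__inline
float4 exampleVelocityAtWorldPoint(Body body, float4 worldPoint)
{
	float4 r = worldPoint - body.m_pos;
	return body.m_linVel + cross3(body.m_angVel, r);
}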

typedef struct
{
	Matrix3x3 m_invInertia;
	Matrix3x3 m_initInvInertia;
} Shape;

typedef struct
{
	float4 m_linear;
	float4 m_worldPos[4];
	float4 m_center;
	float m_jacCoeffInv[4];
	float m_b[4];
	float m_appliedRambdaDt[4];

	float m_fJacCoeffInv[2];
	float m_fAppliedRambdaDt[2];

	u32 m_bodyA;
	u32 m_bodyB;

	int m_batchIdx;
	u32 m_paddings[1];
} Constraint4;



typedef struct
{
	int m_nConstraints;
	int m_start;
	int m_batchIdx;
	int m_nSplit;
//	int m_paddings[1];
} ConstBuffer;

typedef struct
{
	int m_solveFriction;
	int m_maxBatch;	//	a long batch really kills performance
	int m_batchIdx;
	int m_nSplit;
//	int m_paddings[1];
} ConstBufferBatchSolve;




typedef struct
{
	int m_valInt0;
	int m_valInt1;
	int m_valInt2;
	int m_valInt3;

	float m_val0;
	float m_val1;
	float m_val2;
	float m_val3;
} SolverDebugInfo;




//	others
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )
{
	int nContacts = cb.x;
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int srcIdx = sortData[gIdx].y;
		out[gIdx] = in[srcIdx];
	}
}

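//	Determinism sort passes: each of the four kernels below writes a (sort key, original
//	contact index) pair per contact. Run in sequence, presumably with a stable sort applied
//	by the caller between passes, they order contacts by child-shape and body indices so the
//	solver consumes contacts in a deterministic order regardless of how the GPU generated them.
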
__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int2 sd;
		sd.x = contactsIn[gIdx].m_childIndexB;
		sd.y = gIdx;
		sortDataOut[gIdx] = sd;
	}
}

__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int2 sdIn;
		sdIn = sortDataInOut[gIdx];
		int2 sdOut;
		sdOut.x = contactsIn[sdIn.y].m_childIndexA;
		sdOut.y = sdIn.y;
		sortDataInOut[gIdx] = sdOut;
	}
}

__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int2 sdIn;
		sdIn = sortDataInOut[gIdx];
		int2 sdOut;
		sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;
		sdOut.y = sdIn.y;
		sortDataInOut[gIdx] = sdOut;
	}
}


__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int2 sdIn;
		sdIn = sortDataInOut[gIdx];
		int2 sdOut;
		sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;
		sdOut.y = sdIn.y;
		sortDataInOut[gIdx] = sdOut;
	}
}




typedef struct
{
	int m_nContacts;
	int m_staticIdx;
	float m_scale;
	int m_nSplit;
} ConstBufferSSD;


__constant const int gridTable4x4[] =
{
	  0,  1, 17, 16,
	  1,  2, 18, 19,
	 17, 18, 32,  3,
	 16, 19,  3, 34
};

__constant const int gridTable8x8[] =
{
	  0,  2,  3, 16, 17, 18, 19,  1,
	 66, 64, 80, 67, 82, 81, 65, 83,
	131,144,128,130,147,129,145,146,
	208,195,194,192,193,211,210,209,
	 21, 22, 23,  5,  4,  6,  7, 20,
	 86, 85, 69, 87, 70, 68, 84, 71,
	151,133,149,150,135,148,132,134,
	197, 27,214,213,212,199,198,196
};




#define USE_SPATIAL_BATCHING 1
#define USE_4x4_GRID 1

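//	SetSortDataKernel assigns each contact a batching key. With USE_SPATIAL_BATCHING the key
//	is a grid cell index derived from the position of the non-static body: the ((p<0)?1:0) term
//	biases negative coordinates before the truncating cast, and the & (nSplit-1) masks behave
//	as a modulo only if nSplit.x/y/z are powers of two. Without spatial batching the key comes
//	from gridTable4x4/gridTable8x8, indexed by the low bits of the body indices. Entries past
//	nContacts are given the key 0xffffffff so a subsequent sort pushes them to the end, which
//	assumes gSortDataOut is allocated for the full (padded) global work size, not just nContacts.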
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut,
					   int nContacts, float scale, int4 nSplit, int staticIdx)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nContacts )
	{
		int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
		int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;

		int aIdx = abs(aPtrAndSignBit);
		int bIdx = abs(bPtrAndSignBit);

		bool aStatic = (aPtrAndSignBit<0) || (aPtrAndSignBit==staticIdx);
		bool bStatic = (bPtrAndSignBit<0) || (bPtrAndSignBit==staticIdx);

#if USE_SPATIAL_BATCHING
		int idx = (aStatic)? bIdx: aIdx;
		float4 p = gBodies[idx].m_pos;
		int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);
		int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);
		int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);
		int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);

#else//USE_SPATIAL_BATCHING
	#if USE_4x4_GRID
		int aa = aIdx&3;
		int bb = bIdx&3;
		if (aStatic)
			aa = bb;
		if (bStatic)
			bb = aa;

		int gridIndex = aa + bb*4;
		int newIndex = gridTable4x4[gridIndex];
	#else//USE_4x4_GRID
		int aa = aIdx&7;
		int bb = bIdx&7;
		if (aStatic)
			aa = bb;
		if (bStatic)
			bb = aa;

		int gridIndex = aa + bb*8;
		int newIndex = gridTable8x8[gridIndex];
	#endif//USE_4x4_GRID
#endif//USE_SPATIAL_BATCHING

		gSortDataOut[gIdx].x = newIndex;
		gSortDataOut[gIdx].y = gIdx;
	}
	else
	{
		gSortDataOut[gIdx].x = 0xffffffff;
	}
}

__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )
{
	int gIdx = GET_GLOBAL_IDX;
	if( gIdx < cb.x )
	{
		gOut[gIdx] = gIn[gIdx];
	}
}