1/*
2Copyright (c) 2012 Advanced Micro Devices, Inc.
3
4This software is provided 'as-is', without any express or implied warranty.
5In no event will the authors be held liable for any damages arising from the use of this software.
6Permission is granted to anyone to use this software for any purpose,
7including commercial applications, and to alter it and redistribute it freely,
8subject to the following restrictions:
9
101. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
112. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
123. This notice may not be removed or altered from any source distribution.
13*/
14//Originally written by Takahiro Harada
15
16
17//#pragma OPENCL EXTENSION cl_amd_printf : enable
18#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
19#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
20#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
21#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
22
23
24#ifdef cl_ext_atomic_counters_32
25#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
26#else
27#define counter32_t volatile global int*
28#endif
29
30typedef unsigned int u32;
31typedef unsigned short u16;
32typedef unsigned char u8;
33
34#define GET_GROUP_IDX get_group_id(0)
35#define GET_LOCAL_IDX get_local_id(0)
36#define GET_GLOBAL_IDX get_global_id(0)
37#define GET_GROUP_SIZE get_local_size(0)
38#define GET_NUM_GROUPS get_num_groups(0)
39#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
40#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
41#define AtomInc(x) atom_inc(&(x))
42#define AtomInc1(x, out) out = atom_inc(&(x))
43#define AppendInc(x, out) out = atomic_inc(x)
44#define AtomAdd(x, value) atom_add(&(x), value)
45#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
46#define AtomXhg(x, value) atom_xchg ( &(x), value )
47
48
49#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
50
51#define mymake_float4 (float4)
52//#define make_float2 (float2)
53//#define make_uint4 (uint4)
54//#define make_int4 (int4)
55//#define make_uint2 (uint2)
56//#define make_int2 (int2)
57
58
59#define max2 max
60#define min2 min
61
62
63///////////////////////////////////////
64//	Vector
65///////////////////////////////////////
66
67
68
69
70__inline
71float4 fastNormalize4(float4 v)
72{
73	return fast_normalize(v);
74}
75
76
77
78__inline
79float4 cross3(float4 a, float4 b)
80{
81	return cross(a,b);
82}
83
84__inline
85float dot3F4(float4 a, float4 b)
86{
87	float4 a1 = mymake_float4(a.xyz,0.f);
88	float4 b1 = mymake_float4(b.xyz,0.f);
89	return dot(a1, b1);
90}
91
92
93
94
95__inline
96float4 normalize3(const float4 a)
97{
98	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);
99	return fastNormalize4( n );
100//	float length = sqrtf(dot3F4(a, a));
101//	return 1.f/length * a;
102}
103
104
105
106
107///////////////////////////////////////
108//	Matrix3x3
109///////////////////////////////////////
110
111typedef struct
112{
113	float4 m_row[3];
114}Matrix3x3;
115
116
117
118
119
120
121__inline
122float4 mtMul1(Matrix3x3 a, float4 b);
123
124__inline
125float4 mtMul3(float4 a, Matrix3x3 b);
126
127
128
129
130__inline
131float4 mtMul1(Matrix3x3 a, float4 b)
132{
133	float4 ans;
134	ans.x = dot3F4( a.m_row[0], b );
135	ans.y = dot3F4( a.m_row[1], b );
136	ans.z = dot3F4( a.m_row[2], b );
137	ans.w = 0.f;
138	return ans;
139}
140
141__inline
142float4 mtMul3(float4 a, Matrix3x3 b)
143{
144	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
145	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
146	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
147
148	float4 ans;
149	ans.x = dot3F4( a, colx );
150	ans.y = dot3F4( a, coly );
151	ans.z = dot3F4( a, colz );
152	return ans;
153}
154
155///////////////////////////////////////
156//	Quaternion
157///////////////////////////////////////
158
159typedef float4 Quaternion;
160
161
162
163
164
165
166
167#define WG_SIZE 64
168
169typedef struct
170{
171	float4 m_pos;
172	Quaternion m_quat;
173	float4 m_linVel;
174	float4 m_angVel;
175
176	u32 m_shapeIdx;
177	float m_invMass;
178	float m_restituitionCoeff;
179	float m_frictionCoeff;
180} Body;
181
182typedef struct
183{
184	Matrix3x3 m_invInertia;
185	Matrix3x3 m_initInvInertia;
186} Shape;
187
188typedef struct
189{
190	float4 m_linear;
191	float4 m_worldPos[4];
192	float4 m_center;
193	float m_jacCoeffInv[4];
194	float m_b[4];
195	float m_appliedRambdaDt[4];
196
197	float m_fJacCoeffInv[2];
198	float m_fAppliedRambdaDt[2];
199
200	u32 m_bodyA;
201	u32 m_bodyB;
202
203	int m_batchIdx;
204	u32 m_paddings[1];
205} Constraint4;
206
207
208
209typedef struct
210{
211	int m_nConstraints;
212	int m_start;
213	int m_batchIdx;
214	int m_nSplit;
215//	int m_paddings[1];
216} ConstBuffer;
217
218typedef struct
219{
220	int m_solveFriction;
221	int m_maxBatch;	//	long batch really kills the performance
222	int m_batchIdx;
223	int m_nSplit;
224//	int m_paddings[1];
225} ConstBufferBatchSolve;
226
227void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);
228
229void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
230{
231	*linear = mymake_float4(-n.xyz,0.f);
232	*angular0 = -cross3(r0, n);
233	*angular1 = cross3(r1, n);
234}
235
236float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );
237
238float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
239{
240	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
241}
242
243
244float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
245				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);
246
247float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
248					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
249{
250	//	linear0,1 are normlized
251	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
252	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
253	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
254	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
255	return -1.f/(jmj0+jmj1+jmj2+jmj3);
256}
257
258
259void solveContact(__global Constraint4* cs,
260				  float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
261				  float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);
262
263void solveContact(__global Constraint4* cs,
264			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,
265			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)
266{
267	float minRambdaDt = 0;
268	float maxRambdaDt = FLT_MAX;
269
270	for(int ic=0; ic<4; ic++)
271	{
272		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;
273
274		float4 angular0, angular1, linear;
275		float4 r0 = cs->m_worldPos[ic] - posA;
276		float4 r1 = cs->m_worldPos[ic] - posB;
277		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );
278
279		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1,
280			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];
281		rambdaDt *= cs->m_jacCoeffInv[ic];
282
283		{
284			float prevSum = cs->m_appliedRambdaDt[ic];
285			float updated = prevSum;
286			updated += rambdaDt;
287			updated = max2( updated, minRambdaDt );
288			updated = min2( updated, maxRambdaDt );
289			rambdaDt = updated - prevSum;
290			cs->m_appliedRambdaDt[ic] = updated;
291		}
292
293		float4 linImp0 = invMassA*linear*rambdaDt;
294		float4 linImp1 = invMassB*(-linear)*rambdaDt;
295		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
296		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
297
298		*linVelA += linImp0;
299		*angVelA += angImp0;
300		*linVelB += linImp1;
301		*angVelB += angImp1;
302	}
303}
304
305void btPlaneSpace1 (const float4* n, float4* p, float4* q);
306 void btPlaneSpace1 (const float4* n, float4* p, float4* q)
307{
308  if (fabs(n[0].z) > 0.70710678f) {
309    // choose p in y-z plane
310    float a = n[0].y*n[0].y + n[0].z*n[0].z;
311    float k = 1.f/sqrt(a);
312    p[0].x = 0;
313	p[0].y = -n[0].z*k;
314	p[0].z = n[0].y*k;
315    // set q = n x p
316    q[0].x = a*k;
317	q[0].y = -n[0].x*p[0].z;
318	q[0].z = n[0].x*p[0].y;
319  }
320  else {
321    // choose p in x-y plane
322    float a = n[0].x*n[0].x + n[0].y*n[0].y;
323    float k = 1.f/sqrt(a);
324    p[0].x = -n[0].y*k;
325	p[0].y = n[0].x*k;
326	p[0].z = 0;
327    // set q = n x p
328    q[0].x = -n[0].z*p[0].y;
329	q[0].y = n[0].z*p[0].x;
330	q[0].z = a*k;
331  }
332}
333
334void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);
335void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)
336{
337	//float frictionCoeff = ldsCs[0].m_linear.w;
338	int aIdx = ldsCs[0].m_bodyA;
339	int bIdx = ldsCs[0].m_bodyB;
340
341	float4 posA = gBodies[aIdx].m_pos;
342	float4 linVelA = gBodies[aIdx].m_linVel;
343	float4 angVelA = gBodies[aIdx].m_angVel;
344	float invMassA = gBodies[aIdx].m_invMass;
345	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
346
347	float4 posB = gBodies[bIdx].m_pos;
348	float4 linVelB = gBodies[bIdx].m_linVel;
349	float4 angVelB = gBodies[bIdx].m_angVel;
350	float invMassB = gBodies[bIdx].m_invMass;
351	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
352
353	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,
354			posB, &linVelB, &angVelB, invMassB, invInertiaB );
355
356  if (gBodies[aIdx].m_invMass)
357  {
358		gBodies[aIdx].m_linVel = linVelA;
359		gBodies[aIdx].m_angVel = angVelA;
360	} else
361	{
362		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);
363		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);
364
365	}
366	if (gBodies[bIdx].m_invMass)
367  {
368		gBodies[bIdx].m_linVel = linVelB;
369		gBodies[bIdx].m_angVel = angVelB;
370	} else
371	{
372		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);
373		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);
374
375	}
376
377}
378
379
380
381typedef struct
382{
383	int m_valInt0;
384	int m_valInt1;
385	int m_valInt2;
386	int m_valInt3;
387
388	float m_val0;
389	float m_val1;
390	float m_val2;
391	float m_val3;
392} SolverDebugInfo;
393
394
395
396
397__kernel
398__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
399void BatchSolveKernelContact(__global Body* gBodies,
400                      __global Shape* gShapes,
401                      __global Constraint4* gConstraints,
402                      __global int* gN,
403                      __global int* gOffsets,
404                      __global	int* batchSizes,
405                       int maxBatch1,
406                       int cellBatch,
407                       int4 nSplit
408                      )
409{
410	//__local int ldsBatchIdx[WG_SIZE+1];
411	__local int ldsCurBatch;
412	__local int ldsNextBatch;
413	__local int ldsStart;
414
415	int lIdx = GET_LOCAL_IDX;
416	int wgIdx = GET_GROUP_IDX;
417
418//	int gIdx = GET_GLOBAL_IDX;
419//	debugInfo[gIdx].m_valInt0 = gIdx;
420	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;
421
422
423
424
425	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);
426	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));
427	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);
428	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);
429	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);
430
431	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);
432	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);
433	//int cellIdx = xIdx+yIdx*nSplit;
434
435	if( gN[cellIdx] == 0 )
436		return;
437
438	int maxBatch = batchSizes[cellIdx];
439
440
441	const int start = gOffsets[cellIdx];
442	const int end = start + gN[cellIdx];
443
444
445
446
447	if( lIdx == 0 )
448	{
449		ldsCurBatch = 0;
450		ldsNextBatch = 0;
451		ldsStart = start;
452	}
453
454
455	GROUP_LDS_BARRIER;
456
457	int idx=ldsStart+lIdx;
458	while (ldsCurBatch < maxBatch)
459	{
460		for(; idx<end; )
461		{
462			if (gConstraints[idx].m_batchIdx == ldsCurBatch)
463			{
464					solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );
465
466				 idx+=64;
467			} else
468			{
469				break;
470			}
471		}
472		GROUP_LDS_BARRIER;
473
474		if( lIdx == 0 )
475		{
476			ldsCurBatch++;
477		}
478		GROUP_LDS_BARRIER;
479	}
480
481
482}
483
484
485
486__kernel void solveSingleContactKernel(__global Body* gBodies,
487                      __global Shape* gShapes,
488                      __global Constraint4* gConstraints,
489                       int cellIdx,
490                       int batchOffset,
491                       int numConstraintsInBatch
492                      )
493{
494
495	int index = get_global_id(0);
496	if (index < numConstraintsInBatch)
497	{
498		int idx=batchOffset+index;
499		solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );
500	}
501}
502