1 
2 #include "b3GpuJacobiContactSolver.h"
3 #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
4 #include "Bullet3Common/b3AlignedObjectArray.h"
5 #include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"  //b3Int2
6 class b3Vector3;
7 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
8 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
9 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
10 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
11 #include "Bullet3OpenCL/RigidBody/kernels/solverUtils.h"
12 #include "Bullet3Common/b3Logging.h"
13 #include "b3GpuConstraint4.h"
14 #include "Bullet3Common/shared/b3Int2.h"
15 #include "Bullet3Common/shared/b3Int4.h"
16 #define SOLVER_UTILS_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl"
17 
18 struct b3GpuJacobiSolverInternalData
19 {
20 	//btRadixSort32CL*	m_sort32;
21 	//btBoundSearchCL*	m_search;
22 	b3PrefixScanCL* m_scan;
23 
24 	b3OpenCLArray<unsigned int>* m_bodyCount;
25 	b3OpenCLArray<b3Int2>* m_contactConstraintOffsets;
26 	b3OpenCLArray<unsigned int>* m_offsetSplitBodies;
27 
28 	b3OpenCLArray<b3Vector3>* m_deltaLinearVelocities;
29 	b3OpenCLArray<b3Vector3>* m_deltaAngularVelocities;
30 
31 	b3AlignedObjectArray<b3Vector3> m_deltaLinearVelocitiesCPU;
32 	b3AlignedObjectArray<b3Vector3> m_deltaAngularVelocitiesCPU;
33 
34 	b3OpenCLArray<b3GpuConstraint4>* m_contactConstraints;
35 
36 	b3FillCL* m_filler;
37 
38 	cl_kernel m_countBodiesKernel;
39 	cl_kernel m_contactToConstraintSplitKernel;
40 	cl_kernel m_clearVelocitiesKernel;
41 	cl_kernel m_averageVelocitiesKernel;
42 	cl_kernel m_updateBodyVelocitiesKernel;
43 	cl_kernel m_solveContactKernel;
44 	cl_kernel m_solveFrictionKernel;
45 };
46 
b3GpuJacobiContactSolver(cl_context ctx,cl_device_id device,cl_command_queue queue,int pairCapacity)47 b3GpuJacobiContactSolver::b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
48 	: m_context(ctx),
49 	  m_device(device),
50 	  m_queue(queue)
51 {
52 	m_data = new b3GpuJacobiSolverInternalData;
53 	m_data->m_scan = new b3PrefixScanCL(m_context, m_device, m_queue);
54 	m_data->m_bodyCount = new b3OpenCLArray<unsigned int>(m_context, m_queue);
55 	m_data->m_filler = new b3FillCL(m_context, m_device, m_queue);
56 	m_data->m_contactConstraintOffsets = new b3OpenCLArray<b3Int2>(m_context, m_queue);
57 	m_data->m_offsetSplitBodies = new b3OpenCLArray<unsigned int>(m_context, m_queue);
58 	m_data->m_contactConstraints = new b3OpenCLArray<b3GpuConstraint4>(m_context, m_queue);
59 	m_data->m_deltaLinearVelocities = new b3OpenCLArray<b3Vector3>(m_context, m_queue);
60 	m_data->m_deltaAngularVelocities = new b3OpenCLArray<b3Vector3>(m_context, m_queue);
61 
62 	cl_int pErrNum;
63 	const char* additionalMacros = "";
64 	const char* solverUtilsSource = solverUtilsCL;
65 	{
66 		cl_program solverUtilsProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverUtilsSource, &pErrNum, additionalMacros, SOLVER_UTILS_KERNEL_PATH);
67 		b3Assert(solverUtilsProg);
68 		m_data->m_countBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "CountBodiesKernel", &pErrNum, solverUtilsProg, additionalMacros);
69 		b3Assert(m_data->m_countBodiesKernel);
70 
71 		m_data->m_contactToConstraintSplitKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "ContactToConstraintSplitKernel", &pErrNum, solverUtilsProg, additionalMacros);
72 		b3Assert(m_data->m_contactToConstraintSplitKernel);
73 		m_data->m_clearVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "ClearVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros);
74 		b3Assert(m_data->m_clearVelocitiesKernel);
75 
76 		m_data->m_averageVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "AverageVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros);
77 		b3Assert(m_data->m_averageVelocitiesKernel);
78 
79 		m_data->m_updateBodyVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "UpdateBodyVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros);
80 		b3Assert(m_data->m_updateBodyVelocitiesKernel);
81 
82 		m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "SolveContactJacobiKernel", &pErrNum, solverUtilsProg, additionalMacros);
83 		b3Assert(m_data->m_solveContactKernel);
84 
85 		m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "SolveFrictionJacobiKernel", &pErrNum, solverUtilsProg, additionalMacros);
86 		b3Assert(m_data->m_solveFrictionKernel);
87 	}
88 }
89 
~b3GpuJacobiContactSolver()90 b3GpuJacobiContactSolver::~b3GpuJacobiContactSolver()
91 {
92 	clReleaseKernel(m_data->m_solveContactKernel);
93 	clReleaseKernel(m_data->m_solveFrictionKernel);
94 	clReleaseKernel(m_data->m_countBodiesKernel);
95 	clReleaseKernel(m_data->m_contactToConstraintSplitKernel);
96 	clReleaseKernel(m_data->m_averageVelocitiesKernel);
97 	clReleaseKernel(m_data->m_updateBodyVelocitiesKernel);
98 	clReleaseKernel(m_data->m_clearVelocitiesKernel);
99 
100 	delete m_data->m_deltaLinearVelocities;
101 	delete m_data->m_deltaAngularVelocities;
102 	delete m_data->m_contactConstraints;
103 	delete m_data->m_offsetSplitBodies;
104 	delete m_data->m_contactConstraintOffsets;
105 	delete m_data->m_bodyCount;
106 	delete m_data->m_filler;
107 	delete m_data->m_scan;
108 	delete m_data;
109 }
110 
make_float4(float v)111 b3Vector3 make_float4(float v)
112 {
113 	return b3MakeVector3(v, v, v);
114 }
115 
make_float4(float x,float y,float z,float w)116 b3Vector4 make_float4(float x, float y, float z, float w)
117 {
118 	return b3MakeVector4(x, y, z, w);
119 }
120 
calcRelVel(const b3Vector3 & l0,const b3Vector3 & l1,const b3Vector3 & a0,const b3Vector3 & a1,const b3Vector3 & linVel0,const b3Vector3 & angVel0,const b3Vector3 & linVel1,const b3Vector3 & angVel1)121 static inline float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1,
122 							   const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1)
123 {
124 	return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1);
125 }
126 
setLinearAndAngular(const b3Vector3 & n,const b3Vector3 & r0,const b3Vector3 & r1,b3Vector3 & linear,b3Vector3 & angular0,b3Vector3 & angular1)127 static inline void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,
128 									   b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1)
129 {
130 	linear = n;
131 	angular0 = b3Cross(r0, n);
132 	angular1 = -b3Cross(r1, n);
133 }
134 
solveContact(b3GpuConstraint4 & cs,const b3Vector3 & posA,const b3Vector3 & linVelARO,const b3Vector3 & angVelARO,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,const b3Vector3 & linVelBRO,const b3Vector3 & angVelBRO,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4],b3Vector3 & dLinVelA,b3Vector3 & dAngVelA,b3Vector3 & dLinVelB,b3Vector3 & dAngVelB)135 static __inline void solveContact(b3GpuConstraint4& cs,
136 								  const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA,
137 								  const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB,
138 								  float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB)
139 {
140 	for (int ic = 0; ic < 4; ic++)
141 	{
142 		//	dont necessary because this makes change to 0
143 		if (cs.m_jacCoeffInv[ic] == 0.f) continue;
144 
145 		{
146 			b3Vector3 angular0, angular1, linear;
147 			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
148 			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
149 			setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, linear, angular0, angular1);
150 
151 			float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
152 										linVelARO + dLinVelA, angVelARO + dAngVelA, linVelBRO + dLinVelB, angVelBRO + dAngVelB) +
153 							 cs.m_b[ic];
154 			rambdaDt *= cs.m_jacCoeffInv[ic];
155 
156 			{
157 				float prevSum = cs.m_appliedRambdaDt[ic];
158 				float updated = prevSum;
159 				updated += rambdaDt;
160 				updated = b3Max(updated, minRambdaDt[ic]);
161 				updated = b3Min(updated, maxRambdaDt[ic]);
162 				rambdaDt = updated - prevSum;
163 				cs.m_appliedRambdaDt[ic] = updated;
164 			}
165 
166 			b3Vector3 linImp0 = invMassA * linear * rambdaDt;
167 			b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
168 			b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
169 			b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
170 #ifdef _WIN32
171 			b3Assert(_finite(linImp0.getX()));
172 			b3Assert(_finite(linImp1.getX()));
173 #endif
174 
175 			if (invMassA)
176 			{
177 				dLinVelA += linImp0;
178 				dAngVelA += angImp0;
179 			}
180 			if (invMassB)
181 			{
182 				dLinVelB += linImp1;
183 				dAngVelB += angImp1;
184 			}
185 		}
186 	}
187 }
188 
solveContact3(b3GpuConstraint4 * cs,b3Vector3 * posAPtr,b3Vector3 * linVelA,b3Vector3 * angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,b3Vector3 * posBPtr,b3Vector3 * linVelB,b3Vector3 * angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,b3Vector3 * dLinVelA,b3Vector3 * dAngVelA,b3Vector3 * dLinVelB,b3Vector3 * dAngVelB)189 void solveContact3(b3GpuConstraint4* cs,
190 				   b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
191 				   b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
192 				   b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB)
193 {
194 	float minRambdaDt = 0;
195 	float maxRambdaDt = FLT_MAX;
196 
197 	for (int ic = 0; ic < 4; ic++)
198 	{
199 		if (cs->m_jacCoeffInv[ic] == 0.f) continue;
200 
201 		b3Vector3 angular0, angular1, linear;
202 		b3Vector3 r0 = cs->m_worldPos[ic] - *posAPtr;
203 		b3Vector3 r1 = cs->m_worldPos[ic] - *posBPtr;
204 		setLinearAndAngular(cs->m_linear, r0, r1, linear, angular0, angular1);
205 
206 		float rambdaDt = calcRelVel(cs->m_linear, -cs->m_linear, angular0, angular1,
207 									*linVelA + *dLinVelA, *angVelA + *dAngVelA, *linVelB + *dLinVelB, *angVelB + *dAngVelB) +
208 						 cs->m_b[ic];
209 		rambdaDt *= cs->m_jacCoeffInv[ic];
210 
211 		{
212 			float prevSum = cs->m_appliedRambdaDt[ic];
213 			float updated = prevSum;
214 			updated += rambdaDt;
215 			updated = b3Max(updated, minRambdaDt);
216 			updated = b3Min(updated, maxRambdaDt);
217 			rambdaDt = updated - prevSum;
218 			cs->m_appliedRambdaDt[ic] = updated;
219 		}
220 
221 		b3Vector3 linImp0 = invMassA * linear * rambdaDt;
222 		b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
223 		b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
224 		b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
225 
226 		if (invMassA)
227 		{
228 			*dLinVelA += linImp0;
229 			*dAngVelA += angImp0;
230 		}
231 		if (invMassB)
232 		{
233 			*dLinVelB += linImp1;
234 			*dAngVelB += angImp1;
235 		}
236 	}
237 }
238 
solveFriction(b3GpuConstraint4 & cs,const b3Vector3 & posA,const b3Vector3 & linVelARO,const b3Vector3 & angVelARO,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,const b3Vector3 & linVelBRO,const b3Vector3 & angVelBRO,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4],b3Vector3 & dLinVelA,b3Vector3 & dAngVelA,b3Vector3 & dLinVelB,b3Vector3 & dAngVelB)239 static inline void solveFriction(b3GpuConstraint4& cs,
240 								 const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA,
241 								 const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB,
242 								 float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB)
243 {
244 	b3Vector3 linVelA = linVelARO + dLinVelA;
245 	b3Vector3 linVelB = linVelBRO + dLinVelB;
246 	b3Vector3 angVelA = angVelARO + dAngVelA;
247 	b3Vector3 angVelB = angVelBRO + dAngVelB;
248 
249 	if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
250 	const b3Vector3& center = (const b3Vector3&)cs.m_center;
251 
252 	b3Vector3 n = -(const b3Vector3&)cs.m_linear;
253 
254 	b3Vector3 tangent[2];
255 #if 1
256 	b3PlaneSpace1(n, tangent[0], tangent[1]);
257 #else
258 	b3Vector3 r = cs.m_worldPos[0] - center;
259 	tangent[0] = cross3(n, r);
260 	tangent[1] = cross3(tangent[0], n);
261 	tangent[0] = normalize3(tangent[0]);
262 	tangent[1] = normalize3(tangent[1]);
263 #endif
264 
265 	b3Vector3 angular0, angular1, linear;
266 	b3Vector3 r0 = center - posA;
267 	b3Vector3 r1 = center - posB;
268 	for (int i = 0; i < 2; i++)
269 	{
270 		setLinearAndAngular(tangent[i], r0, r1, linear, angular0, angular1);
271 		float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
272 									linVelA, angVelA, linVelB, angVelB);
273 		rambdaDt *= cs.m_fJacCoeffInv[i];
274 
275 		{
276 			float prevSum = cs.m_fAppliedRambdaDt[i];
277 			float updated = prevSum;
278 			updated += rambdaDt;
279 			updated = b3Max(updated, minRambdaDt[i]);
280 			updated = b3Min(updated, maxRambdaDt[i]);
281 			rambdaDt = updated - prevSum;
282 			cs.m_fAppliedRambdaDt[i] = updated;
283 		}
284 
285 		b3Vector3 linImp0 = invMassA * linear * rambdaDt;
286 		b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
287 		b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
288 		b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
289 #ifdef _WIN32
290 		b3Assert(_finite(linImp0.getX()));
291 		b3Assert(_finite(linImp1.getX()));
292 #endif
293 		if (invMassA)
294 		{
295 			dLinVelA += linImp0;
296 			dAngVelA += angImp0;
297 		}
298 		if (invMassB)
299 		{
300 			dLinVelB += linImp1;
301 			dAngVelB += angImp1;
302 		}
303 	}
304 
305 	{  //	angular damping for point constraint
306 		b3Vector3 ab = (posB - posA).normalized();
307 		b3Vector3 ac = (center - posA).normalized();
308 		if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
309 		{
310 			float angNA = b3Dot(n, angVelA);
311 			float angNB = b3Dot(n, angVelB);
312 
313 			if (invMassA)
314 				dAngVelA -= (angNA * 0.1f) * n;
315 			if (invMassB)
316 				dAngVelB -= (angNB * 0.1f) * n;
317 		}
318 	}
319 }
320 
calcJacCoeff(const b3Vector3 & linear0,const b3Vector3 & linear1,const b3Vector3 & angular0,const b3Vector3 & angular1,float invMass0,const b3Matrix3x3 * invInertia0,float invMass1,const b3Matrix3x3 * invInertia1,float countA,float countB)321 float calcJacCoeff(const b3Vector3& linear0, const b3Vector3& linear1, const b3Vector3& angular0, const b3Vector3& angular1,
322 				   float invMass0, const b3Matrix3x3* invInertia0, float invMass1, const b3Matrix3x3* invInertia1, float countA, float countB)
323 {
324 	//	linear0,1 are normlized
325 	float jmj0 = invMass0;  //dot3F4(linear0, linear0)*invMass0;
326 
327 	float jmj1 = b3Dot(mtMul3(angular0, *invInertia0), angular0);
328 	float jmj2 = invMass1;  //dot3F4(linear1, linear1)*invMass1;
329 	float jmj3 = b3Dot(mtMul3(angular1, *invInertia1), angular1);
330 	return -1.f / ((jmj0 + jmj1) * countA + (jmj2 + jmj3) * countB);
331 	//	return -1.f/((jmj0+jmj1)+(jmj2+jmj3));
332 }
333 
setConstraint4(const b3Vector3 & posA,const b3Vector3 & linVelA,const b3Vector3 & angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,const b3Vector3 & linVelB,const b3Vector3 & angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,b3Contact4 * src,float dt,float positionDrift,float positionConstraintCoeff,float countA,float countB,b3GpuConstraint4 * dstC)334 void setConstraint4(const b3Vector3& posA, const b3Vector3& linVelA, const b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
335 					const b3Vector3& posB, const b3Vector3& linVelB, const b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
336 					b3Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, float countA, float countB,
337 					b3GpuConstraint4* dstC)
338 {
339 	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
340 	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
341 
342 	float dtInv = 1.f / dt;
343 	for (int ic = 0; ic < 4; ic++)
344 	{
345 		dstC->m_appliedRambdaDt[ic] = 0.f;
346 	}
347 	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;
348 
349 	dstC->m_linear = src->m_worldNormalOnB;
350 	dstC->m_linear[3] = 0.7f;  //src->getFrictionCoeff() );
351 	for (int ic = 0; ic < 4; ic++)
352 	{
353 		b3Vector3 r0 = src->m_worldPosB[ic] - posA;
354 		b3Vector3 r1 = src->m_worldPosB[ic] - posB;
355 
356 		if (ic >= src->m_worldNormalOnB[3])  //npoints
357 		{
358 			dstC->m_jacCoeffInv[ic] = 0.f;
359 			continue;
360 		}
361 
362 		float relVelN;
363 		{
364 			b3Vector3 linear, angular0, angular1;
365 			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, linear, angular0, angular1);
366 
367 			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
368 												   invMassA, &invInertiaA, invMassB, &invInertiaB, countA, countB);
369 
370 			relVelN = calcRelVel(linear, -linear, angular0, angular1,
371 								 linVelA, angVelA, linVelB, angVelB);
372 
373 			float e = 0.f;  //src->getRestituitionCoeff();
374 			if (relVelN * relVelN < 0.004f)
375 			{
376 				e = 0.f;
377 			}
378 
379 			dstC->m_b[ic] = e * relVelN;
380 			//float penetration = src->m_worldPos[ic].w;
381 			dstC->m_b[ic] += (src->m_worldPosB[ic][3] + positionDrift) * positionConstraintCoeff * dtInv;
382 			dstC->m_appliedRambdaDt[ic] = 0.f;
383 		}
384 	}
385 
386 	if (src->m_worldNormalOnB[3] > 0)  //npoints
387 	{                                  //	prepare friction
388 		b3Vector3 center = make_float4(0.f);
389 		for (int i = 0; i < src->m_worldNormalOnB[3]; i++)
390 			center += src->m_worldPosB[i];
391 		center /= (float)src->m_worldNormalOnB[3];
392 
393 		b3Vector3 tangent[2];
394 		b3PlaneSpace1(src->m_worldNormalOnB, tangent[0], tangent[1]);
395 
396 		b3Vector3 r[2];
397 		r[0] = center - posA;
398 		r[1] = center - posB;
399 
400 		for (int i = 0; i < 2; i++)
401 		{
402 			b3Vector3 linear, angular0, angular1;
403 			setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1);
404 
405 			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
406 												   invMassA, &invInertiaA, invMassB, &invInertiaB, countA, countB);
407 			dstC->m_fAppliedRambdaDt[i] = 0.f;
408 		}
409 		dstC->m_center = center;
410 	}
411 
412 	for (int i = 0; i < 4; i++)
413 	{
414 		if (i < src->m_worldNormalOnB[3])
415 		{
416 			dstC->m_worldPos[i] = src->m_worldPosB[i];
417 		}
418 		else
419 		{
420 			dstC->m_worldPos[i] = make_float4(0.f);
421 		}
422 	}
423 }
424 
ContactToConstraintKernel(b3Contact4 * gContact,b3RigidBodyData * gBodies,b3InertiaData * gShapes,b3GpuConstraint4 * gConstraintOut,int nContacts,float dt,float positionDrift,float positionConstraintCoeff,int gIdx,b3AlignedObjectArray<unsigned int> & bodyCount)425 void ContactToConstraintKernel(b3Contact4* gContact, b3RigidBodyData* gBodies, b3InertiaData* gShapes, b3GpuConstraint4* gConstraintOut, int nContacts,
426 							   float dt,
427 							   float positionDrift,
428 							   float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bodyCount)
429 {
430 	//int gIdx = 0;//GET_GLOBAL_IDX;
431 
432 	if (gIdx < nContacts)
433 	{
434 		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
435 		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
436 
437 		b3Vector3 posA = gBodies[aIdx].m_pos;
438 		b3Vector3 linVelA = gBodies[aIdx].m_linVel;
439 		b3Vector3 angVelA = gBodies[aIdx].m_angVel;
440 		float invMassA = gBodies[aIdx].m_invMass;
441 		b3Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertiaWorld;  //.m_invInertia;
442 
443 		b3Vector3 posB = gBodies[bIdx].m_pos;
444 		b3Vector3 linVelB = gBodies[bIdx].m_linVel;
445 		b3Vector3 angVelB = gBodies[bIdx].m_angVel;
446 		float invMassB = gBodies[bIdx].m_invMass;
447 		b3Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertiaWorld;  //m_invInertia;
448 
449 		b3GpuConstraint4 cs;
450 		float countA = invMassA ? (float)(bodyCount[aIdx]) : 1;
451 		float countB = invMassB ? (float)(bodyCount[bIdx]) : 1;
452 		setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
453 					   &gContact[gIdx], dt, positionDrift, positionConstraintCoeff, countA, countB,
454 					   &cs);
455 
456 		cs.m_batchIdx = gContact[gIdx].m_batchIdx;
457 
458 		gConstraintOut[gIdx] = cs;
459 	}
460 }
461 
solveGroupHost(b3RigidBodyData * bodies,b3InertiaData * inertias,int numBodies,b3Contact4 * manifoldPtr,int numManifolds,const b3JacobiSolverInfo & solverInfo)462 void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo)
463 {
464 	B3_PROFILE("b3GpuJacobiContactSolver::solveGroup");
465 
466 	b3AlignedObjectArray<unsigned int> bodyCount;
467 	bodyCount.resize(numBodies);
468 	for (int i = 0; i < numBodies; i++)
469 		bodyCount[i] = 0;
470 
471 	b3AlignedObjectArray<b3Int2> contactConstraintOffsets;
472 	contactConstraintOffsets.resize(numManifolds);
473 
474 	for (int i = 0; i < numManifolds; i++)
475 	{
476 		int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;
477 		int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;
478 
479 		bool isFixedA = (pa < 0) || (pa == solverInfo.m_fixedBodyIndex);
480 		bool isFixedB = (pb < 0) || (pb == solverInfo.m_fixedBodyIndex);
481 
482 		int bodyIndexA = manifoldPtr[i].getBodyA();
483 		int bodyIndexB = manifoldPtr[i].getBodyB();
484 
485 		if (!isFixedA)
486 		{
487 			contactConstraintOffsets[i].x = bodyCount[bodyIndexA];
488 			bodyCount[bodyIndexA]++;
489 		}
490 		if (!isFixedB)
491 		{
492 			contactConstraintOffsets[i].y = bodyCount[bodyIndexB];
493 			bodyCount[bodyIndexB]++;
494 		}
495 	}
496 
497 	b3AlignedObjectArray<unsigned int> offsetSplitBodies;
498 	offsetSplitBodies.resize(numBodies);
499 	unsigned int totalNumSplitBodies;
500 	m_data->m_scan->executeHost(bodyCount, offsetSplitBodies, numBodies, &totalNumSplitBodies);
501 	int numlastBody = bodyCount[numBodies - 1];
502 	totalNumSplitBodies += numlastBody;
503 	printf("totalNumSplitBodies = %d\n", totalNumSplitBodies);
504 
505 	b3AlignedObjectArray<b3GpuConstraint4> contactConstraints;
506 	contactConstraints.resize(numManifolds);
507 
508 	for (int i = 0; i < numManifolds; i++)
509 	{
510 		ContactToConstraintKernel(&manifoldPtr[0], bodies, inertias, &contactConstraints[0], numManifolds,
511 								  solverInfo.m_deltaTime,
512 								  solverInfo.m_positionDrift,
513 								  solverInfo.m_positionConstraintCoeff,
514 								  i, bodyCount);
515 	}
516 	int maxIter = solverInfo.m_numIterations;
517 
518 	b3AlignedObjectArray<b3Vector3> deltaLinearVelocities;
519 	b3AlignedObjectArray<b3Vector3> deltaAngularVelocities;
520 	deltaLinearVelocities.resize(totalNumSplitBodies);
521 	deltaAngularVelocities.resize(totalNumSplitBodies);
522 	for (unsigned int i = 0; i < totalNumSplitBodies; i++)
523 	{
524 		deltaLinearVelocities[i].setZero();
525 		deltaAngularVelocities[i].setZero();
526 	}
527 
528 	for (int iter = 0; iter < maxIter; iter++)
529 	{
530 		int i = 0;
531 		for (i = 0; i < numManifolds; i++)
532 		{
533 			//float frictionCoeff = contactConstraints[i].getFrictionCoeff();
534 			int aIdx = (int)contactConstraints[i].m_bodyA;
535 			int bIdx = (int)contactConstraints[i].m_bodyB;
536 			b3RigidBodyData& bodyA = bodies[aIdx];
537 			b3RigidBodyData& bodyB = bodies[bIdx];
538 
539 			b3Vector3 zero = b3MakeVector3(0, 0, 0);
540 
541 			b3Vector3* dlvAPtr = &zero;
542 			b3Vector3* davAPtr = &zero;
543 			b3Vector3* dlvBPtr = &zero;
544 			b3Vector3* davBPtr = &zero;
545 
546 			if (bodyA.m_invMass)
547 			{
548 				int bodyOffsetA = offsetSplitBodies[aIdx];
549 				int constraintOffsetA = contactConstraintOffsets[i].x;
550 				int splitIndexA = bodyOffsetA + constraintOffsetA;
551 				dlvAPtr = &deltaLinearVelocities[splitIndexA];
552 				davAPtr = &deltaAngularVelocities[splitIndexA];
553 			}
554 
555 			if (bodyB.m_invMass)
556 			{
557 				int bodyOffsetB = offsetSplitBodies[bIdx];
558 				int constraintOffsetB = contactConstraintOffsets[i].y;
559 				int splitIndexB = bodyOffsetB + constraintOffsetB;
560 				dlvBPtr = &deltaLinearVelocities[splitIndexB];
561 				davBPtr = &deltaAngularVelocities[splitIndexB];
562 			}
563 
564 			{
565 				float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
566 				float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
567 
568 				solveContact(contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld,
569 							 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld,
570 							 maxRambdaDt, minRambdaDt, *dlvAPtr, *davAPtr, *dlvBPtr, *davBPtr);
571 			}
572 		}
573 
574 		//easy
575 		for (int i = 0; i < numBodies; i++)
576 		{
577 			if (bodies[i].m_invMass)
578 			{
579 				int bodyOffset = offsetSplitBodies[i];
580 				int count = bodyCount[i];
581 				float factor = 1.f / float(count);
582 				b3Vector3 averageLinVel;
583 				averageLinVel.setZero();
584 				b3Vector3 averageAngVel;
585 				averageAngVel.setZero();
586 				for (int j = 0; j < count; j++)
587 				{
588 					averageLinVel += deltaLinearVelocities[bodyOffset + j] * factor;
589 					averageAngVel += deltaAngularVelocities[bodyOffset + j] * factor;
590 				}
591 				for (int j = 0; j < count; j++)
592 				{
593 					deltaLinearVelocities[bodyOffset + j] = averageLinVel;
594 					deltaAngularVelocities[bodyOffset + j] = averageAngVel;
595 				}
596 			}
597 		}
598 	}
599 	for (int iter = 0; iter < maxIter; iter++)
600 	{
601 		//int i=0;
602 
603 		//solve friction
604 
605 		for (int i = 0; i < numManifolds; i++)
606 		{
607 			float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
608 			float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
609 
610 			float sum = 0;
611 			for (int j = 0; j < 4; j++)
612 			{
613 				sum += contactConstraints[i].m_appliedRambdaDt[j];
614 			}
615 			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
616 			int aIdx = (int)contactConstraints[i].m_bodyA;
617 			int bIdx = (int)contactConstraints[i].m_bodyB;
618 			b3RigidBodyData& bodyA = bodies[aIdx];
619 			b3RigidBodyData& bodyB = bodies[bIdx];
620 
621 			b3Vector3 zero = b3MakeVector3(0, 0, 0);
622 
623 			b3Vector3* dlvAPtr = &zero;
624 			b3Vector3* davAPtr = &zero;
625 			b3Vector3* dlvBPtr = &zero;
626 			b3Vector3* davBPtr = &zero;
627 
628 			if (bodyA.m_invMass)
629 			{
630 				int bodyOffsetA = offsetSplitBodies[aIdx];
631 				int constraintOffsetA = contactConstraintOffsets[i].x;
632 				int splitIndexA = bodyOffsetA + constraintOffsetA;
633 				dlvAPtr = &deltaLinearVelocities[splitIndexA];
634 				davAPtr = &deltaAngularVelocities[splitIndexA];
635 			}
636 
637 			if (bodyB.m_invMass)
638 			{
639 				int bodyOffsetB = offsetSplitBodies[bIdx];
640 				int constraintOffsetB = contactConstraintOffsets[i].y;
641 				int splitIndexB = bodyOffsetB + constraintOffsetB;
642 				dlvBPtr = &deltaLinearVelocities[splitIndexB];
643 				davBPtr = &deltaAngularVelocities[splitIndexB];
644 			}
645 
646 			for (int j = 0; j < 4; j++)
647 			{
648 				maxRambdaDt[j] = frictionCoeff * sum;
649 				minRambdaDt[j] = -maxRambdaDt[j];
650 			}
651 
652 			solveFriction(contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld,
653 						  (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld,
654 						  maxRambdaDt, minRambdaDt, *dlvAPtr, *davAPtr, *dlvBPtr, *davBPtr);
655 		}
656 
657 		//easy
658 		for (int i = 0; i < numBodies; i++)
659 		{
660 			if (bodies[i].m_invMass)
661 			{
662 				int bodyOffset = offsetSplitBodies[i];
663 				int count = bodyCount[i];
664 				float factor = 1.f / float(count);
665 				b3Vector3 averageLinVel;
666 				averageLinVel.setZero();
667 				b3Vector3 averageAngVel;
668 				averageAngVel.setZero();
669 				for (int j = 0; j < count; j++)
670 				{
671 					averageLinVel += deltaLinearVelocities[bodyOffset + j] * factor;
672 					averageAngVel += deltaAngularVelocities[bodyOffset + j] * factor;
673 				}
674 				for (int j = 0; j < count; j++)
675 				{
676 					deltaLinearVelocities[bodyOffset + j] = averageLinVel;
677 					deltaAngularVelocities[bodyOffset + j] = averageAngVel;
678 				}
679 			}
680 		}
681 	}
682 
683 	//easy
684 	for (int i = 0; i < numBodies; i++)
685 	{
686 		if (bodies[i].m_invMass)
687 		{
688 			int bodyOffset = offsetSplitBodies[i];
689 			int count = bodyCount[i];
690 			if (count)
691 			{
692 				bodies[i].m_linVel += deltaLinearVelocities[bodyOffset];
693 				bodies[i].m_angVel += deltaAngularVelocities[bodyOffset];
694 			}
695 		}
696 	}
697 }
698 
solveContacts(int numBodies,cl_mem bodyBuf,cl_mem inertiaBuf,int numContacts,cl_mem contactBuf,const struct b3Config & config,int static0Index)699 void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index)
700 //
701 //
702 //void  b3GpuJacobiContactSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* bodies,b3OpenCLArray<b3InertiaData>* inertias,b3OpenCLArray<b3Contact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo)
703 {
704 	b3JacobiSolverInfo solverInfo;
705 	solverInfo.m_fixedBodyIndex = static0Index;
706 
707 	B3_PROFILE("b3GpuJacobiContactSolver::solveGroup");
708 
709 	//int numBodies = bodies->size();
710 	int numManifolds = numContacts;  //manifoldPtr->size();
711 
712 	{
713 		B3_PROFILE("resize");
714 		m_data->m_bodyCount->resize(numBodies);
715 	}
716 
717 	unsigned int val = 0;
718 	b3Int2 val2;
719 	val2.x = 0;
720 	val2.y = 0;
721 
722 	{
723 		B3_PROFILE("m_filler");
724 		m_data->m_contactConstraintOffsets->resize(numManifolds);
725 		m_data->m_filler->execute(*m_data->m_bodyCount, val, numBodies);
726 
727 		m_data->m_filler->execute(*m_data->m_contactConstraintOffsets, val2, numManifolds);
728 	}
729 
730 	{
731 		B3_PROFILE("m_countBodiesKernel");
732 		b3LauncherCL launcher(this->m_queue, m_data->m_countBodiesKernel, "m_countBodiesKernel");
733 		launcher.setBuffer(contactBuf);  //manifoldPtr->getBufferCL());
734 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
735 		launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
736 		launcher.setConst(numManifolds);
737 		launcher.setConst(solverInfo.m_fixedBodyIndex);
738 		launcher.launch1D(numManifolds);
739 	}
740 	unsigned int totalNumSplitBodies = 0;
741 	{
742 		B3_PROFILE("m_scan->execute");
743 
744 		m_data->m_offsetSplitBodies->resize(numBodies);
745 		m_data->m_scan->execute(*m_data->m_bodyCount, *m_data->m_offsetSplitBodies, numBodies, &totalNumSplitBodies);
746 		totalNumSplitBodies += m_data->m_bodyCount->at(numBodies - 1);
747 	}
748 
749 	{
750 		B3_PROFILE("m_data->m_contactConstraints->resize");
751 		//int numContacts = manifoldPtr->size();
752 		m_data->m_contactConstraints->resize(numContacts);
753 	}
754 
755 	{
756 		B3_PROFILE("contactToConstraintSplitKernel");
757 		b3LauncherCL launcher(m_queue, m_data->m_contactToConstraintSplitKernel, "m_contactToConstraintSplitKernel");
758 		launcher.setBuffer(contactBuf);
759 		launcher.setBuffer(bodyBuf);
760 		launcher.setBuffer(inertiaBuf);
761 		launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
762 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
763 		launcher.setConst(numContacts);
764 		launcher.setConst(solverInfo.m_deltaTime);
765 		launcher.setConst(solverInfo.m_positionDrift);
766 		launcher.setConst(solverInfo.m_positionConstraintCoeff);
767 		launcher.launch1D(numContacts, 64);
768 	}
769 
770 	{
771 		B3_PROFILE("m_data->m_deltaLinearVelocities->resize");
772 		m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies);
773 		m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies);
774 	}
775 
776 	{
777 		B3_PROFILE("m_clearVelocitiesKernel");
778 		b3LauncherCL launch(m_queue, m_data->m_clearVelocitiesKernel, "m_clearVelocitiesKernel");
779 		launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
780 		launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
781 		launch.setConst(totalNumSplitBodies);
782 		launch.launch1D(totalNumSplitBodies);
783 		clFinish(m_queue);
784 	}
785 
786 	int maxIter = solverInfo.m_numIterations;
787 
788 	for (int iter = 0; iter < maxIter; iter++)
789 	{
790 		{
791 			B3_PROFILE("m_solveContactKernel");
792 			b3LauncherCL launcher(m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
793 			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
794 			launcher.setBuffer(bodyBuf);
795 			launcher.setBuffer(inertiaBuf);
796 			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
797 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
798 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
799 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
800 			launcher.setConst(solverInfo.m_deltaTime);
801 			launcher.setConst(solverInfo.m_positionDrift);
802 			launcher.setConst(solverInfo.m_positionConstraintCoeff);
803 			launcher.setConst(solverInfo.m_fixedBodyIndex);
804 			launcher.setConst(numManifolds);
805 
806 			launcher.launch1D(numManifolds);
807 			clFinish(m_queue);
808 		}
809 
810 		{
811 			B3_PROFILE("average velocities");
812 			b3LauncherCL launcher(m_queue, m_data->m_averageVelocitiesKernel, "m_averageVelocitiesKernel");
813 			launcher.setBuffer(bodyBuf);
814 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
815 			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
816 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
817 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
818 			launcher.setConst(numBodies);
819 			launcher.launch1D(numBodies);
820 			clFinish(m_queue);
821 		}
822 
823 		{
824 			B3_PROFILE("m_solveFrictionKernel");
825 			b3LauncherCL launcher(m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
826 			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
827 			launcher.setBuffer(bodyBuf);
828 			launcher.setBuffer(inertiaBuf);
829 			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
830 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
831 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
832 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
833 			launcher.setConst(solverInfo.m_deltaTime);
834 			launcher.setConst(solverInfo.m_positionDrift);
835 			launcher.setConst(solverInfo.m_positionConstraintCoeff);
836 			launcher.setConst(solverInfo.m_fixedBodyIndex);
837 			launcher.setConst(numManifolds);
838 
839 			launcher.launch1D(numManifolds);
840 			clFinish(m_queue);
841 		}
842 
843 		{
844 			B3_PROFILE("average velocities");
845 			b3LauncherCL launcher(m_queue, m_data->m_averageVelocitiesKernel, "m_averageVelocitiesKernel");
846 			launcher.setBuffer(bodyBuf);
847 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
848 			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
849 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
850 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
851 			launcher.setConst(numBodies);
852 			launcher.launch1D(numBodies);
853 			clFinish(m_queue);
854 		}
855 	}
856 
857 	{
858 		B3_PROFILE("update body velocities");
859 		b3LauncherCL launcher(m_queue, m_data->m_updateBodyVelocitiesKernel, "m_updateBodyVelocitiesKernel");
860 		launcher.setBuffer(bodyBuf);
861 		launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
862 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
863 		launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
864 		launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
865 		launcher.setConst(numBodies);
866 		launcher.launch1D(numBodies);
867 		clFinish(m_queue);
868 	}
869 }
870 
871 #if 0
872 
873 void  b3GpuJacobiContactSolver::solveGroupMixed(b3OpenCLArray<b3RigidBodyData>* bodiesGPU,b3OpenCLArray<b3InertiaData>* inertiasGPU,b3OpenCLArray<b3Contact4>* manifoldPtrGPU,const btJacobiSolverInfo& solverInfo)
874 {
875 
876 	b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
877 	bodiesGPU->copyToHost(bodiesCPU);
878 	b3AlignedObjectArray<b3InertiaData> inertiasCPU;
879 	inertiasGPU->copyToHost(inertiasCPU);
880 	b3AlignedObjectArray<b3Contact4> manifoldPtrCPU;
881 	manifoldPtrGPU->copyToHost(manifoldPtrCPU);
882 
883 	int numBodiesCPU = bodiesGPU->size();
884 	int numManifoldsCPU = manifoldPtrGPU->size();
885 	B3_PROFILE("b3GpuJacobiContactSolver::solveGroupMixed");
886 
887 	b3AlignedObjectArray<unsigned int> bodyCount;
888 	bodyCount.resize(numBodiesCPU);
889 	for (int i=0;i<numBodiesCPU;i++)
890 		bodyCount[i] = 0;
891 
892 	b3AlignedObjectArray<b3Int2> contactConstraintOffsets;
893 	contactConstraintOffsets.resize(numManifoldsCPU);
894 
895 
896 	for (int i=0;i<numManifoldsCPU;i++)
897 	{
898 		int pa = manifoldPtrCPU[i].m_bodyAPtrAndSignBit;
899 		int pb = manifoldPtrCPU[i].m_bodyBPtrAndSignBit;
900 
901 		bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex);
902 		bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex);
903 
904 		int bodyIndexA = manifoldPtrCPU[i].getBodyA();
905 		int bodyIndexB = manifoldPtrCPU[i].getBodyB();
906 
907 		if (!isFixedA)
908 		{
909 			contactConstraintOffsets[i].x = bodyCount[bodyIndexA];
910 			bodyCount[bodyIndexA]++;
911 		}
912 		if (!isFixedB)
913 		{
914 			contactConstraintOffsets[i].y = bodyCount[bodyIndexB];
915 			bodyCount[bodyIndexB]++;
916 		}
917 	}
918 
919 	b3AlignedObjectArray<unsigned int> offsetSplitBodies;
920 	offsetSplitBodies.resize(numBodiesCPU);
921 	unsigned int totalNumSplitBodiesCPU;
922 	m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodiesCPU,&totalNumSplitBodiesCPU);
923 	int numlastBody = bodyCount[numBodiesCPU-1];
924 	totalNumSplitBodiesCPU += numlastBody;
925 
926 		int numBodies = bodiesGPU->size();
927 	int numManifolds = manifoldPtrGPU->size();
928 
929 	m_data->m_bodyCount->resize(numBodies);
930 
931 	unsigned int val=0;
932 	b3Int2 val2;
933 	val2.x=0;
934 	val2.y=0;
935 
936 	 {
937 		B3_PROFILE("m_filler");
938 		m_data->m_contactConstraintOffsets->resize(numManifolds);
939 		m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies);
940 
941 
942 		m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds);
943 	}
944 
945 	{
946 		B3_PROFILE("m_countBodiesKernel");
947 		b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel);
948 		launcher.setBuffer(manifoldPtrGPU->getBufferCL());
949 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
950 		launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
951 		launcher.setConst(numManifolds);
952 		launcher.setConst(solverInfo.m_fixedBodyIndex);
953 		launcher.launch1D(numManifolds);
954 	}
955 
956 	unsigned int totalNumSplitBodies=0;
957 	m_data->m_offsetSplitBodies->resize(numBodies);
958 	m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies);
959 	totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1);
960 
961 	if (totalNumSplitBodies != totalNumSplitBodiesCPU)
962 	{
963 		printf("error in totalNumSplitBodies!\n");
964 	}
965 
966 	int numContacts = manifoldPtrGPU->size();
967 	m_data->m_contactConstraints->resize(numContacts);
968 
969 
970 	{
971 		B3_PROFILE("contactToConstraintSplitKernel");
972 		b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel);
973 		launcher.setBuffer(manifoldPtrGPU->getBufferCL());
974 		launcher.setBuffer(bodiesGPU->getBufferCL());
975 		launcher.setBuffer(inertiasGPU->getBufferCL());
976 		launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
977 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
978         launcher.setConst(numContacts);
979 		launcher.setConst(solverInfo.m_deltaTime);
980 		launcher.setConst(solverInfo.m_positionDrift);
981 		launcher.setConst(solverInfo.m_positionConstraintCoeff);
982 		launcher.launch1D( numContacts, 64 );
983 		clFinish(m_queue);
984 	}
985 
986 
987 
988 	b3AlignedObjectArray<b3GpuConstraint4> contactConstraints;
989 	contactConstraints.resize(numManifoldsCPU);
990 
991 	for (int i=0;i<numManifoldsCPU;i++)
992 	{
993 		ContactToConstraintKernel(&manifoldPtrCPU[0],&bodiesCPU[0],&inertiasCPU[0],&contactConstraints[0],numManifoldsCPU,
994 			solverInfo.m_deltaTime,
995 			solverInfo.m_positionDrift,
996 			solverInfo.m_positionConstraintCoeff,
997 			i, bodyCount);
998 	}
999 	int maxIter = solverInfo.m_numIterations;
1000 
1001 
1002 	b3AlignedObjectArray<b3Vector3> deltaLinearVelocities;
1003 	b3AlignedObjectArray<b3Vector3> deltaAngularVelocities;
1004 	deltaLinearVelocities.resize(totalNumSplitBodiesCPU);
1005 	deltaAngularVelocities.resize(totalNumSplitBodiesCPU);
1006 	for (int i=0;i<totalNumSplitBodiesCPU;i++)
1007 	{
1008 		deltaLinearVelocities[i].setZero();
1009 		deltaAngularVelocities[i].setZero();
1010 	}
1011 
1012 	m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies);
1013 	m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies);
1014 
1015 
1016 
1017 	{
1018 		B3_PROFILE("m_clearVelocitiesKernel");
1019 		b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel);
1020 		launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1021 		launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1022 		launch.setConst(totalNumSplitBodies);
1023 		launch.launch1D(totalNumSplitBodies);
1024 	}
1025 
1026 
1027 		///!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1028 
1029 
1030 	m_data->m_contactConstraints->copyToHost(contactConstraints);
1031 	m_data->m_offsetSplitBodies->copyToHost(offsetSplitBodies);
1032 	m_data->m_contactConstraintOffsets->copyToHost(contactConstraintOffsets);
1033 	m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities);
1034 	m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities);
1035 
1036 	for (int iter = 0;iter<maxIter;iter++)
1037 	{
1038 
1039 				{
1040 			B3_PROFILE("m_solveContactKernel");
1041 			b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel );
1042 			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
1043 			launcher.setBuffer(bodiesGPU->getBufferCL());
1044 			launcher.setBuffer(inertiasGPU->getBufferCL());
1045 			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
1046 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
1047 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1048 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1049 			launcher.setConst(solverInfo.m_deltaTime);
1050 			launcher.setConst(solverInfo.m_positionDrift);
1051 			launcher.setConst(solverInfo.m_positionConstraintCoeff);
1052 			launcher.setConst(solverInfo.m_fixedBodyIndex);
1053 			launcher.setConst(numManifolds);
1054 
1055 			launcher.launch1D(numManifolds);
1056 			clFinish(m_queue);
1057 		}
1058 
1059 
1060 		int i=0;
1061 		for( i=0; i<numManifoldsCPU; i++)
1062 		{
1063 
1064 			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
1065 			int aIdx = (int)contactConstraints[i].m_bodyA;
1066 			int bIdx = (int)contactConstraints[i].m_bodyB;
1067 			b3RigidBodyData& bodyA = bodiesCPU[aIdx];
1068 			b3RigidBodyData& bodyB = bodiesCPU[bIdx];
1069 
1070 			b3Vector3 zero(0,0,0);
1071 
1072 			b3Vector3* dlvAPtr=&zero;
1073 			b3Vector3* davAPtr=&zero;
1074 			b3Vector3* dlvBPtr=&zero;
1075 			b3Vector3* davBPtr=&zero;
1076 
1077 			if (bodyA.m_invMass)
1078 			{
1079 				int bodyOffsetA = offsetSplitBodies[aIdx];
1080 				int constraintOffsetA = contactConstraintOffsets[i].x;
1081 				int splitIndexA = bodyOffsetA+constraintOffsetA;
1082 				dlvAPtr = &deltaLinearVelocities[splitIndexA];
1083 				davAPtr = &deltaAngularVelocities[splitIndexA];
1084 			}
1085 
1086 			if (bodyB.m_invMass)
1087 			{
1088 				int bodyOffsetB = offsetSplitBodies[bIdx];
1089 				int constraintOffsetB = contactConstraintOffsets[i].y;
1090 				int splitIndexB= bodyOffsetB+constraintOffsetB;
1091 				dlvBPtr =&deltaLinearVelocities[splitIndexB];
1092 				davBPtr = &deltaAngularVelocities[splitIndexB];
1093 			}
1094 
1095 
1096 
1097 			{
1098 				float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
1099 				float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
1100 
1101 				solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertiasCPU[aIdx].m_invInertiaWorld,
1102 					(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld,
1103 					maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr		);
1104 
1105 
1106 			}
1107 		}
1108 
1109 
1110 		{
1111 			B3_PROFILE("average velocities");
1112 			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel);
1113 			launcher.setBuffer(bodiesGPU->getBufferCL());
1114 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
1115 			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
1116 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1117 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1118 			launcher.setConst(numBodies);
1119 			launcher.launch1D(numBodies);
1120 			clFinish(m_queue);
1121 		}
1122 
1123 		//easy
1124 		for (int i=0;i<numBodiesCPU;i++)
1125 		{
1126 			if (bodiesCPU[i].m_invMass)
1127 			{
1128 				int bodyOffset = offsetSplitBodies[i];
1129 				int count = bodyCount[i];
1130 				float factor = 1.f/float(count);
1131 				b3Vector3 averageLinVel;
1132 				averageLinVel.setZero();
1133 				b3Vector3 averageAngVel;
1134 				averageAngVel.setZero();
1135 				for (int j=0;j<count;j++)
1136 				{
1137 					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
1138 					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
1139 				}
1140 				for (int j=0;j<count;j++)
1141 				{
1142 					deltaLinearVelocities[bodyOffset+j] = averageLinVel;
1143 					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
1144 				}
1145 			}
1146 		}
1147 //	m_data->m_deltaAngularVelocities->copyFromHost(deltaAngularVelocities);
1148 	//m_data->m_deltaLinearVelocities->copyFromHost(deltaLinearVelocities);
1149 	m_data->m_deltaAngularVelocities->copyToHost(deltaAngularVelocities);
1150 	m_data->m_deltaLinearVelocities->copyToHost(deltaLinearVelocities);
1151 
1152 #if 0
1153 
1154 		{
1155 			B3_PROFILE("m_solveFrictionKernel");
1156 			b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel);
1157 			launcher.setBuffer(m_data->m_contactConstraints->getBufferCL());
1158 			launcher.setBuffer(bodiesGPU->getBufferCL());
1159 			launcher.setBuffer(inertiasGPU->getBufferCL());
1160 			launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL());
1161 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
1162 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1163 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1164 			launcher.setConst(solverInfo.m_deltaTime);
1165 			launcher.setConst(solverInfo.m_positionDrift);
1166 			launcher.setConst(solverInfo.m_positionConstraintCoeff);
1167 			launcher.setConst(solverInfo.m_fixedBodyIndex);
1168 			launcher.setConst(numManifolds);
1169 
1170 			launcher.launch1D(numManifolds);
1171 			clFinish(m_queue);
1172 		}
1173 
1174 		//solve friction
1175 
1176 		for(int i=0; i<numManifoldsCPU; i++)
1177 		{
1178 			float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
1179 			float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
1180 
1181 			float sum = 0;
1182 			for(int j=0; j<4; j++)
1183 			{
1184 				sum +=contactConstraints[i].m_appliedRambdaDt[j];
1185 			}
1186 			float frictionCoeff = contactConstraints[i].getFrictionCoeff();
1187 			int aIdx = (int)contactConstraints[i].m_bodyA;
1188 			int bIdx = (int)contactConstraints[i].m_bodyB;
1189 			b3RigidBodyData& bodyA = bodiesCPU[aIdx];
1190 			b3RigidBodyData& bodyB = bodiesCPU[bIdx];
1191 
1192 			b3Vector3 zero(0,0,0);
1193 
1194 			b3Vector3* dlvAPtr=&zero;
1195 			b3Vector3* davAPtr=&zero;
1196 			b3Vector3* dlvBPtr=&zero;
1197 			b3Vector3* davBPtr=&zero;
1198 
1199 			if (bodyA.m_invMass)
1200 			{
1201 				int bodyOffsetA = offsetSplitBodies[aIdx];
1202 				int constraintOffsetA = contactConstraintOffsets[i].x;
1203 				int splitIndexA = bodyOffsetA+constraintOffsetA;
1204 				dlvAPtr = &deltaLinearVelocities[splitIndexA];
1205 				davAPtr = &deltaAngularVelocities[splitIndexA];
1206 			}
1207 
1208 			if (bodyB.m_invMass)
1209 			{
1210 				int bodyOffsetB = offsetSplitBodies[bIdx];
1211 				int constraintOffsetB = contactConstraintOffsets[i].y;
1212 				int splitIndexB= bodyOffsetB+constraintOffsetB;
1213 				dlvBPtr =&deltaLinearVelocities[splitIndexB];
1214 				davBPtr = &deltaAngularVelocities[splitIndexB];
1215 			}
1216 
1217 			for(int j=0; j<4; j++)
1218 			{
1219 				maxRambdaDt[j] = frictionCoeff*sum;
1220 				minRambdaDt[j] = -maxRambdaDt[j];
1221 			}
1222 
1223 			solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertiasCPU[aIdx].m_invInertiaWorld,
1224 				(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertiasCPU[bIdx].m_invInertiaWorld,
1225 				maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr);
1226 
1227 		}
1228 
1229 		{
1230 			B3_PROFILE("average velocities");
1231 			b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel);
1232 			launcher.setBuffer(bodiesGPU->getBufferCL());
1233 			launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
1234 			launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
1235 			launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1236 			launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1237 			launcher.setConst(numBodies);
1238 			launcher.launch1D(numBodies);
1239 			clFinish(m_queue);
1240 		}
1241 
1242 		//easy
1243 		for (int i=0;i<numBodiesCPU;i++)
1244 		{
1245 			if (bodiesCPU[i].m_invMass)
1246 			{
1247 				int bodyOffset = offsetSplitBodies[i];
1248 				int count = bodyCount[i];
1249 				float factor = 1.f/float(count);
1250 				b3Vector3 averageLinVel;
1251 				averageLinVel.setZero();
1252 				b3Vector3 averageAngVel;
1253 				averageAngVel.setZero();
1254 				for (int j=0;j<count;j++)
1255 				{
1256 					averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;
1257 					averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;
1258 				}
1259 				for (int j=0;j<count;j++)
1260 				{
1261 					deltaLinearVelocities[bodyOffset+j] = averageLinVel;
1262 					deltaAngularVelocities[bodyOffset+j] = averageAngVel;
1263 				}
1264 			}
1265 		}
1266 
1267 #endif
1268 
1269 	}
1270 
1271 	{
1272 		B3_PROFILE("update body velocities");
1273 		b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel);
1274 		launcher.setBuffer(bodiesGPU->getBufferCL());
1275 		launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL());
1276 		launcher.setBuffer(m_data->m_bodyCount->getBufferCL());
1277 		launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL());
1278 		launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL());
1279 		launcher.setConst(numBodies);
1280 		launcher.launch1D(numBodies);
1281 		clFinish(m_queue);
1282 	}
1283 
1284 
1285 	//easy
1286 	for (int i=0;i<numBodiesCPU;i++)
1287 	{
1288 		if (bodiesCPU[i].m_invMass)
1289 		{
1290 			int bodyOffset = offsetSplitBodies[i];
1291 			int count = bodyCount[i];
1292 			if (count)
1293 			{
1294 				bodiesCPU[i].m_linVel += deltaLinearVelocities[bodyOffset];
1295 				bodiesCPU[i].m_angVel += deltaAngularVelocities[bodyOffset];
1296 			}
1297 		}
1298 	}
1299 
1300 
1301 //	bodiesGPU->copyFromHost(bodiesCPU);
1302 
1303 
1304 }
1305 #endif
1306