1 /*
2 Copyright (c) 2012 Advanced Micro Devices, Inc.
3 
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9 
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 //Originally written by Takahiro Harada
15 
16 #include "b3Solver.h"
17 
///useNewBatchingKernel selects a rewritten batching kernel that uses just a single thread of the warp, for experiments.
///NOTE(review): the flag is not read in the part of the file visible here - confirm where it is consumed.
bool useNewBatchingKernel = true;
///NOTE(review): presumably switches the contact-to-constraint conversion to a CPU path; it is not read in the part of the file visible here - confirm.
bool gConvertConstraintOnCpu = false;
21 
// Source-tree paths of the OpenCL kernel files. Each one is handed to
// b3OpenCLUtils::compileCLProgramFromString as its last argument by the b3Solver
// constructor, next to the matching embedded kernel string.
// NOTE(review): presumably only a name used for caching/diagnostics, since the kernel
// source itself is passed as a string - confirm against b3OpenCLUtils.
#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
28 
29 #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
30 
31 #include "kernels/solverSetup.h"
32 #include "kernels/solverSetup2.h"
33 
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36 
37 #include "kernels/batchingKernels.h"
38 #include "kernels/batchingKernelsNew.h"
39 
40 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
41 #include "Bullet3Common/b3Vector3.h"
42 
/// Plain record of sixteen integer slots followed by four float slots.
/// In the code visible here it is only touched by the DEBUG_ME sections of
/// b3Solver::solveContactConstraint, which allocate one record per work item.
/// The member order is kept exactly as declared so the memory layout is unchanged.
struct SolverDebugInfo
{
	// Integer slots 0..15, four per row.
	int m_valInt0, m_valInt1, m_valInt2, m_valInt3;
	int m_valInt4, m_valInt5, m_valInt6, m_valInt7;
	int m_valInt8, m_valInt9, m_valInt10, m_valInt11;
	int m_valInt12, m_valInt13, m_valInt14, m_valInt15;

	// Float slots 0..3.
	float m_val0, m_val1, m_val2, m_val3;
};
70 
71 class SolverDeviceInl
72 {
73 public:
74 	struct ParallelSolveData
75 	{
76 		b3OpenCLArray<unsigned int>* m_numConstraints;
77 		b3OpenCLArray<unsigned int>* m_offsets;
78 	};
79 };
80 
b3Solver(cl_context ctx,cl_device_id device,cl_command_queue queue,int pairCapacity)81 b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
82 	: m_context(ctx),
83 	  m_device(device),
84 	  m_queue(queue),
85 	  m_batchSizes(ctx, queue),
86 	  m_nIterations(4)
87 {
88 	m_sort32 = new b3RadixSort32CL(ctx, device, queue);
89 	m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
90 	m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
91 
92 	const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
93 
94 	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
95 	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
96 
97 	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
98 	m_numConstraints->resize(B3_SOLVER_N_CELLS);
99 
100 	m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
101 	m_offsets->resize(B3_SOLVER_N_CELLS);
102 	const char* additionalMacros = "";
103 	//	const char* srcFileNameForCaching="";
104 
105 	cl_int pErrNum;
106 	const char* batchKernelSource = batchingKernelsCL;
107 	const char* batchKernelNewSource = batchingKernelsNewCL;
108 
109 	const char* solverSetupSource = solverSetupCL;
110 	const char* solverSetup2Source = solverSetup2CL;
111 	const char* solveContactSource = solveContactCL;
112 	const char* solveFrictionSource = solveFrictionCL;
113 
114 	{
115 		cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
116 		b3Assert(solveContactProg);
117 
118 		cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
119 		b3Assert(solveFrictionProg);
120 
121 		cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
122 		b3Assert(solverSetup2Prog);
123 
124 		cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
125 		b3Assert(solverSetupProg);
126 
127 		m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
128 		b3Assert(m_solveFrictionKernel);
129 
130 		m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
131 		b3Assert(m_solveContactKernel);
132 
133 		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
134 		b3Assert(m_contactToConstraintKernel);
135 
136 		m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
137 		b3Assert(m_setSortDataKernel);
138 
139 		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
140 		b3Assert(m_reorderContactKernel);
141 
142 		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
143 		b3Assert(m_copyConstraintKernel);
144 	}
145 
146 	{
147 		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
148 		//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
149 		b3Assert(batchingProg);
150 
151 		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
152 		b3Assert(m_batchingKernel);
153 	}
154 	{
155 		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
156 		b3Assert(batchingNewProg);
157 
158 		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
159 		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
160 		b3Assert(m_batchingKernelNew);
161 	}
162 }
163 
~b3Solver()164 b3Solver::~b3Solver()
165 {
166 	delete m_offsets;
167 	delete m_numConstraints;
168 	delete m_sortDataBuffer;
169 	delete m_contactBuffer2;
170 
171 	delete m_sort32;
172 	delete m_scan;
173 	delete m_search;
174 
175 	clReleaseKernel(m_batchingKernel);
176 	clReleaseKernel(m_batchingKernelNew);
177 
178 	clReleaseKernel(m_solveContactKernel);
179 	clReleaseKernel(m_solveFrictionKernel);
180 
181 	clReleaseKernel(m_contactToConstraintKernel);
182 	clReleaseKernel(m_setSortDataKernel);
183 	clReleaseKernel(m_reorderContactKernel);
184 	clReleaseKernel(m_copyConstraintKernel);
185 }
186 
187 template <bool JACOBI>
solveContact(b3GpuConstraint4 & cs,const b3Vector3 & posA,b3Vector3 & linVelA,b3Vector3 & angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,b3Vector3 & linVelB,b3Vector3 & angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4])188 static __inline void solveContact(b3GpuConstraint4& cs,
189 								  const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
190 								  const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
191 								  float maxRambdaDt[4], float minRambdaDt[4])
192 {
193 	b3Vector3 dLinVelA;
194 	dLinVelA.setZero();
195 	b3Vector3 dAngVelA;
196 	dAngVelA.setZero();
197 	b3Vector3 dLinVelB;
198 	dLinVelB.setZero();
199 	b3Vector3 dAngVelB;
200 	dAngVelB.setZero();
201 
202 	for (int ic = 0; ic < 4; ic++)
203 	{
204 		//	dont necessary because this makes change to 0
205 		if (cs.m_jacCoeffInv[ic] == 0.f) continue;
206 
207 		{
208 			b3Vector3 angular0, angular1, linear;
209 			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
210 			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
211 			setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
212 
213 			float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
214 										linVelA, angVelA, linVelB, angVelB) +
215 							 cs.m_b[ic];
216 			rambdaDt *= cs.m_jacCoeffInv[ic];
217 
218 			{
219 				float prevSum = cs.m_appliedRambdaDt[ic];
220 				float updated = prevSum;
221 				updated += rambdaDt;
222 				updated = b3Max(updated, minRambdaDt[ic]);
223 				updated = b3Min(updated, maxRambdaDt[ic]);
224 				rambdaDt = updated - prevSum;
225 				cs.m_appliedRambdaDt[ic] = updated;
226 			}
227 
228 			b3Vector3 linImp0 = invMassA * linear * rambdaDt;
229 			b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
230 			b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
231 			b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
232 #ifdef _WIN32
233 			b3Assert(_finite(linImp0.getX()));
234 			b3Assert(_finite(linImp1.getX()));
235 #endif
236 			if (JACOBI)
237 			{
238 				dLinVelA += linImp0;
239 				dAngVelA += angImp0;
240 				dLinVelB += linImp1;
241 				dAngVelB += angImp1;
242 			}
243 			else
244 			{
245 				linVelA += linImp0;
246 				angVelA += angImp0;
247 				linVelB += linImp1;
248 				angVelB += angImp1;
249 			}
250 		}
251 	}
252 
253 	if (JACOBI)
254 	{
255 		linVelA += dLinVelA;
256 		angVelA += dAngVelA;
257 		linVelB += dLinVelB;
258 		angVelB += dAngVelB;
259 	}
260 }
261 
solveFriction(b3GpuConstraint4 & cs,const b3Vector3 & posA,b3Vector3 & linVelA,b3Vector3 & angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,b3Vector3 & linVelB,b3Vector3 & angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4])262 static __inline void solveFriction(b3GpuConstraint4& cs,
263 								   const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
264 								   const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
265 								   float maxRambdaDt[4], float minRambdaDt[4])
266 {
267 	if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
268 	const b3Vector3& center = (const b3Vector3&)cs.m_center;
269 
270 	b3Vector3 n = -(const b3Vector3&)cs.m_linear;
271 
272 	b3Vector3 tangent[2];
273 #if 1
274 	b3PlaneSpace1(n, tangent[0], tangent[1]);
275 #else
276 	b3Vector3 r = cs.m_worldPos[0] - center;
277 	tangent[0] = cross3(n, r);
278 	tangent[1] = cross3(tangent[0], n);
279 	tangent[0] = normalize3(tangent[0]);
280 	tangent[1] = normalize3(tangent[1]);
281 #endif
282 
283 	b3Vector3 angular0, angular1, linear;
284 	b3Vector3 r0 = center - posA;
285 	b3Vector3 r1 = center - posB;
286 	for (int i = 0; i < 2; i++)
287 	{
288 		setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
289 		float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
290 									linVelA, angVelA, linVelB, angVelB);
291 		rambdaDt *= cs.m_fJacCoeffInv[i];
292 
293 		{
294 			float prevSum = cs.m_fAppliedRambdaDt[i];
295 			float updated = prevSum;
296 			updated += rambdaDt;
297 			updated = b3Max(updated, minRambdaDt[i]);
298 			updated = b3Min(updated, maxRambdaDt[i]);
299 			rambdaDt = updated - prevSum;
300 			cs.m_fAppliedRambdaDt[i] = updated;
301 		}
302 
303 		b3Vector3 linImp0 = invMassA * linear * rambdaDt;
304 		b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
305 		b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
306 		b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
307 #ifdef _WIN32
308 		b3Assert(_finite(linImp0.getX()));
309 		b3Assert(_finite(linImp1.getX()));
310 #endif
311 		linVelA += linImp0;
312 		angVelA += angImp0;
313 		linVelB += linImp1;
314 		angVelB += angImp1;
315 	}
316 
317 	{  //	angular damping for point constraint
318 		b3Vector3 ab = (posB - posA).normalized();
319 		b3Vector3 ac = (center - posA).normalized();
320 		if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
321 		{
322 			float angNA = b3Dot(n, angVelA);
323 			float angNB = b3Dot(n, angVelB);
324 
325 			angVelA -= (angNA * 0.1f) * n;
326 			angVelB -= (angNB * 0.1f) * n;
327 		}
328 	}
329 }
330 /*
331  b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
332 	b3AlignedObjectArray<b3InertiaData>& m_shapes;
333 	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
334 	b3AlignedObjectArray<int>* m_batchSizes;
335 	int m_cellIndex;
336 	int m_curWgidx;
337 	int m_start;
338 	int m_nConstraints;
339 	bool m_solveFriction;
340 	int m_maxNumBatches;
341  */
342 
343 struct SolveTask  // : public ThreadPool::Task
344 {
SolveTaskSolveTask345 	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
346 			  int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
347 		: m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
348 	{
349 	}
350 
getTypeSolveTask351 	unsigned short int getType() { return 0; }
352 
runSolveTask353 	void run(int tIdx)
354 	{
355 		int offset = 0;
356 		for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
357 		{
358 			int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
359 			if (!numInBatch)
360 				break;
361 
362 			for (int jj = 0; jj < numInBatch; jj++)
363 			{
364 				int i = m_start + offset + jj;
365 				int batchId = m_constraints[i].m_batchIdx;
366 				b3Assert(batchId == ii);
367 				float frictionCoeff = m_constraints[i].getFrictionCoeff();
368 				int aIdx = (int)m_constraints[i].m_bodyA;
369 				int bIdx = (int)m_constraints[i].m_bodyB;
370 				//				int localBatch = m_constraints[i].m_batchIdx;
371 				b3RigidBodyData& bodyA = m_bodies[aIdx];
372 				b3RigidBodyData& bodyB = m_bodies[bIdx];
373 
374 				if (!m_solveFriction)
375 				{
376 					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
377 					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
378 
379 					solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
380 										(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
381 										maxRambdaDt, minRambdaDt);
382 				}
383 				else
384 				{
385 					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
386 					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
387 					float sum = 0;
388 					for (int j = 0; j < 4; j++)
389 					{
390 						sum += m_constraints[i].m_appliedRambdaDt[j];
391 					}
392 					frictionCoeff = 0.7f;
393 					for (int j = 0; j < 4; j++)
394 					{
395 						maxRambdaDt[j] = frictionCoeff * sum;
396 						minRambdaDt[j] = -maxRambdaDt[j];
397 					}
398 					solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
399 								  (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
400 								  maxRambdaDt, minRambdaDt);
401 				}
402 			}
403 			offset += numInBatch;
404 		}
405 		/*		for (int bb=0;bb<m_maxNumBatches;bb++)
406 		{
407 			//for(int ic=m_nConstraints-1; ic>=0; ic--)
408 			for(int ic=0; ic<m_nConstraints; ic++)
409 			{
410 
411 				int i = m_start + ic;
412 				if (m_constraints[i].m_batchIdx != bb)
413 					continue;
414 
415 				float frictionCoeff = m_constraints[i].getFrictionCoeff();
416 				int aIdx = (int)m_constraints[i].m_bodyA;
417 				int bIdx = (int)m_constraints[i].m_bodyB;
418 				int localBatch = m_constraints[i].m_batchIdx;
419 				b3RigidBodyData& bodyA = m_bodies[aIdx];
420 				b3RigidBodyData& bodyB = m_bodies[bIdx];
421 
422 				if( !m_solveFriction )
423 				{
424 					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
425 					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
426 
427 					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
428 							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
429 						maxRambdaDt, minRambdaDt );
430 				}
431 				else
432 				{
433 					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
434 					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
435 					float sum = 0;
436 					for(int j=0; j<4; j++)
437 					{
438 						sum +=m_constraints[i].m_appliedRambdaDt[j];
439 					}
440 					frictionCoeff = 0.7f;
441 					for(int j=0; j<4; j++)
442 					{
443 						maxRambdaDt[j] = frictionCoeff*sum;
444 						minRambdaDt[j] = -maxRambdaDt[j];
445 					}
446 					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
447 						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
448 						maxRambdaDt, minRambdaDt );
449 
450 				}
451 			}
452 		}
453 		*/
454 	}
455 
456 	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
457 	b3AlignedObjectArray<b3InertiaData>& m_shapes;
458 	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
459 	b3AlignedObjectArray<int>* m_batchSizes;
460 	int m_cellIndex;
461 	int m_curWgidx;
462 	int m_start;
463 	int m_nConstraints;
464 	bool m_solveFriction;
465 	int m_maxNumBatches;
466 };
467 
solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData> * bodyBuf,b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,b3AlignedObjectArray<int> * batchSizes)468 void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
469 										  b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
470 {
471 #if 0
472 	{
473 		int nSplitX = B3_SOLVER_N_SPLIT_X;
474 		int nSplitY = B3_SOLVER_N_SPLIT_Y;
475 		int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
476 		for (int z=0;z<4;z++)
477 		{
478 			for (int y=0;y<4;y++)
479 			{
480 				for (int x=0;x<4;x++)
481 				{
482 					int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
483 				//	printf("newIndex=%d\n",newIndex);
484 
485 					int zIdx = newIndex/(nSplitX*nSplitY);
486 					int remain = newIndex%(nSplitX*nSplitY);
487 					int yIdx = remain/nSplitX;
488 					int xIdx = remain%nSplitX;
489 				//	printf("newIndex=%d\n",newIndex);
490 				}
491 			}
492 		}
493 
494 		//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
495 		for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
496 		{
497 			for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
498 			{
499 				int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
500 				int remain= (wgIdx%((nSplitX*nSplitY)/4));
501 				int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
502 				int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
503 
504 				/*int zIdx = newIndex/(nSplitX*nSplitY);
505 				int remain = newIndex%(nSplitX*nSplitY);
506 				int yIdx = remain/nSplitX;
507 				int xIdx = remain%nSplitX;
508 				*/
509 				int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
510 			//	printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
511 			}
512 		}
513 	}
514 #endif
515 
516 	b3AlignedObjectArray<b3RigidBodyData> bodyNative;
517 	bodyBuf->copyToHost(bodyNative);
518 	b3AlignedObjectArray<b3InertiaData> shapeNative;
519 	shapeBuf->copyToHost(shapeNative);
520 	b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
521 	constraint->copyToHost(constraintNative);
522 
523 	b3AlignedObjectArray<unsigned int> numConstraintsHost;
524 	m_numConstraints->copyToHost(numConstraintsHost);
525 
526 	//printf("------------------------\n");
527 	b3AlignedObjectArray<unsigned int> offsetsHost;
528 	m_offsets->copyToHost(offsetsHost);
529 	static int frame = 0;
530 	bool useBatches = true;
531 	if (useBatches)
532 	{
533 		for (int iter = 0; iter < m_nIterations; iter++)
534 		{
535 			for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
536 			{
537 				int nSplitX = B3_SOLVER_N_SPLIT_X;
538 				int nSplitY = B3_SOLVER_N_SPLIT_Y;
539 				int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
540 				//printf("cell Batch %d\n",cellBatch);
541 				b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
542 				for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
543 				{
544 					usedBodies[i].resize(0);
545 				}
546 
547 				//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
548 				for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
549 				{
550 					int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
551 					int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
552 					int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
553 					int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
554 					int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
555 
556 					if (numConstraintsHost[cellIdx] == 0)
557 						continue;
558 
559 					//printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
560 					//printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
561 					if (zIdx)
562 					{
563 						//printf("?\n");
564 					}
565 
566 					if (iter == 0)
567 					{
568 						//printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
569 						//printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
570 					}
571 					const int start = offsetsHost[cellIdx];
572 					int numConstraintsInCell = numConstraintsHost[cellIdx];
573 					//				const int end = start + numConstraintsInCell;
574 
575 					SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
576 					task.m_solveFriction = false;
577 					task.run(0);
578 				}
579 			}
580 		}
581 
582 		for (int iter = 0; iter < m_nIterations; iter++)
583 		{
584 			for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
585 			{
586 				int nSplitX = B3_SOLVER_N_SPLIT_X;
587 				int nSplitY = B3_SOLVER_N_SPLIT_Y;
588 
589 				int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
590 
591 				for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
592 				{
593 					int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
594 					int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
595 					int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
596 					int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
597 
598 					int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
599 
600 					if (numConstraintsHost[cellIdx] == 0)
601 						continue;
602 
603 					//printf("yIdx=%d\n",yIdx);
604 
605 					const int start = offsetsHost[cellIdx];
606 					int numConstraintsInCell = numConstraintsHost[cellIdx];
607 					//				const int end = start + numConstraintsInCell;
608 
609 					SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
610 					task.m_solveFriction = true;
611 					task.run(0);
612 				}
613 			}
614 		}
615 	}
616 	else
617 	{
618 		for (int iter = 0; iter < m_nIterations; iter++)
619 		{
620 			SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
621 			task.m_solveFriction = false;
622 			task.run(0);
623 		}
624 
625 		for (int iter = 0; iter < m_nIterations; iter++)
626 		{
627 			SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
628 			task.m_solveFriction = true;
629 			task.run(0);
630 		}
631 	}
632 
633 	bodyBuf->copyFromHost(bodyNative);
634 	shapeBuf->copyFromHost(shapeNative);
635 	constraint->copyFromHost(constraintNative);
636 	frame++;
637 }
638 
checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,b3OpenCLArray<unsigned int> * m_numConstraints,b3OpenCLArray<unsigned int> * m_offsets,int batchId)639 void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
640 						  const b3OpenCLArray<b3InertiaData>* shapeBuf,
641 						  b3OpenCLArray<b3GpuConstraint4>* constraint,
642 						  b3OpenCLArray<unsigned int>* m_numConstraints,
643 						  b3OpenCLArray<unsigned int>* m_offsets,
644 						  int batchId)
645 {
646 	//						b3BufferInfoCL( m_numConstraints->getBufferCL() ),
647 	//						b3BufferInfoCL( m_offsets->getBufferCL() )
648 
649 	int cellBatch = batchId;
650 	const int nn = B3_SOLVER_N_CELLS;
651 	//	int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
652 
653 	b3AlignedObjectArray<unsigned int> gN;
654 	m_numConstraints->copyToHost(gN);
655 	b3AlignedObjectArray<unsigned int> gOffsets;
656 	m_offsets->copyToHost(gOffsets);
657 	int nSplitX = B3_SOLVER_N_SPLIT_X;
658 	int nSplitY = B3_SOLVER_N_SPLIT_Y;
659 
660 	//	int bIdx = batchId;
661 
662 	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
663 	constraint->copyToHost(cpuConstraints);
664 
665 	printf("batch = %d\n", batchId);
666 
667 	int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
668 	b3AlignedObjectArray<int> usedBodies;
669 
670 	for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
671 	{
672 		printf("wgIdx = %d           ", wgIdx);
673 
674 		int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
675 		int remain = wgIdx % ((nSplitX * nSplitY));
676 		int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
677 		int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
678 
679 		int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
680 		printf("cellIdx=%d\n", cellIdx);
681 		if (gN[cellIdx] == 0)
682 			continue;
683 
684 		const int start = gOffsets[cellIdx];
685 		const int end = start + gN[cellIdx];
686 
687 		for (int c = start; c < end; c++)
688 		{
689 			b3GpuConstraint4& constraint = cpuConstraints[c];
690 			//printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
691 			if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
692 			{
693 				printf("error?\n");
694 			}
695 			if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
696 			{
697 				printf("error?\n");
698 			}
699 		}
700 
701 		for (int c = start; c < end; c++)
702 		{
703 			b3GpuConstraint4& constraint = cpuConstraints[c];
704 			usedBodies.push_back(constraint.m_bodyA);
705 			usedBodies.push_back(constraint.m_bodyB);
706 		}
707 	}
708 }
709 
// When true, b3Solver::solveContactConstraint calls checkConstraintBatch for every
// cell batch before launching the contact kernel (the check prints to stdout).
static bool verify = false;
711 
solveContactConstraint(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches)712 void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
713 									  b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
714 {
715 	b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
716 	{
717 		const int nn = B3_SOLVER_N_CELLS;
718 
719 		cdata.x = 0;
720 		cdata.y = maxNumBatches;  //250;
721 
722 		int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
723 #ifdef DEBUG_ME
724 		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
725 		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
726 #endif
727 
728 		{
729 			B3_PROFILE("m_batchSolveKernel iterations");
730 			for (int iter = 0; iter < m_nIterations; iter++)
731 			{
732 				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
733 				{
734 					if (verify)
735 					{
736 						checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
737 					}
738 
739 #ifdef DEBUG_ME
740 					memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
741 					gpuDebugInfo.write(debugInfo, numWorkItems);
742 #endif
743 
744 					cdata.z = ib;
745 
746 					b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
747 #if 1
748 
749 					b3BufferInfoCL bInfo[] = {
750 
751 						b3BufferInfoCL(bodyBuf->getBufferCL()),
752 						b3BufferInfoCL(shapeBuf->getBufferCL()),
753 						b3BufferInfoCL(constraint->getBufferCL()),
754 						b3BufferInfoCL(m_numConstraints->getBufferCL()),
755 						b3BufferInfoCL(m_offsets->getBufferCL())
756 #ifdef DEBUG_ME
757 							,
758 						b3BufferInfoCL(&gpuDebugInfo)
759 #endif
760 					};
761 
762 					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
763 					//launcher.setConst(  cdata.x );
764 					launcher.setConst(cdata.y);
765 					launcher.setConst(cdata.z);
766 					b3Int4 nSplit;
767 					nSplit.x = B3_SOLVER_N_SPLIT_X;
768 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
769 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
770 
771 					launcher.setConst(nSplit);
772 					launcher.launch1D(numWorkItems, 64);
773 
774 #else
775 					const char* fileName = "m_batchSolveKernel.bin";
776 					FILE* f = fopen(fileName, "rb");
777 					if (f)
778 					{
779 						int sizeInBytes = 0;
780 						if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
781 						{
782 							printf("error, cannot get file size\n");
783 							exit(0);
784 						}
785 
786 						unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
787 						fread(buf, sizeInBytes, 1, f);
788 						int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
789 						int num = *(int*)&buf[serializedBytes];
790 
791 						launcher.launch1D(num);
792 
793 						//this clFinish is for testing on errors
794 						clFinish(m_queue);
795 					}
796 
797 #endif
798 
799 #ifdef DEBUG_ME
800 					clFinish(m_queue);
801 					gpuDebugInfo.read(debugInfo, numWorkItems);
802 					clFinish(m_queue);
803 					for (int i = 0; i < numWorkItems; i++)
804 					{
805 						if (debugInfo[i].m_valInt2 > 0)
806 						{
807 							printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
808 						}
809 
810 						if (debugInfo[i].m_valInt3 > 0)
811 						{
812 							printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
813 						}
814 					}
815 #endif  //DEBUG_ME
816 				}
817 			}
818 
819 			clFinish(m_queue);
820 		}
821 
822 		cdata.x = 1;
823 		bool applyFriction = true;
824 		if (applyFriction)
825 		{
826 			B3_PROFILE("m_batchSolveKernel iterations2");
827 			for (int iter = 0; iter < m_nIterations; iter++)
828 			{
829 				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
830 				{
831 					cdata.z = ib;
832 
833 					b3BufferInfoCL bInfo[] = {
834 						b3BufferInfoCL(bodyBuf->getBufferCL()),
835 						b3BufferInfoCL(shapeBuf->getBufferCL()),
836 						b3BufferInfoCL(constraint->getBufferCL()),
837 						b3BufferInfoCL(m_numConstraints->getBufferCL()),
838 						b3BufferInfoCL(m_offsets->getBufferCL())
839 #ifdef DEBUG_ME
840 							,
841 						b3BufferInfoCL(&gpuDebugInfo)
842 #endif  //DEBUG_ME
843 					};
844 					b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
845 					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
846 					//launcher.setConst(  cdata.x );
847 					launcher.setConst(cdata.y);
848 					launcher.setConst(cdata.z);
849 					b3Int4 nSplit;
850 					nSplit.x = B3_SOLVER_N_SPLIT_X;
851 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
852 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
853 
854 					launcher.setConst(nSplit);
855 
856 					launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
857 				}
858 			}
859 			clFinish(m_queue);
860 		}
861 #ifdef DEBUG_ME
862 		delete[] debugInfo;
863 #endif  //DEBUG_ME
864 	}
865 }
866 
convertToConstraints(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3Contact4> * contactsIn,b3OpenCLArray<b3GpuConstraint4> * contactCOut,void * additionalData,int nContacts,const ConstraintCfg & cfg)867 void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
868 									const b3OpenCLArray<b3InertiaData>* shapeBuf,
869 									b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
870 									int nContacts, const ConstraintCfg& cfg)
871 {
872 	//	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
873 	contactCOut->resize(nContacts);
874 	struct CB
875 	{
876 		int m_nContacts;
877 		float m_dt;
878 		float m_positionDrift;
879 		float m_positionConstraintCoeff;
880 	};
881 
882 	{
883 		CB cdata;
884 		cdata.m_nContacts = nContacts;
885 		cdata.m_dt = cfg.m_dt;
886 		cdata.m_positionDrift = cfg.m_positionDrift;
887 		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
888 
889 		if (gConvertConstraintOnCpu)
890 		{
891 			b3AlignedObjectArray<b3RigidBodyData> gBodies;
892 			bodyBuf->copyToHost(gBodies);
893 
894 			b3AlignedObjectArray<b3Contact4> gContact;
895 			contactsIn->copyToHost(gContact);
896 
897 			b3AlignedObjectArray<b3InertiaData> gShapes;
898 			shapeBuf->copyToHost(gShapes);
899 
900 			b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
901 			gConstraintOut.resize(nContacts);
902 
903 			B3_PROFILE("cpu contactToConstraintKernel");
904 			for (int gIdx = 0; gIdx < nContacts; gIdx++)
905 			{
906 				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
907 				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
908 
909 				b3Float4 posA = gBodies[aIdx].m_pos;
910 				b3Float4 linVelA = gBodies[aIdx].m_linVel;
911 				b3Float4 angVelA = gBodies[aIdx].m_angVel;
912 				float invMassA = gBodies[aIdx].m_invMass;
913 				b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
914 
915 				b3Float4 posB = gBodies[bIdx].m_pos;
916 				b3Float4 linVelB = gBodies[bIdx].m_linVel;
917 				b3Float4 angVelB = gBodies[bIdx].m_angVel;
918 				float invMassB = gBodies[bIdx].m_invMass;
919 				b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
920 
921 				b3ContactConstraint4_t cs;
922 
923 				setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
924 							   &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
925 							   &cs);
926 
927 				cs.m_batchIdx = gContact[gIdx].m_batchIdx;
928 
929 				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
930 			}
931 
932 			contactCOut->copyFromHost(gConstraintOut);
933 		}
934 		else
935 		{
936 			B3_PROFILE("gpu m_contactToConstraintKernel");
937 
938 			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
939 									  b3BufferInfoCL(contactCOut->getBufferCL())};
940 			b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
941 			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
942 			//launcher.setConst(  cdata );
943 
944 			launcher.setConst(cdata.m_nContacts);
945 			launcher.setConst(cdata.m_dt);
946 			launcher.setConst(cdata.m_positionDrift);
947 			launcher.setConst(cdata.m_positionConstraintCoeff);
948 
949 			launcher.launch1D(nContacts, 64);
950 			clFinish(m_queue);
951 		}
952 	}
953 }
954 
955 /*
956 void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
957 			b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
958 			int nContacts, const b3Solver::ConstraintCfg& cfg )
959 {
960 
961 
962 
963 	const int sortAlignment = 512; // todo. get this out of sort
964 	if( cfg.m_enableParallelSolve )
965 	{
966 
967 
968 		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
969 
970 		b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
971 		b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
972 
973 		{	//	2. set cell idx
974 			struct CB
975 			{
976 				int m_nContacts;
977 				int m_staticIdx;
978 				float m_scale;
979 				int m_nSplit;
980 			};
981 
982 			b3Assert( sortSize%64 == 0 );
983 			CB cdata;
984 			cdata.m_nContacts = nContacts;
985 			cdata.m_staticIdx = cfg.m_staticIdx;
986 			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
987 			cdata.m_nSplit = B3_SOLVER_N_SPLIT;
988 
989 
990 			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
991 			b3LauncherCL launcher( m_queue, m_setSortDataKernel );
992 			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
993 			launcher.setConst(  cdata );
994 			launcher.launch1D( sortSize, 64 );
995 		}
996 
997 		{	//	3. sort by cell idx
998 			int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
999 			int sortBit = 32;
1000 			//if( n <= 0xffff ) sortBit = 16;
1001 			//if( n <= 0xff ) sortBit = 8;
1002 			m_sort32->execute(*m_sortDataBuffer,sortSize);
1003 		}
1004 		{	//	4. find entries
1005 			m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
1006 
1007 			m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
1008 		}
1009 
1010 		{	//	5. sort constraints by cellIdx
1011 			//	todo. preallocate this
1012 //			b3Assert( contactsIn->getType() == TYPE_HOST );
1013 //			b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
1014 
1015 			{
1016 
1017 
1018 				b3Int4 cdata; cdata.x = nContacts;
1019 				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
1020 				b3LauncherCL launcher( m_queue, m_reorderContactKernel );
1021 				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
1022 				launcher.setConst(  cdata );
1023 				launcher.launch1D( nContacts, 64 );
1024 			}
1025 //			BufferUtils::unmap<true>( out, contactsIn, nContacts );
1026 		}
1027 	}
1028 
1029 
1030 }
1031 
1032 */
batchContacts(b3OpenCLArray<b3Contact4> * contacts,int nContacts,b3OpenCLArray<unsigned int> * nNative,b3OpenCLArray<unsigned int> * offsetsNative,int staticIdx)1033 void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
1034 {
1035 	int numWorkItems = 64 * B3_SOLVER_N_CELLS;
1036 	{
1037 		B3_PROFILE("batch generation");
1038 
1039 		b3Int4 cdata;
1040 		cdata.x = nContacts;
1041 		cdata.y = 0;
1042 		cdata.z = staticIdx;
1043 
1044 #ifdef BATCH_DEBUG
1045 		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
1046 		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
1047 		memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
1048 		gpuDebugInfo.write(debugInfo, numWorkItems);
1049 #endif
1050 
1051 #if 0
1052 		b3BufferInfoCL bInfo[] = {
1053 			b3BufferInfoCL( contacts->getBufferCL() ),
1054 			b3BufferInfoCL(  m_contactBuffer2->getBufferCL()),
1055 			b3BufferInfoCL( nNative->getBufferCL() ),
1056 			b3BufferInfoCL( offsetsNative->getBufferCL() ),
1057 #ifdef BATCH_DEBUG
1058 			,	b3BufferInfoCL(&gpuDebugInfo)
1059 #endif
1060 		};
1061 #endif
1062 
1063 		{
1064 			m_batchSizes.resize(nNative->size());
1065 			B3_PROFILE("batchingKernel");
1066 			//b3LauncherCL launcher( m_queue, m_batchingKernel);
1067 			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
1068 
1069 			b3LauncherCL launcher(m_queue, k, "*batchingKernel");
1070 			if (!useNewBatchingKernel)
1071 			{
1072 				launcher.setBuffer(contacts->getBufferCL());
1073 			}
1074 			launcher.setBuffer(m_contactBuffer2->getBufferCL());
1075 			launcher.setBuffer(nNative->getBufferCL());
1076 			launcher.setBuffer(offsetsNative->getBufferCL());
1077 
1078 			launcher.setBuffer(m_batchSizes.getBufferCL());
1079 
1080 			//launcher.setConst(  cdata );
1081 			launcher.setConst(staticIdx);
1082 
1083 			launcher.launch1D(numWorkItems, 64);
1084 			//clFinish(m_queue);
1085 			//b3AlignedObjectArray<int> batchSizesCPU;
1086 			//m_batchSizes.copyToHost(batchSizesCPU);
1087 			//printf(".\n");
1088 		}
1089 
1090 #ifdef BATCH_DEBUG
1091 		aaaa
1092 			b3Contact4* hostContacts = new b3Contact4[nContacts];
1093 		m_contactBuffer->read(hostContacts, nContacts);
1094 		clFinish(m_queue);
1095 
1096 		gpuDebugInfo.read(debugInfo, numWorkItems);
1097 		clFinish(m_queue);
1098 
1099 		for (int i = 0; i < numWorkItems; i++)
1100 		{
1101 			if (debugInfo[i].m_valInt1 > 0)
1102 			{
1103 				printf("catch\n");
1104 			}
1105 			if (debugInfo[i].m_valInt2 > 0)
1106 			{
1107 				printf("catch22\n");
1108 			}
1109 
1110 			if (debugInfo[i].m_valInt3 > 0)
1111 			{
1112 				printf("catch666\n");
1113 			}
1114 
1115 			if (debugInfo[i].m_valInt4 > 0)
1116 			{
1117 				printf("catch777\n");
1118 			}
1119 		}
1120 		delete[] debugInfo;
1121 #endif  //BATCH_DEBUG
1122 	}
1123 
1124 	//	copy buffer to buffer
1125 	//b3Assert(m_contactBuffer->size()==nContacts);
1126 	//contacts->copyFromOpenCLArray( *m_contactBuffer);
1127 	//clFinish(m_queue);//needed?
1128 }
1129