1 
// File-level debug / fallback switches. All default to the GPU path (false).

// When false, solveContacts takes the branch that sizes m_solverGPU->m_contactBuffer2
// and runs the "batching" stage; the true branch is not visible in this part of the file.
2 bool gUseLargeBatches = false;
// NOTE(review): not referenced in the visible part of this file; presumably selects
// host-side contact batching -- confirm against the rest of solveContacts.
3 bool gCpuBatchContacts = false;
// NOTE(review): not referenced in the visible part of this file; presumably selects a
// host-side constraint solve -- confirm.
4 bool gCpuSolveConstraint = false;
// When true, solveContacts copies the sort data to the host and quickSort()s it with
// sortfnc instead of running the GPU radix sort.
5 bool gCpuRadixSort = false;
// When true, solveContacts computes the per-contact cell index on the host via
// SetSortDataCPU instead of launching the set-sort-data kernel.
6 bool gCpuSetSortData = false;
// When true, the determinism pre-sort of contacts is done on the host with
// quickSort(b3ContactCmp) instead of the four GPU key/sort passes.
7 bool gCpuSortContactsDeterminism = false;
// NOTE(review): not referenced in the visible part of this file -- confirm usage.
8 bool gUseCpuCopyConstraints = false;
// When true, solveContacts runs the "find entries" step under the "cpuBoundSearch"
// profile zone, starting from host copies of the counts and sort data.
9 bool gUseScanHost = false;
// NOTE(review): not referenced in the visible part of this file -- confirm usage.
10 bool gReorderContactsOnCpu = false;
11 
// When true (the default), solveContacts sorts the incoming contacts before solving;
// the profile zones label this sort "(determinism)".
12 bool optionalSortContactsDeterminism = true;
13 
14 #include "b3GpuPgsContactSolver.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
16 
17 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
18 #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
19 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
20 #include <string.h>
21 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
22 #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
23 #include "b3Solver.h"
24 
25 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
26 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
27 #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
28 #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
29 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
30 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
31 
32 #include "kernels/solverSetup.h"
33 #include "kernels/solverSetup2.h"
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36 #include "kernels/batchingKernels.h"
37 #include "kernels/batchingKernelsNew.h"
38 
// Private state of b3GpuPgsContactSolver. The struct has no constructor of its own:
// every member is assigned by b3GpuPgsContactSolver's constructor and torn down by
// its destructor.
39 struct b3GpuBatchingPgsSolverInternalData
40 {
	// OpenCL handles copied from the solver's constructor arguments.
41 	cl_context m_context;
42 	cl_device_id m_device;
43 	cl_command_queue m_queue;
	// Pair capacity passed to the constructor; also used to size m_contactCGPU.
44 	int m_pairCapacity;
	// Set to 4 by the constructor.
45 	int m_nIterations;
46 
	// Device-side constraint array, allocated with m_pairCapacity elements.
47 	b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;
	// Both arrays are allocated and resized to B3_SOLVER_N_CELLS entries.
	// NOTE(review): the solve code visible in this file reads the arrays of the same
	// name owned by m_solverGPU rather than these -- confirm whether these are still used.
48 	b3OpenCLArray<unsigned int>* m_numConstraints;
49 	b3OpenCLArray<unsigned int>* m_offsets;
50 
	// Helper solver created by the constructor; the visible solve code uses its sorter,
	// sort-data buffer, counts/offsets arrays and some of its kernels.
51 	b3Solver* m_solverGPU;
52 
	// Kernels created in the constructor from the embedded kernel sources and released
	// with clReleaseKernel in the destructor.
53 	cl_kernel m_batchingKernel;
54 	cl_kernel m_batchingKernelNew;
55 	cl_kernel m_solveContactKernel;
56 	cl_kernel m_solveSingleContactKernel;
57 	cl_kernel m_solveSingleFrictionKernel;
58 	cl_kernel m_solveFrictionKernel;
59 	cl_kernel m_contactToConstraintKernel;
60 	cl_kernel m_setSortDataKernel;
61 	cl_kernel m_reorderContactKernel;
62 	cl_kernel m_copyConstraintKernel;
63 
	// Key-generation kernels used by the GPU determinism sort in solveContacts
	// (one per sort key: child shape B, child shape A, body B, body A).
64 	cl_kernel m_setDeterminismSortDataBodyAKernel;
65 	cl_kernel m_setDeterminismSortDataBodyBKernel;
66 	cl_kernel m_setDeterminismSortDataChildShapeAKernel;
67 	cl_kernel m_setDeterminismSortDataChildShapeBKernel;
68 
	// Parallel primitives created in the constructor.
	// NOTE(review): the visible sorting code calls m_solverGPU->m_sort32, not m_sort32.
69 	class b3RadixSort32CL* m_sort32;
70 	class b3BoundSearchCL* m_search;
71 	class b3PrefixScanCL* m_scan;
72 
	// m_sortDataBuffer is allocated with pairCapacity passed through
	// B3NEXTMULTIPLEOF(..., 512); m_contactBuffer is created without an initial size.
73 	b3OpenCLArray<b3SortData>* m_sortDataBuffer;
74 	b3OpenCLArray<b3Contact4>* m_contactBuffer;
75 
	// Array wrappers that solveContacts points at the caller's cl_mem buffers each call
	// (via setFromOpenCLBuffer).
76 	b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
77 	b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
78 	b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
79 
	// Scratch for the GPU determinism sort: a copy of the contacts plus the key/value
	// pairs being sorted; both are resized to the contact count on each call.
80 	b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
81 	b3OpenCLArray<b3SortData>* m_contactKeyValues;
82 
	// Host-side arrays. NOTE(review): not referenced in the visible part of this file.
83 	b3AlignedObjectArray<unsigned int> m_idxBuffer;
84 	b3AlignedObjectArray<b3SortData> m_sortData;
85 	b3AlignedObjectArray<b3Contact4> m_old;
86 
	// Batch sizes on the host and on the device (the device array is created empty in
	// the constructor).
87 	b3AlignedObjectArray<int> m_batchSizes;
88 	b3OpenCLArray<int>* m_batchSizesGpu;
89 };
90 
b3GpuPgsContactSolver(cl_context ctx,cl_device_id device,cl_command_queue q,int pairCapacity)91 b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
92 {
93 	m_debugOutput = 0;
94 	m_data = new b3GpuBatchingPgsSolverInternalData;
95 	m_data->m_context = ctx;
96 	m_data->m_device = device;
97 	m_data->m_queue = q;
98 	m_data->m_pairCapacity = pairCapacity;
99 	m_data->m_nIterations = 4;
100 	m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
101 	m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
102 	m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
103 	m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
104 
105 	m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
106 	m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
107 
108 	m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
109 
110 	m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
111 	m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
112 	m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
113 
114 	const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
115 
116 	m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
117 	m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
118 
119 	m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
120 	m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
121 
122 	m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
123 
124 	m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
125 	m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
126 	const char* additionalMacros = "";
127 	//const char* srcFileNameForCaching="";
128 
129 	cl_int pErrNum;
130 	const char* batchKernelSource = batchingKernelsCL;
131 	const char* batchKernelNewSource = batchingKernelsNewCL;
132 	const char* solverSetupSource = solverSetupCL;
133 	const char* solverSetup2Source = solverSetup2CL;
134 	const char* solveContactSource = solveContactCL;
135 	const char* solveFrictionSource = solveFrictionCL;
136 
137 	{
138 		cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
139 		b3Assert(solveContactProg);
140 
141 		cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
142 		b3Assert(solveFrictionProg);
143 
144 		cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
145 
146 		b3Assert(solverSetup2Prog);
147 
148 		cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
149 		b3Assert(solverSetupProg);
150 
151 		m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
152 		b3Assert(m_data->m_solveFrictionKernel);
153 
154 		m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
155 		b3Assert(m_data->m_solveContactKernel);
156 
157 		m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
158 		b3Assert(m_data->m_solveSingleContactKernel);
159 
160 		m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
161 		b3Assert(m_data->m_solveSingleFrictionKernel);
162 
163 		m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
164 		b3Assert(m_data->m_contactToConstraintKernel);
165 
166 		m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
167 		b3Assert(m_data->m_setSortDataKernel);
168 
169 		m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
170 		b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
171 
172 		m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
173 		b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
174 
175 		m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
176 		b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
177 
178 		m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
179 		b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
180 
181 		m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
182 		b3Assert(m_data->m_reorderContactKernel);
183 
184 		m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
185 		b3Assert(m_data->m_copyConstraintKernel);
186 	}
187 
188 	{
189 		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
190 		b3Assert(batchingProg);
191 
192 		m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
193 		b3Assert(m_data->m_batchingKernel);
194 	}
195 
196 	{
197 		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
198 		b3Assert(batchingNewProg);
199 
200 		m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
201 		b3Assert(m_data->m_batchingKernelNew);
202 	}
203 }
204 
~b3GpuPgsContactSolver()205 b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
206 {
207 	delete m_data->m_batchSizesGpu;
208 	delete m_data->m_bodyBufferGPU;
209 	delete m_data->m_inertiaBufferGPU;
210 	delete m_data->m_pBufContactOutGPU;
211 	delete m_data->m_pBufContactOutGPUCopy;
212 	delete m_data->m_contactKeyValues;
213 
214 	delete m_data->m_contactCGPU;
215 	delete m_data->m_numConstraints;
216 	delete m_data->m_offsets;
217 	delete m_data->m_sortDataBuffer;
218 	delete m_data->m_contactBuffer;
219 
220 	delete m_data->m_sort32;
221 	delete m_data->m_scan;
222 	delete m_data->m_search;
223 	delete m_data->m_solverGPU;
224 
225 	clReleaseKernel(m_data->m_batchingKernel);
226 	clReleaseKernel(m_data->m_batchingKernelNew);
227 	clReleaseKernel(m_data->m_solveSingleContactKernel);
228 	clReleaseKernel(m_data->m_solveSingleFrictionKernel);
229 	clReleaseKernel(m_data->m_solveContactKernel);
230 	clReleaseKernel(m_data->m_solveFrictionKernel);
231 
232 	clReleaseKernel(m_data->m_contactToConstraintKernel);
233 	clReleaseKernel(m_data->m_setSortDataKernel);
234 	clReleaseKernel(m_data->m_reorderContactKernel);
235 	clReleaseKernel(m_data->m_copyConstraintKernel);
236 
237 	clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
238 	clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
239 	clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
240 	clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
241 
242 	delete m_data;
243 }
244 
/// Settings handed to the contact-to-constraint / batching stage.
///
/// Every member now has a deterministic default. Previously m_enableParallelSolve
/// and m_batchCellSize were left uninitialized by the constructor, so reading them
/// before assigning was undefined behavior. Their defaults (true, 6) are the values
/// solveContacts assigns explicitly; callers that set them are unaffected.
struct b3ConstraintCfg
{
	/// @param dt  Time step stored in m_dt (defaults to 0).
	b3ConstraintCfg(float dt = 0.f)
		: m_positionDrift(0.005f),
		  m_positionConstraintCoeff(0.2f),
		  m_dt(dt),
		  m_enableParallelSolve(true),
		  m_batchCellSize(6.f),
		  m_staticIdx(0)
	{
	}

	// NOTE(review): the two position terms are not read in the visible part of this
	// file; presumably consumed by the constraint setup kernel -- confirm.
	float m_positionDrift;
	float m_positionConstraintCoeff;
	// Time step.
	float m_dt;
	// Gates the sort/batching path in solveContacts.
	bool m_enableParallelSolve;
	// solveContacts uses 1 / m_batchCellSize as the scale for the spatial cell index,
	// so this must stay non-zero.
	float m_batchCellSize;
	// Index of the body treated as static when computing sort keys.
	int m_staticIdx;
};
256 
solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,int numIterations,const b3AlignedObjectArray<int> * batchSizes)257 void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
258 															 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //const b3OpenCLArray<int>* gpuBatchSizes)
259 {
260 	B3_PROFILE("solveContactConstraintBatchSizes");
261 	int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
262 	for (int iter = 0; iter < numIterations; iter++)
263 	{
264 		for (int cellId = 0; cellId < numBatches; cellId++)
265 		{
266 			int offset = 0;
267 			for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
268 			{
269 				int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
270 				if (!numInBatch)
271 					break;
272 
273 				{
274 					b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
275 					launcher.setBuffer(bodyBuf->getBufferCL());
276 					launcher.setBuffer(shapeBuf->getBufferCL());
277 					launcher.setBuffer(constraint->getBufferCL());
278 					launcher.setConst(cellId);
279 					launcher.setConst(offset);
280 					launcher.setConst(numInBatch);
281 					launcher.launch1D(numInBatch);
282 					offset += numInBatch;
283 				}
284 			}
285 		}
286 	}
287 
288 	for (int iter = 0; iter < numIterations; iter++)
289 	{
290 		for (int cellId = 0; cellId < numBatches; cellId++)
291 		{
292 			int offset = 0;
293 			for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
294 			{
295 				int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
296 				if (!numInBatch)
297 					break;
298 
299 				{
300 					b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
301 					launcher.setBuffer(bodyBuf->getBufferCL());
302 					launcher.setBuffer(shapeBuf->getBufferCL());
303 					launcher.setBuffer(constraint->getBufferCL());
304 					launcher.setConst(cellId);
305 					launcher.setConst(offset);
306 					launcher.setConst(numInBatch);
307 					launcher.launch1D(numInBatch);
308 					offset += numInBatch;
309 				}
310 			}
311 		}
312 	}
313 }
314 
solveContactConstraint(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,int numIterations,const b3AlignedObjectArray<int> * batchSizes)315 void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
316 												   b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //,const b3OpenCLArray<int>* gpuBatchSizes)
317 {
318 	//sort the contacts
319 
320 	b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
321 	{
322 		const int nn = B3_SOLVER_N_CELLS;
323 
324 		cdata.x = 0;
325 		cdata.y = maxNumBatches;  //250;
326 
327 		int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
328 #ifdef DEBUG_ME
329 		SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
330 		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
331 #endif
332 
333 		{
334 			B3_PROFILE("m_batchSolveKernel iterations");
335 			for (int iter = 0; iter < numIterations; iter++)
336 			{
337 				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
338 				{
339 #ifdef DEBUG_ME
340 					memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
341 					gpuDebugInfo.write(debugInfo, numWorkItems);
342 #endif
343 
344 					cdata.z = ib;
345 
346 					b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
347 #if 1
348 
349 					b3BufferInfoCL bInfo[] = {
350 
351 						b3BufferInfoCL(bodyBuf->getBufferCL()),
352 						b3BufferInfoCL(shapeBuf->getBufferCL()),
353 						b3BufferInfoCL(constraint->getBufferCL()),
354 						b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
355 						b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
356 #ifdef DEBUG_ME
357 							,
358 						b3BufferInfoCL(&gpuDebugInfo)
359 #endif
360 					};
361 
362 					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
363 					launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
364 					//launcher.setConst(  cdata.x );
365 					launcher.setConst(cdata.y);
366 					launcher.setConst(cdata.z);
367 					b3Int4 nSplit;
368 					nSplit.x = B3_SOLVER_N_SPLIT_X;
369 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
370 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
371 
372 					launcher.setConst(nSplit);
373 					launcher.launch1D(numWorkItems, 64);
374 
375 #else
376 					const char* fileName = "m_batchSolveKernel.bin";
377 					FILE* f = fopen(fileName, "rb");
378 					if (f)
379 					{
380 						int sizeInBytes = 0;
381 						if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
382 						{
383 							printf("error, cannot get file size\n");
384 							exit(0);
385 						}
386 
387 						unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
388 						fread(buf, sizeInBytes, 1, f);
389 						int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
390 						int num = *(int*)&buf[serializedBytes];
391 
392 						launcher.launch1D(num);
393 
394 						//this clFinish is for testing on errors
395 						clFinish(m_queue);
396 					}
397 
398 #endif
399 
400 #ifdef DEBUG_ME
401 					clFinish(m_queue);
402 					gpuDebugInfo.read(debugInfo, numWorkItems);
403 					clFinish(m_queue);
404 					for (int i = 0; i < numWorkItems; i++)
405 					{
406 						if (debugInfo[i].m_valInt2 > 0)
407 						{
408 							printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
409 						}
410 
411 						if (debugInfo[i].m_valInt3 > 0)
412 						{
413 							printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
414 						}
415 					}
416 #endif  //DEBUG_ME
417 				}
418 			}
419 
420 			clFinish(m_data->m_queue);
421 		}
422 
423 		cdata.x = 1;
424 		bool applyFriction = true;
425 		if (applyFriction)
426 		{
427 			B3_PROFILE("m_batchSolveKernel iterations2");
428 			for (int iter = 0; iter < numIterations; iter++)
429 			{
430 				for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
431 				{
432 					cdata.z = ib;
433 
434 					b3BufferInfoCL bInfo[] = {
435 						b3BufferInfoCL(bodyBuf->getBufferCL()),
436 						b3BufferInfoCL(shapeBuf->getBufferCL()),
437 						b3BufferInfoCL(constraint->getBufferCL()),
438 						b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
439 						b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
440 #ifdef DEBUG_ME
441 							,
442 						b3BufferInfoCL(&gpuDebugInfo)
443 #endif  //DEBUG_ME
444 					};
445 					b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
446 					launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
447 					launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
448 					//launcher.setConst(  cdata.x );
449 					launcher.setConst(cdata.y);
450 					launcher.setConst(cdata.z);
451 
452 					b3Int4 nSplit;
453 					nSplit.x = B3_SOLVER_N_SPLIT_X;
454 					nSplit.y = B3_SOLVER_N_SPLIT_Y;
455 					nSplit.z = B3_SOLVER_N_SPLIT_Z;
456 
457 					launcher.setConst(nSplit);
458 
459 					launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
460 				}
461 			}
462 			clFinish(m_data->m_queue);
463 		}
464 #ifdef DEBUG_ME
465 		delete[] debugInfo;
466 #endif  //DEBUG_ME
467 	}
468 }
469 
sortfnc(const b3SortData & a,const b3SortData & b)470 static bool sortfnc(const b3SortData& a, const b3SortData& b)
471 {
472 	return (a.m_key < b.m_key);
473 }
474 
b3ContactCmp(const b3Contact4 & p,const b3Contact4 & q)475 static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
476 {
477 	return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
478 			((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
479 			((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
480 			((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
481 			((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
482 }
483 
// Compile-time selection of how SetSortDataCPU derives a contact's sort key:
//   USE_SPATIAL_BATCHING != 0 : key comes from the body position (spatial cell index).
//   USE_SPATIAL_BATCHING == 0 : key comes from a lookup table indexed by the two body
//                               indices; USE_4x4_GRID picks the 4x4 or 8x8 table.
#define USE_SPATIAL_BATCHING 1
#define USE_4x4_GRID 1

// The tables are consumed under "#if USE_SPATIAL_BATCHING ... #else", so they must be
// guarded by the macro's value, not by whether it is defined. With the former
// "#ifndef" guard, setting USE_SPATIAL_BATCHING to 0 selected the table path while
// leaving the tables undeclared.
#if !USE_SPATIAL_BATCHING
static const int gridTable4x4[] =
	{
		0, 1, 17, 16,
		1, 2, 18, 19,
		17, 18, 32, 3,
		16, 19, 3, 34};
static const int gridTable8x8[] =
	{
		0, 2, 3, 16, 17, 18, 19, 1,
		66, 64, 80, 67, 82, 81, 65, 83,
		131, 144, 128, 130, 147, 129, 145, 146,
		208, 195, 194, 192, 193, 211, 210, 209,
		21, 22, 23, 5, 4, 6, 7, 20,
		86, 85, 69, 87, 70, 68, 84, 71,
		151, 133, 149, 150, 135, 148, 132, 134,
		197, 27, 214, 213, 212, 199, 198, 196

};

#endif
508 
SetSortDataCPU(b3Contact4 * gContact,b3RigidBodyData * gBodies,b3SortData * gSortDataOut,int nContacts,float scale,const b3Int4 & nSplit,int staticIdx)509 void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
510 {
511 	for (int gIdx = 0; gIdx < nContacts; gIdx++)
512 	{
513 		if (gIdx < nContacts)
514 		{
515 			int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
516 			int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
517 
518 			int aIdx = abs(aPtrAndSignBit);
519 			int bIdx = abs(bPtrAndSignBit);
520 
521 			bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
522 
523 #if USE_SPATIAL_BATCHING
524 			int idx = (aStatic) ? bIdx : aIdx;
525 			b3Vector3 p = gBodies[idx].m_pos;
526 			int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
527 			int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
528 			int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
529 
530 			int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
531 
532 #else  //USE_SPATIAL_BATCHING
533 			bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
534 
535 #if USE_4x4_GRID
536 			int aa = aIdx & 3;
537 			int bb = bIdx & 3;
538 			if (aStatic)
539 				aa = bb;
540 			if (bStatic)
541 				bb = aa;
542 
543 			int gridIndex = aa + bb * 4;
544 			int newIndex = gridTable4x4[gridIndex];
545 #else   //USE_4x4_GRID
546 			int aa = aIdx & 7;
547 			int bb = bIdx & 7;
548 			if (aStatic)
549 				aa = bb;
550 			if (bStatic)
551 				bb = aa;
552 
553 			int gridIndex = aa + bb * 8;
554 			int newIndex = gridTable8x8[gridIndex];
555 #endif  //USE_4x4_GRID
556 #endif  //USE_SPATIAL_BATCHING
557 
558 			gSortDataOut[gIdx].x = newIndex;
559 			gSortDataOut[gIdx].y = gIdx;
560 		}
561 		else
562 		{
563 			gSortDataOut[gIdx].x = 0xffffffff;
564 		}
565 	}
566 }
567 
solveContacts(int numBodies,cl_mem bodyBuf,cl_mem inertiaBuf,int numContacts,cl_mem contactBuf,const b3Config & config,int static0Index)568 void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
569 {
570 	B3_PROFILE("solveContacts");
571 	m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
572 	m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
573 	m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
574 
575 	if (optionalSortContactsDeterminism)
576 	{
577 		if (!gCpuSortContactsDeterminism)
578 		{
579 			B3_PROFILE("GPU Sort contact constraints (determinism)");
580 
581 			m_data->m_pBufContactOutGPUCopy->resize(numContacts);
582 			m_data->m_contactKeyValues->resize(numContacts);
583 
584 			m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
585 
586 			{
587 				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
588 				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
589 				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
590 				launcher.setConst(numContacts);
591 				launcher.launch1D(numContacts, 64);
592 			}
593 			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
594 			{
595 				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
596 				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
597 				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
598 				launcher.setConst(numContacts);
599 				launcher.launch1D(numContacts, 64);
600 			}
601 			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
602 			{
603 				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
604 				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
605 				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
606 				launcher.setConst(numContacts);
607 				launcher.launch1D(numContacts, 64);
608 			}
609 
610 			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
611 
612 			{
613 				b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
614 				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
615 				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
616 				launcher.setConst(numContacts);
617 				launcher.launch1D(numContacts, 64);
618 			}
619 
620 			m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
621 
622 			{
623 				B3_PROFILE("gpu reorderContactKernel (determinism)");
624 
625 				b3Int4 cdata;
626 				cdata.x = numContacts;
627 
628 				//b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
629 				//	, b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
630 				b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
631 				launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
632 				launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
633 				launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
634 				launcher.setConst(cdata);
635 				launcher.launch1D(numContacts, 64);
636 			}
637 		}
638 		else
639 		{
640 			B3_PROFILE("CPU Sort contact constraints (determinism)");
641 			b3AlignedObjectArray<b3Contact4> cpuConstraints;
642 			m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
643 			bool sort = true;
644 			if (sort)
645 			{
646 				cpuConstraints.quickSort(b3ContactCmp);
647 
648 				for (int i = 0; i < cpuConstraints.size(); i++)
649 				{
650 					cpuConstraints[i].m_batchIdx = i;
651 				}
652 			}
653 			m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
654 			if (m_debugOutput == 100)
655 			{
656 				for (int i = 0; i < cpuConstraints.size(); i++)
657 				{
658 					printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
659 				}
660 			}
661 
662 			m_debugOutput++;
663 		}
664 	}
665 
666 	int nContactOut = m_data->m_pBufContactOutGPU->size();
667 
668 	bool useSolver = true;
669 
670 	if (useSolver)
671 	{
672 		float dt = 1. / 60.;
673 		b3ConstraintCfg csCfg(dt);
674 		csCfg.m_enableParallelSolve = true;
675 		csCfg.m_batchCellSize = 6;
676 		csCfg.m_staticIdx = static0Index;
677 
678 		b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
679 
680 		void* additionalData = 0;  //m_data->m_frictionCGPU;
681 		const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
682 		b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
683 		int nContacts = nContactOut;
684 
685 		int maxNumBatches = 0;
686 
687 		if (!gUseLargeBatches)
688 		{
689 			if (m_data->m_solverGPU->m_contactBuffer2)
690 			{
691 				m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
692 			}
693 
694 			if (m_data->m_solverGPU->m_contactBuffer2 == 0)
695 			{
696 				m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
697 				m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
698 			}
699 
700 			//clFinish(m_data->m_queue);
701 
702 			{
703 				B3_PROFILE("batching");
704 				//@todo: just reserve it, without copy of original contact (unless we use warmstarting)
705 
706 				//const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
707 
708 				{
709 					//b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
710 					//b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
711 
712 					const int sortAlignment = 512;  // todo. get this out of sort
713 					if (csCfg.m_enableParallelSolve)
714 					{
715 						int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
716 
717 						b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
718 						b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
719 
720 						if (!gCpuSetSortData)
721 						{  //	2. set cell idx
722 							B3_PROFILE("GPU set cell idx");
723 							struct CB
724 							{
725 								int m_nContacts;
726 								int m_staticIdx;
727 								float m_scale;
728 								b3Int4 m_nSplit;
729 							};
730 
731 							b3Assert(sortSize % 64 == 0);
732 							CB cdata;
733 							cdata.m_nContacts = nContacts;
734 							cdata.m_staticIdx = csCfg.m_staticIdx;
735 							cdata.m_scale = 1.f / csCfg.m_batchCellSize;
736 							cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
737 							cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
738 							cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
739 
740 							m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
741 
742 							b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
743 							b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
744 							launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
745 							launcher.setConst(cdata.m_nContacts);
746 							launcher.setConst(cdata.m_scale);
747 							launcher.setConst(cdata.m_nSplit);
748 							launcher.setConst(cdata.m_staticIdx);
749 
750 							launcher.launch1D(sortSize, 64);
751 						}
752 						else
753 						{
754 							m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
755 							b3AlignedObjectArray<b3SortData> sortDataCPU;
756 							m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
757 
758 							b3AlignedObjectArray<b3Contact4> contactCPU;
759 							m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
760 							b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
761 							bodyBuf->copyToHost(bodiesCPU);
762 							float scale = 1.f / csCfg.m_batchCellSize;
763 							b3Int4 nSplit;
764 							nSplit.x = B3_SOLVER_N_SPLIT_X;
765 							nSplit.y = B3_SOLVER_N_SPLIT_Y;
766 							nSplit.z = B3_SOLVER_N_SPLIT_Z;
767 
768 							SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
769 
770 							m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
771 						}
772 
773 						if (!gCpuRadixSort)
774 						{  //	3. sort by cell idx
775 							B3_PROFILE("gpuRadixSort");
776 							//int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
777 							//int sortBit = 32;
778 							//if( n <= 0xffff ) sortBit = 16;
779 							//if( n <= 0xff ) sortBit = 8;
780 							//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
781 							//adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
782 							b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
783 							this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
784 						}
785 						else
786 						{
787 							b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
788 							b3AlignedObjectArray<b3SortData> hostValues;
789 							keyValuesInOut.copyToHost(hostValues);
790 							hostValues.quickSort(sortfnc);
791 							keyValuesInOut.copyFromHost(hostValues);
792 						}
793 
794 						if (gUseScanHost)
795 						{
796 							//	4. find entries
797 							B3_PROFILE("cpuBoundSearch");
798 							b3AlignedObjectArray<unsigned int> countsHost;
799 							countsNative->copyToHost(countsHost);
800 
801 							b3AlignedObjectArray<b3SortData> sortDataHost;
802 							m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
803 
804 							//m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
805 							m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
806 
807 							countsNative->copyFromHost(countsHost);
808 
809 							//adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
810 							//	B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
811 
812 							//unsigned int sum;
813 							//m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
814 							b3AlignedObjectArray<unsigned int> offsetsHost;
815 							offsetsHost.resize(offsetsNative->size());
816 
817 							m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS);  //,&sum );
818 							offsetsNative->copyFromHost(offsetsHost);
819 
820 							//printf("sum = %d\n",sum);
821 						}
822 						else
823 						{
824 							//	4. find entries
825 							B3_PROFILE("gpuBoundSearch");
826 							m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
827 							m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS);  //,&sum );
828 						}
829 
830 						if (nContacts)
831 						{  //	5. sort constraints by cellIdx
832 							if (gReorderContactsOnCpu)
833 							{
834 								B3_PROFILE("cpu m_reorderContactKernel");
835 								b3AlignedObjectArray<b3SortData> sortDataHost;
836 								m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
837 								b3AlignedObjectArray<b3Contact4> inContacts;
838 								b3AlignedObjectArray<b3Contact4> outContacts;
839 								m_data->m_pBufContactOutGPU->copyToHost(inContacts);
840 								outContacts.resize(inContacts.size());
841 								for (int i = 0; i < nContacts; i++)
842 								{
843 									int srcIdx = sortDataHost[i].y;
844 									outContacts[i] = inContacts[srcIdx];
845 								}
846 								m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
847 
848 								/*								"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
849 								"{\n"
850 								"	int nContacts = cb.x;\n"
851 								"	int gIdx = GET_GLOBAL_IDX;\n"
852 								"	if( gIdx < nContacts )\n"
853 								"	{\n"
854 								"		int srcIdx = sortData[gIdx].y;\n"
855 								"		out[gIdx] = in[srcIdx];\n"
856 								"	}\n"
857 								"}\n"
858 								*/
859 							}
860 							else
861 							{
862 								B3_PROFILE("gpu m_reorderContactKernel");
863 
864 								b3Int4 cdata;
865 								cdata.x = nContacts;
866 
867 								b3BufferInfoCL bInfo[] = {
868 									b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
869 									b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
870 
871 								b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
872 								launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
873 								launcher.setConst(cdata);
874 								launcher.launch1D(nContacts, 64);
875 							}
876 						}
877 					}
878 				}
879 
880 				//clFinish(m_data->m_queue);
881 
882 				//				{
883 				//				b3AlignedObjectArray<unsigned int> histogram;
884 				//				m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
885 				//				printf(",,,\n");
886 				//				}
887 
888 				if (nContacts)
889 				{
890 					if (gUseCpuCopyConstraints)
891 					{
892 						for (int i = 0; i < nContacts; i++)
893 						{
894 							m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
895 							//							m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
896 							//						m_data->m_pBufContactOutGPU->getBufferCL()
897 						}
898 					}
899 					else
900 					{
901 						B3_PROFILE("gpu m_copyConstraintKernel");
902 						b3Int4 cdata;
903 						cdata.x = nContacts;
904 						b3BufferInfoCL bInfo[] = {
905 							b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
906 							b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
907 
908 						b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
909 						launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
910 						launcher.setConst(cdata);
911 						launcher.launch1D(nContacts, 64);
912 						//we use the clFinish for proper benchmark/profile
913 						clFinish(m_data->m_queue);
914 					}
915 				}
916 
917 				//				bool compareGPU = false;
918 				if (nContacts)
919 				{
920 					if (!gCpuBatchContacts)
921 					{
922 						B3_PROFILE("gpu batchContacts");
923 						maxNumBatches = 250;  //250;
924 						m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
925 						clFinish(m_data->m_queue);
926 					}
927 					else
928 					{
929 						B3_PROFILE("cpu batchContacts");
930 						static b3AlignedObjectArray<b3Contact4> cpuContacts;
931 						b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
932 						{
933 							B3_PROFILE("copyToHost");
934 							contactsIn->copyToHost(cpuContacts);
935 						}
936 						b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
937 						b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
938 
939 						b3AlignedObjectArray<unsigned int> nNativeHost;
940 						b3AlignedObjectArray<unsigned int> offsetsNativeHost;
941 
942 						{
943 							B3_PROFILE("countsNative/offsetsNative copyToHost");
944 							countsNative->copyToHost(nNativeHost);
945 							offsetsNative->copyToHost(offsetsNativeHost);
946 						}
947 
948 						int numNonzeroGrid = 0;
949 
950 						if (gUseLargeBatches)
951 						{
952 							m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
953 							int totalNumConstraints = cpuContacts.size();
954 							//int simdWidth =numBodies+1;//-1;//64;//-1;//32;
955 							int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //	on GPU
956 							maxNumBatches = b3Max(numBatches, maxNumBatches);
957 							static int globalMaxBatch = 0;
958 							if (maxNumBatches > globalMaxBatch)
959 							{
960 								globalMaxBatch = maxNumBatches;
961 								b3Printf("maxNumBatches = %d\n", maxNumBatches);
962 							}
963 						}
964 						else
965 						{
966 							m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
967 							B3_PROFILE("cpu batch grid");
968 							for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
969 							{
970 								int n = (nNativeHost)[i];
971 								int offset = (offsetsNativeHost)[i];
972 								if (n)
973 								{
974 									numNonzeroGrid++;
975 									int simdWidth = numBodies + 1;                                                                                                                                 //-1;//64;//-1;//32;
976 									int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]);  //	on GPU
977 									maxNumBatches = b3Max(numBatches, maxNumBatches);
978 									static int globalMaxBatch = 0;
979 									if (maxNumBatches > globalMaxBatch)
980 									{
981 										globalMaxBatch = maxNumBatches;
982 										b3Printf("maxNumBatches = %d\n", maxNumBatches);
983 									}
984 									//we use the clFinish for proper benchmark/profile
985 								}
986 							}
987 							//clFinish(m_data->m_queue);
988 						}
989 						{
990 							B3_PROFILE("m_contactBuffer->copyFromHost");
991 							m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
992 						}
993 					}
994 				}
995 			}
996 		}
997 
998 		//printf("maxNumBatches = %d\n", maxNumBatches);
999 
1000 		if (gUseLargeBatches)
1001 		{
1002 			if (nContacts)
1003 			{
1004 				B3_PROFILE("cpu batchContacts");
1005 				static b3AlignedObjectArray<b3Contact4> cpuContacts;
1006 				//				b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
1007 				{
1008 					B3_PROFILE("copyToHost");
1009 					m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
1010 				}
1011 				//				b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
1012 				//				b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
1013 
1014 				//				int numNonzeroGrid=0;
1015 
1016 				{
1017 					m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
1018 					int totalNumConstraints = cpuContacts.size();
1019 					//				int simdWidth =numBodies+1;//-1;//64;//-1;//32;
1020 					int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //	on GPU
1021 					maxNumBatches = b3Max(numBatches, maxNumBatches);
1022 					static int globalMaxBatch = 0;
1023 					if (maxNumBatches > globalMaxBatch)
1024 					{
1025 						globalMaxBatch = maxNumBatches;
1026 						b3Printf("maxNumBatches = %d\n", maxNumBatches);
1027 					}
1028 				}
1029 				{
1030 					B3_PROFILE("m_contactBuffer->copyFromHost");
1031 					m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
1032 				}
1033 			}
1034 		}
1035 
1036 		if (nContacts)
1037 		{
1038 			B3_PROFILE("gpu convertToConstraints");
1039 			m_data->m_solverGPU->convertToConstraints(bodyBuf,
1040 													  shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
1041 													  contactConstraintOut,
1042 													  additionalData, nContacts,
1043 													  (b3SolverBase::ConstraintCfg&)csCfg);
1044 			clFinish(m_data->m_queue);
1045 		}
1046 
1047 		if (1)
1048 		{
1049 			int numIter = 4;
1050 
1051 			m_data->m_solverGPU->m_nIterations = numIter;  //10
1052 			if (!gCpuSolveConstraint)
1053 			{
1054 				B3_PROFILE("GPU solveContactConstraint");
1055 
1056 				/*m_data->m_solverGPU->solveContactConstraint(
1057 				m_data->m_bodyBufferGPU,
1058 				m_data->m_inertiaBufferGPU,
1059 				m_data->m_contactCGPU,0,
1060 				nContactOut ,
1061 				maxNumBatches);
1062 				*/
1063 
1064 				//m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
1065 
1066 				if (gUseLargeBatches)
1067 				{
1068 					solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
1069 													 m_data->m_inertiaBufferGPU,
1070 													 m_data->m_contactCGPU, 0,
1071 													 nContactOut,
1072 													 maxNumBatches, numIter, &m_data->m_batchSizes);
1073 				}
1074 				else
1075 				{
1076 					solveContactConstraint(
1077 						m_data->m_bodyBufferGPU,
1078 						m_data->m_inertiaBufferGPU,
1079 						m_data->m_contactCGPU, 0,
1080 						nContactOut,
1081 						maxNumBatches, numIter, &m_data->m_batchSizes);  //m_data->m_batchSizesGpu);
1082 				}
1083 			}
1084 			else
1085 			{
1086 				B3_PROFILE("Host solveContactConstraint");
1087 
1088 				m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
1089 			}
1090 		}
1091 
1092 #if 0
1093         if (0)
1094         {
1095             B3_PROFILE("read body velocities back to CPU");
1096             //read body updated linear/angular velocities back to CPU
1097             m_data->m_bodyBufferGPU->read(
1098                                                   m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
1099             adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
1100         }
1101 #endif
1102 	}
1103 }
1104 
batchContacts(b3OpenCLArray<b3Contact4> * contacts,int nContacts,b3OpenCLArray<unsigned int> * n,b3OpenCLArray<unsigned int> * offsets,int staticIdx)1105 void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
1106 {
1107 }
1108 
1109 b3AlignedObjectArray<unsigned int> idxBuffer;
1110 b3AlignedObjectArray<b3SortData> sortData;
1111 b3AlignedObjectArray<b3Contact4> old;
1112 
sortConstraintByBatch(b3Contact4 * cs,int n,int simdWidth,int staticIdx,int numBodies)1113 inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
1114 {
1115 	B3_PROFILE("sortConstraintByBatch");
1116 	int numIter = 0;
1117 
1118 	sortData.resize(n);
1119 	idxBuffer.resize(n);
1120 	old.resize(n);
1121 
1122 	unsigned int* idxSrc = &idxBuffer[0];
1123 	unsigned int* idxDst = &idxBuffer[0];
1124 	int nIdxSrc, nIdxDst;
1125 
1126 	const int N_FLG = 256;
1127 	const int FLG_MASK = N_FLG - 1;
1128 	unsigned int flg[N_FLG / 32];
1129 #if defined(_DEBUG)
1130 	for (int i = 0; i < n; i++)
1131 		cs[i].getBatchIdx() = -1;
1132 #endif
1133 	for (int i = 0; i < n; i++)
1134 		idxSrc[i] = i;
1135 	nIdxSrc = n;
1136 
1137 	int batchIdx = 0;
1138 
1139 	{
1140 		B3_PROFILE("cpu batch innerloop");
1141 		while (nIdxSrc)
1142 		{
1143 			numIter++;
1144 			nIdxDst = 0;
1145 			int nCurrentBatch = 0;
1146 
1147 			//	clear flag
1148 			for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1149 
1150 			for (int i = 0; i < nIdxSrc; i++)
1151 			{
1152 				int idx = idxSrc[i];
1153 
1154 				b3Assert(idx < n);
1155 				//	check if it can go
1156 				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1157 				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1158 
1159 				int bodyA = abs(bodyAS);
1160 				int bodyB = abs(bodyBS);
1161 
1162 				int aIdx = bodyA & FLG_MASK;
1163 				int bIdx = bodyB & FLG_MASK;
1164 
1165 				unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
1166 				unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
1167 
1168 				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1169 				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1170 
1171 				//use inv_mass!
1172 				aUnavailable = !aIsStatic ? aUnavailable : 0;  //
1173 				bUnavailable = !bIsStatic ? bUnavailable : 0;
1174 
1175 				if (aUnavailable == 0 && bUnavailable == 0)  // ok
1176 				{
1177 					if (!aIsStatic)
1178 						flg[aIdx / 32] |= (1 << (aIdx & 31));
1179 					if (!bIsStatic)
1180 						flg[bIdx / 32] |= (1 << (bIdx & 31));
1181 
1182 					cs[idx].getBatchIdx() = batchIdx;
1183 					sortData[idx].m_key = batchIdx;
1184 					sortData[idx].m_value = idx;
1185 
1186 					{
1187 						nCurrentBatch++;
1188 						if (nCurrentBatch == simdWidth)
1189 						{
1190 							nCurrentBatch = 0;
1191 							for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1192 						}
1193 					}
1194 				}
1195 				else
1196 				{
1197 					idxDst[nIdxDst++] = idx;
1198 				}
1199 			}
1200 			b3Swap(idxSrc, idxDst);
1201 			b3Swap(nIdxSrc, nIdxDst);
1202 			batchIdx++;
1203 		}
1204 	}
1205 	{
1206 		B3_PROFILE("quickSort");
1207 		sortData.quickSort(sortfnc);
1208 	}
1209 
1210 	{
1211 		B3_PROFILE("reorder");
1212 		//	reorder
1213 
1214 		memcpy(&old[0], cs, sizeof(b3Contact4) * n);
1215 		for (int i = 0; i < n; i++)
1216 		{
1217 			int idx = sortData[i].m_value;
1218 			cs[i] = old[idx];
1219 		}
1220 	}
1221 
1222 #if defined(_DEBUG)
1223 	//		debugPrintf( "nBatches: %d\n", batchIdx );
1224 	for (int i = 0; i < n; i++)
1225 	{
1226 		b3Assert(cs[i].getBatchIdx() != -1);
1227 	}
1228 #endif
1229 	return batchIdx;
1230 }
1231 
1232 b3AlignedObjectArray<int> bodyUsed2;
1233 
sortConstraintByBatch2(b3Contact4 * cs,int numConstraints,int simdWidth,int staticIdx,int numBodies)1234 inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
1235 {
1236 	B3_PROFILE("sortConstraintByBatch2");
1237 
1238 	bodyUsed2.resize(2 * simdWidth);
1239 
1240 	for (int q = 0; q < 2 * simdWidth; q++)
1241 		bodyUsed2[q] = 0;
1242 
1243 	int curBodyUsed = 0;
1244 
1245 	int numIter = 0;
1246 
1247 	m_data->m_sortData.resize(numConstraints);
1248 	m_data->m_idxBuffer.resize(numConstraints);
1249 	m_data->m_old.resize(numConstraints);
1250 
1251 	unsigned int* idxSrc = &m_data->m_idxBuffer[0];
1252 
1253 #if defined(_DEBUG)
1254 	for (int i = 0; i < numConstraints; i++)
1255 		cs[i].getBatchIdx() = -1;
1256 #endif
1257 	for (int i = 0; i < numConstraints; i++)
1258 		idxSrc[i] = i;
1259 
1260 	int numValidConstraints = 0;
1261 	//	int unprocessedConstraintIndex = 0;
1262 
1263 	int batchIdx = 0;
1264 
1265 	{
1266 		B3_PROFILE("cpu batch innerloop");
1267 
1268 		while (numValidConstraints < numConstraints)
1269 		{
1270 			numIter++;
1271 			int nCurrentBatch = 0;
1272 			//	clear flag
1273 			for (int i = 0; i < curBodyUsed; i++)
1274 				bodyUsed2[i] = 0;
1275 			curBodyUsed = 0;
1276 
1277 			for (int i = numValidConstraints; i < numConstraints; i++)
1278 			{
1279 				int idx = idxSrc[i];
1280 				b3Assert(idx < numConstraints);
1281 				//	check if it can go
1282 				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1283 				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1284 				int bodyA = abs(bodyAS);
1285 				int bodyB = abs(bodyBS);
1286 				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1287 				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1288 				int aUnavailable = 0;
1289 				int bUnavailable = 0;
1290 				if (!aIsStatic)
1291 				{
1292 					for (int j = 0; j < curBodyUsed; j++)
1293 					{
1294 						if (bodyA == bodyUsed2[j])
1295 						{
1296 							aUnavailable = 1;
1297 							break;
1298 						}
1299 					}
1300 				}
1301 				if (!aUnavailable)
1302 					if (!bIsStatic)
1303 					{
1304 						for (int j = 0; j < curBodyUsed; j++)
1305 						{
1306 							if (bodyB == bodyUsed2[j])
1307 							{
1308 								bUnavailable = 1;
1309 								break;
1310 							}
1311 						}
1312 					}
1313 
1314 				if (aUnavailable == 0 && bUnavailable == 0)  // ok
1315 				{
1316 					if (!aIsStatic)
1317 					{
1318 						bodyUsed2[curBodyUsed++] = bodyA;
1319 					}
1320 					if (!bIsStatic)
1321 					{
1322 						bodyUsed2[curBodyUsed++] = bodyB;
1323 					}
1324 
1325 					cs[idx].getBatchIdx() = batchIdx;
1326 					m_data->m_sortData[idx].m_key = batchIdx;
1327 					m_data->m_sortData[idx].m_value = idx;
1328 
1329 					if (i != numValidConstraints)
1330 					{
1331 						b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
1332 					}
1333 
1334 					numValidConstraints++;
1335 					{
1336 						nCurrentBatch++;
1337 						if (nCurrentBatch == simdWidth)
1338 						{
1339 							nCurrentBatch = 0;
1340 							for (int i = 0; i < curBodyUsed; i++)
1341 								bodyUsed2[i] = 0;
1342 
1343 							curBodyUsed = 0;
1344 						}
1345 					}
1346 				}
1347 			}
1348 
1349 			batchIdx++;
1350 		}
1351 	}
1352 	{
1353 		B3_PROFILE("quickSort");
1354 		//m_data->m_sortData.quickSort(sortfnc);
1355 	}
1356 
1357 	{
1358 		B3_PROFILE("reorder");
1359 		//	reorder
1360 
1361 		memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
1362 
1363 		for (int i = 0; i < numConstraints; i++)
1364 		{
1365 			b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
1366 			int idx = m_data->m_sortData[idxSrc[i]].m_value;
1367 			cs[i] = m_data->m_old[idx];
1368 		}
1369 	}
1370 
1371 #if defined(_DEBUG)
1372 	//		debugPrintf( "nBatches: %d\n", batchIdx );
1373 	for (int i = 0; i < numConstraints; i++)
1374 	{
1375 		b3Assert(cs[i].getBatchIdx() != -1);
1376 	}
1377 #endif
1378 
1379 	return batchIdx;
1380 }
1381 
1382 b3AlignedObjectArray<int> bodyUsed;
1383 b3AlignedObjectArray<int> curUsed;
1384 
sortConstraintByBatch3(b3Contact4 * cs,int numConstraints,int simdWidth,int staticIdx,int numBodies,int * batchSizes)1385 inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
1386 {
1387 	B3_PROFILE("sortConstraintByBatch3");
1388 
1389 	static int maxSwaps = 0;
1390 	int numSwaps = 0;
1391 
1392 	curUsed.resize(2 * simdWidth);
1393 
1394 	static int maxNumConstraints = 0;
1395 	if (maxNumConstraints < numConstraints)
1396 	{
1397 		maxNumConstraints = numConstraints;
1398 		//printf("maxNumConstraints  = %d\n",maxNumConstraints );
1399 	}
1400 
1401 	int numUsedArray = numBodies / 32 + 1;
1402 	bodyUsed.resize(numUsedArray);
1403 
1404 	for (int q = 0; q < numUsedArray; q++)
1405 		bodyUsed[q] = 0;
1406 
1407 	int curBodyUsed = 0;
1408 
1409 	int numIter = 0;
1410 
1411 	m_data->m_sortData.resize(0);
1412 	m_data->m_idxBuffer.resize(0);
1413 	m_data->m_old.resize(0);
1414 
1415 #if defined(_DEBUG)
1416 	for (int i = 0; i < numConstraints; i++)
1417 		cs[i].getBatchIdx() = -1;
1418 #endif
1419 
1420 	int numValidConstraints = 0;
1421 	//	int unprocessedConstraintIndex = 0;
1422 
1423 	int batchIdx = 0;
1424 
1425 	{
1426 		B3_PROFILE("cpu batch innerloop");
1427 
1428 		while (numValidConstraints < numConstraints)
1429 		{
1430 			numIter++;
1431 			int nCurrentBatch = 0;
1432 			batchSizes[batchIdx] = 0;
1433 
1434 			//	clear flag
1435 			for (int i = 0; i < curBodyUsed; i++)
1436 				bodyUsed[curUsed[i] / 32] = 0;
1437 
1438 			curBodyUsed = 0;
1439 
1440 			for (int i = numValidConstraints; i < numConstraints; i++)
1441 			{
1442 				int idx = i;
1443 				b3Assert(idx < numConstraints);
1444 				//	check if it can go
1445 				int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1446 				int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1447 				int bodyA = abs(bodyAS);
1448 				int bodyB = abs(bodyBS);
1449 				bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1450 				bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1451 				int aUnavailable = 0;
1452 				int bUnavailable = 0;
1453 				if (!aIsStatic)
1454 				{
1455 					aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
1456 				}
1457 				if (!aUnavailable)
1458 					if (!bIsStatic)
1459 					{
1460 						bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
1461 					}
1462 
1463 				if (aUnavailable == 0 && bUnavailable == 0)  // ok
1464 				{
1465 					if (!aIsStatic)
1466 					{
1467 						bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
1468 						curUsed[curBodyUsed++] = bodyA;
1469 					}
1470 					if (!bIsStatic)
1471 					{
1472 						bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
1473 						curUsed[curBodyUsed++] = bodyB;
1474 					}
1475 
1476 					cs[idx].getBatchIdx() = batchIdx;
1477 
1478 					if (i != numValidConstraints)
1479 					{
1480 						b3Swap(cs[i], cs[numValidConstraints]);
1481 						numSwaps++;
1482 					}
1483 
1484 					numValidConstraints++;
1485 					{
1486 						nCurrentBatch++;
1487 						if (nCurrentBatch == simdWidth)
1488 						{
1489 							batchSizes[batchIdx] += simdWidth;
1490 							nCurrentBatch = 0;
1491 							for (int i = 0; i < curBodyUsed; i++)
1492 								bodyUsed[curUsed[i] / 32] = 0;
1493 							curBodyUsed = 0;
1494 						}
1495 					}
1496 				}
1497 			}
1498 
1499 			if (batchIdx >= B3_MAX_NUM_BATCHES)
1500 			{
1501 				b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
1502 				b3Assert(0);
1503 				break;
1504 			}
1505 
1506 			batchSizes[batchIdx] += nCurrentBatch;
1507 
1508 			batchIdx++;
1509 		}
1510 	}
1511 
1512 #if defined(_DEBUG)
1513 	//		debugPrintf( "nBatches: %d\n", batchIdx );
1514 	for (int i = 0; i < numConstraints; i++)
1515 	{
1516 		b3Assert(cs[i].getBatchIdx() != -1);
1517 	}
1518 #endif
1519 
1520 	batchSizes[batchIdx] = 0;
1521 
1522 	if (maxSwaps < numSwaps)
1523 	{
1524 		maxSwaps = numSwaps;
1525 		//printf("maxSwaps = %d\n", maxSwaps);
1526 	}
1527 
1528 	return batchIdx;
1529 }
1530