1
// Global debug switches for b3GpuPgsContactSolver. All of them default to the
// GPU implementation of the corresponding pipeline stage.

// When false, solveContacts takes the branch that sorts contacts into cells
// before batching.
bool gUseLargeBatches = false;

// NOTE(review): the next two flags are not referenced in the visible part of
// this file; presumably they select host-side batching / solving -- confirm.
bool gCpuBatchContacts = false;
bool gCpuSolveConstraint = false;

// When true, the cell-index sort runs on the host (quickSort) instead of the
// GPU radix sort.
bool gCpuRadixSort = false;

// When true, the per-contact sort keys are computed by SetSortDataCPU instead
// of the set-sort-data kernel.
bool gCpuSetSortData = false;

// When true, the determinism pre-sort of the contacts is done on the host.
bool gCpuSortContactsDeterminism = false;

// NOTE(review): not referenced in the visible part of this file.
bool gUseCpuCopyConstraints = false;

// When true, the bound search and prefix scan use their executeHost variants.
bool gUseScanHost = false;

// When true, contacts are reordered by sort index on the host.
bool gReorderContactsOnCpu = false;

// When true, solveContacts sorts the incoming contacts (on GPU or CPU, see
// gCpuSortContactsDeterminism) before doing anything else.
bool optionalSortContactsDeterminism = true;
13
14 #include "b3GpuPgsContactSolver.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
16
17 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
18 #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
19 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
20 #include <string.h>
21 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
22 #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
23 #include "b3Solver.h"
24
25 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
26 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
27 #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
28 #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
29 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
30 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
31
32 #include "kernels/solverSetup.h"
33 #include "kernels/solverSetup2.h"
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36 #include "kernels/batchingKernels.h"
37 #include "kernels/batchingKernelsNew.h"
38
39 struct b3GpuBatchingPgsSolverInternalData
40 {
41 cl_context m_context;
42 cl_device_id m_device;
43 cl_command_queue m_queue;
44 int m_pairCapacity;
45 int m_nIterations;
46
47 b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;
48 b3OpenCLArray<unsigned int>* m_numConstraints;
49 b3OpenCLArray<unsigned int>* m_offsets;
50
51 b3Solver* m_solverGPU;
52
53 cl_kernel m_batchingKernel;
54 cl_kernel m_batchingKernelNew;
55 cl_kernel m_solveContactKernel;
56 cl_kernel m_solveSingleContactKernel;
57 cl_kernel m_solveSingleFrictionKernel;
58 cl_kernel m_solveFrictionKernel;
59 cl_kernel m_contactToConstraintKernel;
60 cl_kernel m_setSortDataKernel;
61 cl_kernel m_reorderContactKernel;
62 cl_kernel m_copyConstraintKernel;
63
64 cl_kernel m_setDeterminismSortDataBodyAKernel;
65 cl_kernel m_setDeterminismSortDataBodyBKernel;
66 cl_kernel m_setDeterminismSortDataChildShapeAKernel;
67 cl_kernel m_setDeterminismSortDataChildShapeBKernel;
68
69 class b3RadixSort32CL* m_sort32;
70 class b3BoundSearchCL* m_search;
71 class b3PrefixScanCL* m_scan;
72
73 b3OpenCLArray<b3SortData>* m_sortDataBuffer;
74 b3OpenCLArray<b3Contact4>* m_contactBuffer;
75
76 b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
77 b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
78 b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
79
80 b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
81 b3OpenCLArray<b3SortData>* m_contactKeyValues;
82
83 b3AlignedObjectArray<unsigned int> m_idxBuffer;
84 b3AlignedObjectArray<b3SortData> m_sortData;
85 b3AlignedObjectArray<b3Contact4> m_old;
86
87 b3AlignedObjectArray<int> m_batchSizes;
88 b3OpenCLArray<int>* m_batchSizesGpu;
89 };
90
b3GpuPgsContactSolver(cl_context ctx,cl_device_id device,cl_command_queue q,int pairCapacity)91 b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
92 {
93 m_debugOutput = 0;
94 m_data = new b3GpuBatchingPgsSolverInternalData;
95 m_data->m_context = ctx;
96 m_data->m_device = device;
97 m_data->m_queue = q;
98 m_data->m_pairCapacity = pairCapacity;
99 m_data->m_nIterations = 4;
100 m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
101 m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
102 m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
103 m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
104
105 m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
106 m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
107
108 m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
109
110 m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
111 m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
112 m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
113
114 const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
115
116 m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
117 m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
118
119 m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
120 m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
121
122 m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
123
124 m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
125 m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
126 const char* additionalMacros = "";
127 //const char* srcFileNameForCaching="";
128
129 cl_int pErrNum;
130 const char* batchKernelSource = batchingKernelsCL;
131 const char* batchKernelNewSource = batchingKernelsNewCL;
132 const char* solverSetupSource = solverSetupCL;
133 const char* solverSetup2Source = solverSetup2CL;
134 const char* solveContactSource = solveContactCL;
135 const char* solveFrictionSource = solveFrictionCL;
136
137 {
138 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
139 b3Assert(solveContactProg);
140
141 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
142 b3Assert(solveFrictionProg);
143
144 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
145
146 b3Assert(solverSetup2Prog);
147
148 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
149 b3Assert(solverSetupProg);
150
151 m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
152 b3Assert(m_data->m_solveFrictionKernel);
153
154 m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
155 b3Assert(m_data->m_solveContactKernel);
156
157 m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
158 b3Assert(m_data->m_solveSingleContactKernel);
159
160 m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
161 b3Assert(m_data->m_solveSingleFrictionKernel);
162
163 m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
164 b3Assert(m_data->m_contactToConstraintKernel);
165
166 m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
167 b3Assert(m_data->m_setSortDataKernel);
168
169 m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
170 b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
171
172 m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
173 b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
174
175 m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
176 b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
177
178 m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
179 b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
180
181 m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
182 b3Assert(m_data->m_reorderContactKernel);
183
184 m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
185 b3Assert(m_data->m_copyConstraintKernel);
186 }
187
188 {
189 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
190 b3Assert(batchingProg);
191
192 m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
193 b3Assert(m_data->m_batchingKernel);
194 }
195
196 {
197 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
198 b3Assert(batchingNewProg);
199
200 m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
201 b3Assert(m_data->m_batchingKernelNew);
202 }
203 }
204
~b3GpuPgsContactSolver()205 b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
206 {
207 delete m_data->m_batchSizesGpu;
208 delete m_data->m_bodyBufferGPU;
209 delete m_data->m_inertiaBufferGPU;
210 delete m_data->m_pBufContactOutGPU;
211 delete m_data->m_pBufContactOutGPUCopy;
212 delete m_data->m_contactKeyValues;
213
214 delete m_data->m_contactCGPU;
215 delete m_data->m_numConstraints;
216 delete m_data->m_offsets;
217 delete m_data->m_sortDataBuffer;
218 delete m_data->m_contactBuffer;
219
220 delete m_data->m_sort32;
221 delete m_data->m_scan;
222 delete m_data->m_search;
223 delete m_data->m_solverGPU;
224
225 clReleaseKernel(m_data->m_batchingKernel);
226 clReleaseKernel(m_data->m_batchingKernelNew);
227 clReleaseKernel(m_data->m_solveSingleContactKernel);
228 clReleaseKernel(m_data->m_solveSingleFrictionKernel);
229 clReleaseKernel(m_data->m_solveContactKernel);
230 clReleaseKernel(m_data->m_solveFrictionKernel);
231
232 clReleaseKernel(m_data->m_contactToConstraintKernel);
233 clReleaseKernel(m_data->m_setSortDataKernel);
234 clReleaseKernel(m_data->m_reorderContactKernel);
235 clReleaseKernel(m_data->m_copyConstraintKernel);
236
237 clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
238 clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
239 clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
240 clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
241
242 delete m_data;
243 }
244
/// Settings bundle built by solveContacts before batching and solving.
struct b3ConstraintCfg
{
	/// @param dt Time step stored in m_dt.
	///
	/// Every member gets a defined value. m_enableParallelSolve and
	/// m_batchCellSize used to be left indeterminate; they now start at the
	/// values solveContacts assigns (true and 6), so a default-constructed
	/// config is safe to read.
	b3ConstraintCfg(float dt = 0.f)
		: m_positionDrift(0.005f),
		  m_positionConstraintCoeff(0.2f),
		  m_dt(dt),
		  m_enableParallelSolve(true),
		  m_batchCellSize(6.f),
		  m_staticIdx(0)
	{
	}

	float m_positionDrift;
	float m_positionConstraintCoeff;
	float m_dt;
	bool m_enableParallelSolve;  // gates the cell-sorting path in solveContacts
	float m_batchCellSize;       // solveContacts uses 1 / m_batchCellSize as the position-to-cell scale
	int m_staticIdx;             // solveContacts stores its static0Index argument here
};
256
solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,int numIterations,const b3AlignedObjectArray<int> * batchSizes)257 void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
258 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //const b3OpenCLArray<int>* gpuBatchSizes)
259 {
260 B3_PROFILE("solveContactConstraintBatchSizes");
261 int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
262 for (int iter = 0; iter < numIterations; iter++)
263 {
264 for (int cellId = 0; cellId < numBatches; cellId++)
265 {
266 int offset = 0;
267 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
268 {
269 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
270 if (!numInBatch)
271 break;
272
273 {
274 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
275 launcher.setBuffer(bodyBuf->getBufferCL());
276 launcher.setBuffer(shapeBuf->getBufferCL());
277 launcher.setBuffer(constraint->getBufferCL());
278 launcher.setConst(cellId);
279 launcher.setConst(offset);
280 launcher.setConst(numInBatch);
281 launcher.launch1D(numInBatch);
282 offset += numInBatch;
283 }
284 }
285 }
286 }
287
288 for (int iter = 0; iter < numIterations; iter++)
289 {
290 for (int cellId = 0; cellId < numBatches; cellId++)
291 {
292 int offset = 0;
293 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
294 {
295 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
296 if (!numInBatch)
297 break;
298
299 {
300 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
301 launcher.setBuffer(bodyBuf->getBufferCL());
302 launcher.setBuffer(shapeBuf->getBufferCL());
303 launcher.setBuffer(constraint->getBufferCL());
304 launcher.setConst(cellId);
305 launcher.setConst(offset);
306 launcher.setConst(numInBatch);
307 launcher.launch1D(numInBatch);
308 offset += numInBatch;
309 }
310 }
311 }
312 }
313 }
314
solveContactConstraint(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,int numIterations,const b3AlignedObjectArray<int> * batchSizes)315 void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
316 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //,const b3OpenCLArray<int>* gpuBatchSizes)
317 {
318 //sort the contacts
319
320 b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
321 {
322 const int nn = B3_SOLVER_N_CELLS;
323
324 cdata.x = 0;
325 cdata.y = maxNumBatches; //250;
326
327 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
328 #ifdef DEBUG_ME
329 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
330 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
331 #endif
332
333 {
334 B3_PROFILE("m_batchSolveKernel iterations");
335 for (int iter = 0; iter < numIterations; iter++)
336 {
337 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
338 {
339 #ifdef DEBUG_ME
340 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
341 gpuDebugInfo.write(debugInfo, numWorkItems);
342 #endif
343
344 cdata.z = ib;
345
346 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
347 #if 1
348
349 b3BufferInfoCL bInfo[] = {
350
351 b3BufferInfoCL(bodyBuf->getBufferCL()),
352 b3BufferInfoCL(shapeBuf->getBufferCL()),
353 b3BufferInfoCL(constraint->getBufferCL()),
354 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
355 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
356 #ifdef DEBUG_ME
357 ,
358 b3BufferInfoCL(&gpuDebugInfo)
359 #endif
360 };
361
362 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
363 launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
364 //launcher.setConst( cdata.x );
365 launcher.setConst(cdata.y);
366 launcher.setConst(cdata.z);
367 b3Int4 nSplit;
368 nSplit.x = B3_SOLVER_N_SPLIT_X;
369 nSplit.y = B3_SOLVER_N_SPLIT_Y;
370 nSplit.z = B3_SOLVER_N_SPLIT_Z;
371
372 launcher.setConst(nSplit);
373 launcher.launch1D(numWorkItems, 64);
374
375 #else
376 const char* fileName = "m_batchSolveKernel.bin";
377 FILE* f = fopen(fileName, "rb");
378 if (f)
379 {
380 int sizeInBytes = 0;
381 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
382 {
383 printf("error, cannot get file size\n");
384 exit(0);
385 }
386
387 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
388 fread(buf, sizeInBytes, 1, f);
389 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
390 int num = *(int*)&buf[serializedBytes];
391
392 launcher.launch1D(num);
393
394 //this clFinish is for testing on errors
395 clFinish(m_queue);
396 }
397
398 #endif
399
400 #ifdef DEBUG_ME
401 clFinish(m_queue);
402 gpuDebugInfo.read(debugInfo, numWorkItems);
403 clFinish(m_queue);
404 for (int i = 0; i < numWorkItems; i++)
405 {
406 if (debugInfo[i].m_valInt2 > 0)
407 {
408 printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
409 }
410
411 if (debugInfo[i].m_valInt3 > 0)
412 {
413 printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
414 }
415 }
416 #endif //DEBUG_ME
417 }
418 }
419
420 clFinish(m_data->m_queue);
421 }
422
423 cdata.x = 1;
424 bool applyFriction = true;
425 if (applyFriction)
426 {
427 B3_PROFILE("m_batchSolveKernel iterations2");
428 for (int iter = 0; iter < numIterations; iter++)
429 {
430 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
431 {
432 cdata.z = ib;
433
434 b3BufferInfoCL bInfo[] = {
435 b3BufferInfoCL(bodyBuf->getBufferCL()),
436 b3BufferInfoCL(shapeBuf->getBufferCL()),
437 b3BufferInfoCL(constraint->getBufferCL()),
438 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
439 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
440 #ifdef DEBUG_ME
441 ,
442 b3BufferInfoCL(&gpuDebugInfo)
443 #endif //DEBUG_ME
444 };
445 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
446 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
447 launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
448 //launcher.setConst( cdata.x );
449 launcher.setConst(cdata.y);
450 launcher.setConst(cdata.z);
451
452 b3Int4 nSplit;
453 nSplit.x = B3_SOLVER_N_SPLIT_X;
454 nSplit.y = B3_SOLVER_N_SPLIT_Y;
455 nSplit.z = B3_SOLVER_N_SPLIT_Z;
456
457 launcher.setConst(nSplit);
458
459 launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
460 }
461 }
462 clFinish(m_data->m_queue);
463 }
464 #ifdef DEBUG_ME
465 delete[] debugInfo;
466 #endif //DEBUG_ME
467 }
468 }
469
sortfnc(const b3SortData & a,const b3SortData & b)470 static bool sortfnc(const b3SortData& a, const b3SortData& b)
471 {
472 return (a.m_key < b.m_key);
473 }
474
b3ContactCmp(const b3Contact4 & p,const b3Contact4 & q)475 static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
476 {
477 return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
478 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
479 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
480 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
481 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
482 }
483
// Compile-time selection of how SetSortDataCPU derives a contact's sort key:
// from the body position (spatial batching) or from a grid lookup table.
#define USE_SPATIAL_BATCHING 1
#define USE_4x4_GRID 1

// The lookup tables are only needed by the table-based path. Test the macro's
// value rather than whether it is defined, to match the `#if
// USE_SPATIAL_BATCHING` test in SetSortDataCPU. Otherwise setting the macro to
// 0 would select the table path and leave the tables out.
#if !USE_SPATIAL_BATCHING
static const int gridTable4x4[] =
	{
		0, 1, 17, 16,
		1, 2, 18, 19,
		17, 18, 32, 3,
		16, 19, 3, 34};
static const int gridTable8x8[] =
	{
		0, 2, 3, 16, 17, 18, 19, 1,
		66, 64, 80, 67, 82, 81, 65, 83,
		131, 144, 128, 130, 147, 129, 145, 146,
		208, 195, 194, 192, 193, 211, 210, 209,
		21, 22, 23, 5, 4, 6, 7, 20,
		86, 85, 69, 87, 70, 68, 84, 71,
		151, 133, 149, 150, 135, 148, 132, 134,
		197, 27, 214, 213, 212, 199, 198, 196

};

#endif
508
SetSortDataCPU(b3Contact4 * gContact,b3RigidBodyData * gBodies,b3SortData * gSortDataOut,int nContacts,float scale,const b3Int4 & nSplit,int staticIdx)509 void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
510 {
511 for (int gIdx = 0; gIdx < nContacts; gIdx++)
512 {
513 if (gIdx < nContacts)
514 {
515 int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
516 int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
517
518 int aIdx = abs(aPtrAndSignBit);
519 int bIdx = abs(bPtrAndSignBit);
520
521 bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
522
523 #if USE_SPATIAL_BATCHING
524 int idx = (aStatic) ? bIdx : aIdx;
525 b3Vector3 p = gBodies[idx].m_pos;
526 int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
527 int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
528 int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
529
530 int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
531
532 #else //USE_SPATIAL_BATCHING
533 bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
534
535 #if USE_4x4_GRID
536 int aa = aIdx & 3;
537 int bb = bIdx & 3;
538 if (aStatic)
539 aa = bb;
540 if (bStatic)
541 bb = aa;
542
543 int gridIndex = aa + bb * 4;
544 int newIndex = gridTable4x4[gridIndex];
545 #else //USE_4x4_GRID
546 int aa = aIdx & 7;
547 int bb = bIdx & 7;
548 if (aStatic)
549 aa = bb;
550 if (bStatic)
551 bb = aa;
552
553 int gridIndex = aa + bb * 8;
554 int newIndex = gridTable8x8[gridIndex];
555 #endif //USE_4x4_GRID
556 #endif //USE_SPATIAL_BATCHING
557
558 gSortDataOut[gIdx].x = newIndex;
559 gSortDataOut[gIdx].y = gIdx;
560 }
561 else
562 {
563 gSortDataOut[gIdx].x = 0xffffffff;
564 }
565 }
566 }
567
solveContacts(int numBodies,cl_mem bodyBuf,cl_mem inertiaBuf,int numContacts,cl_mem contactBuf,const b3Config & config,int static0Index)568 void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
569 {
570 B3_PROFILE("solveContacts");
571 m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
572 m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
573 m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
574
575 if (optionalSortContactsDeterminism)
576 {
577 if (!gCpuSortContactsDeterminism)
578 {
579 B3_PROFILE("GPU Sort contact constraints (determinism)");
580
581 m_data->m_pBufContactOutGPUCopy->resize(numContacts);
582 m_data->m_contactKeyValues->resize(numContacts);
583
584 m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
585
586 {
587 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
588 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
589 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
590 launcher.setConst(numContacts);
591 launcher.launch1D(numContacts, 64);
592 }
593 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
594 {
595 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
596 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
597 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
598 launcher.setConst(numContacts);
599 launcher.launch1D(numContacts, 64);
600 }
601 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
602 {
603 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
604 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
605 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
606 launcher.setConst(numContacts);
607 launcher.launch1D(numContacts, 64);
608 }
609
610 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
611
612 {
613 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
614 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
615 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
616 launcher.setConst(numContacts);
617 launcher.launch1D(numContacts, 64);
618 }
619
620 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
621
622 {
623 B3_PROFILE("gpu reorderContactKernel (determinism)");
624
625 b3Int4 cdata;
626 cdata.x = numContacts;
627
628 //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
629 // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
630 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
631 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
632 launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
633 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
634 launcher.setConst(cdata);
635 launcher.launch1D(numContacts, 64);
636 }
637 }
638 else
639 {
640 B3_PROFILE("CPU Sort contact constraints (determinism)");
641 b3AlignedObjectArray<b3Contact4> cpuConstraints;
642 m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
643 bool sort = true;
644 if (sort)
645 {
646 cpuConstraints.quickSort(b3ContactCmp);
647
648 for (int i = 0; i < cpuConstraints.size(); i++)
649 {
650 cpuConstraints[i].m_batchIdx = i;
651 }
652 }
653 m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
654 if (m_debugOutput == 100)
655 {
656 for (int i = 0; i < cpuConstraints.size(); i++)
657 {
658 printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
659 }
660 }
661
662 m_debugOutput++;
663 }
664 }
665
666 int nContactOut = m_data->m_pBufContactOutGPU->size();
667
668 bool useSolver = true;
669
670 if (useSolver)
671 {
672 float dt = 1. / 60.;
673 b3ConstraintCfg csCfg(dt);
674 csCfg.m_enableParallelSolve = true;
675 csCfg.m_batchCellSize = 6;
676 csCfg.m_staticIdx = static0Index;
677
678 b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
679
680 void* additionalData = 0; //m_data->m_frictionCGPU;
681 const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
682 b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
683 int nContacts = nContactOut;
684
685 int maxNumBatches = 0;
686
687 if (!gUseLargeBatches)
688 {
689 if (m_data->m_solverGPU->m_contactBuffer2)
690 {
691 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
692 }
693
694 if (m_data->m_solverGPU->m_contactBuffer2 == 0)
695 {
696 m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
697 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
698 }
699
700 //clFinish(m_data->m_queue);
701
702 {
703 B3_PROFILE("batching");
704 //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
705
706 //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
707
708 {
709 //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
710 //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
711
712 const int sortAlignment = 512; // todo. get this out of sort
713 if (csCfg.m_enableParallelSolve)
714 {
715 int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
716
717 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
718 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
719
720 if (!gCpuSetSortData)
721 { // 2. set cell idx
722 B3_PROFILE("GPU set cell idx");
723 struct CB
724 {
725 int m_nContacts;
726 int m_staticIdx;
727 float m_scale;
728 b3Int4 m_nSplit;
729 };
730
731 b3Assert(sortSize % 64 == 0);
732 CB cdata;
733 cdata.m_nContacts = nContacts;
734 cdata.m_staticIdx = csCfg.m_staticIdx;
735 cdata.m_scale = 1.f / csCfg.m_batchCellSize;
736 cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
737 cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
738 cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
739
740 m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
741
742 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
743 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
744 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
745 launcher.setConst(cdata.m_nContacts);
746 launcher.setConst(cdata.m_scale);
747 launcher.setConst(cdata.m_nSplit);
748 launcher.setConst(cdata.m_staticIdx);
749
750 launcher.launch1D(sortSize, 64);
751 }
752 else
753 {
754 m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
755 b3AlignedObjectArray<b3SortData> sortDataCPU;
756 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
757
758 b3AlignedObjectArray<b3Contact4> contactCPU;
759 m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
760 b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
761 bodyBuf->copyToHost(bodiesCPU);
762 float scale = 1.f / csCfg.m_batchCellSize;
763 b3Int4 nSplit;
764 nSplit.x = B3_SOLVER_N_SPLIT_X;
765 nSplit.y = B3_SOLVER_N_SPLIT_Y;
766 nSplit.z = B3_SOLVER_N_SPLIT_Z;
767
768 SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
769
770 m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
771 }
772
773 if (!gCpuRadixSort)
774 { // 3. sort by cell idx
775 B3_PROFILE("gpuRadixSort");
776 //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
777 //int sortBit = 32;
778 //if( n <= 0xffff ) sortBit = 16;
779 //if( n <= 0xff ) sortBit = 8;
780 //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
781 //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
782 b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
783 this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
784 }
785 else
786 {
787 b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
788 b3AlignedObjectArray<b3SortData> hostValues;
789 keyValuesInOut.copyToHost(hostValues);
790 hostValues.quickSort(sortfnc);
791 keyValuesInOut.copyFromHost(hostValues);
792 }
793
794 if (gUseScanHost)
795 {
796 // 4. find entries
797 B3_PROFILE("cpuBoundSearch");
798 b3AlignedObjectArray<unsigned int> countsHost;
799 countsNative->copyToHost(countsHost);
800
801 b3AlignedObjectArray<b3SortData> sortDataHost;
802 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
803
804 //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
805 m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
806
807 countsNative->copyFromHost(countsHost);
808
809 //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
810 // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
811
812 //unsigned int sum;
813 //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
814 b3AlignedObjectArray<unsigned int> offsetsHost;
815 offsetsHost.resize(offsetsNative->size());
816
817 m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS); //,&sum );
818 offsetsNative->copyFromHost(offsetsHost);
819
820 //printf("sum = %d\n",sum);
821 }
822 else
823 {
824 // 4. find entries
825 B3_PROFILE("gpuBoundSearch");
826 m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
827 m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS); //,&sum );
828 }
829
830 if (nContacts)
831 { // 5. sort constraints by cellIdx
832 if (gReorderContactsOnCpu)
833 {
834 B3_PROFILE("cpu m_reorderContactKernel");
835 b3AlignedObjectArray<b3SortData> sortDataHost;
836 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
837 b3AlignedObjectArray<b3Contact4> inContacts;
838 b3AlignedObjectArray<b3Contact4> outContacts;
839 m_data->m_pBufContactOutGPU->copyToHost(inContacts);
840 outContacts.resize(inContacts.size());
841 for (int i = 0; i < nContacts; i++)
842 {
843 int srcIdx = sortDataHost[i].y;
844 outContacts[i] = inContacts[srcIdx];
845 }
846 m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
847
848 /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
849 "{\n"
850 " int nContacts = cb.x;\n"
851 " int gIdx = GET_GLOBAL_IDX;\n"
852 " if( gIdx < nContacts )\n"
853 " {\n"
854 " int srcIdx = sortData[gIdx].y;\n"
855 " out[gIdx] = in[srcIdx];\n"
856 " }\n"
857 "}\n"
858 */
859 }
860 else
861 {
862 B3_PROFILE("gpu m_reorderContactKernel");
863
864 b3Int4 cdata;
865 cdata.x = nContacts;
866
867 b3BufferInfoCL bInfo[] = {
868 b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
869 b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
870
871 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
872 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
873 launcher.setConst(cdata);
874 launcher.launch1D(nContacts, 64);
875 }
876 }
877 }
878 }
879
880 //clFinish(m_data->m_queue);
881
882 // {
883 // b3AlignedObjectArray<unsigned int> histogram;
884 // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
885 // printf(",,,\n");
886 // }
887
888 if (nContacts)
889 {
890 if (gUseCpuCopyConstraints)
891 {
892 for (int i = 0; i < nContacts; i++)
893 {
894 m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
895 // m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
896 // m_data->m_pBufContactOutGPU->getBufferCL()
897 }
898 }
899 else
900 {
901 B3_PROFILE("gpu m_copyConstraintKernel");
902 b3Int4 cdata;
903 cdata.x = nContacts;
904 b3BufferInfoCL bInfo[] = {
905 b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
906 b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
907
908 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
909 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
910 launcher.setConst(cdata);
911 launcher.launch1D(nContacts, 64);
912 //we use the clFinish for proper benchmark/profile
913 clFinish(m_data->m_queue);
914 }
915 }
916
917 // bool compareGPU = false;
918 if (nContacts)
919 {
920 if (!gCpuBatchContacts)
921 {
922 B3_PROFILE("gpu batchContacts");
923 maxNumBatches = 250; //250;
924 m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
925 clFinish(m_data->m_queue);
926 }
927 else
928 {
929 B3_PROFILE("cpu batchContacts");
930 static b3AlignedObjectArray<b3Contact4> cpuContacts;
931 b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
932 {
933 B3_PROFILE("copyToHost");
934 contactsIn->copyToHost(cpuContacts);
935 }
936 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
937 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
938
939 b3AlignedObjectArray<unsigned int> nNativeHost;
940 b3AlignedObjectArray<unsigned int> offsetsNativeHost;
941
942 {
943 B3_PROFILE("countsNative/offsetsNative copyToHost");
944 countsNative->copyToHost(nNativeHost);
945 offsetsNative->copyToHost(offsetsNativeHost);
946 }
947
948 int numNonzeroGrid = 0;
949
950 if (gUseLargeBatches)
951 {
952 m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
953 int totalNumConstraints = cpuContacts.size();
954 //int simdWidth =numBodies+1;//-1;//64;//-1;//32;
955 int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
956 maxNumBatches = b3Max(numBatches, maxNumBatches);
957 static int globalMaxBatch = 0;
958 if (maxNumBatches > globalMaxBatch)
959 {
960 globalMaxBatch = maxNumBatches;
961 b3Printf("maxNumBatches = %d\n", maxNumBatches);
962 }
963 }
964 else
965 {
966 m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
967 B3_PROFILE("cpu batch grid");
968 for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
969 {
970 int n = (nNativeHost)[i];
971 int offset = (offsetsNativeHost)[i];
972 if (n)
973 {
974 numNonzeroGrid++;
975 int simdWidth = numBodies + 1; //-1;//64;//-1;//32;
976 int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]); // on GPU
977 maxNumBatches = b3Max(numBatches, maxNumBatches);
978 static int globalMaxBatch = 0;
979 if (maxNumBatches > globalMaxBatch)
980 {
981 globalMaxBatch = maxNumBatches;
982 b3Printf("maxNumBatches = %d\n", maxNumBatches);
983 }
984 //we use the clFinish for proper benchmark/profile
985 }
986 }
987 //clFinish(m_data->m_queue);
988 }
989 {
990 B3_PROFILE("m_contactBuffer->copyFromHost");
991 m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
992 }
993 }
994 }
995 }
996 }
997
998 //printf("maxNumBatches = %d\n", maxNumBatches);
999
1000 if (gUseLargeBatches)
1001 {
1002 if (nContacts)
1003 {
1004 B3_PROFILE("cpu batchContacts");
1005 static b3AlignedObjectArray<b3Contact4> cpuContacts;
1006 // b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
1007 {
1008 B3_PROFILE("copyToHost");
1009 m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
1010 }
1011 // b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
1012 // b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
1013
1014 // int numNonzeroGrid=0;
1015
1016 {
1017 m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
1018 int totalNumConstraints = cpuContacts.size();
1019 // int simdWidth =numBodies+1;//-1;//64;//-1;//32;
1020 int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
1021 maxNumBatches = b3Max(numBatches, maxNumBatches);
1022 static int globalMaxBatch = 0;
1023 if (maxNumBatches > globalMaxBatch)
1024 {
1025 globalMaxBatch = maxNumBatches;
1026 b3Printf("maxNumBatches = %d\n", maxNumBatches);
1027 }
1028 }
1029 {
1030 B3_PROFILE("m_contactBuffer->copyFromHost");
1031 m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
1032 }
1033 }
1034 }
1035
1036 if (nContacts)
1037 {
1038 B3_PROFILE("gpu convertToConstraints");
1039 m_data->m_solverGPU->convertToConstraints(bodyBuf,
1040 shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
1041 contactConstraintOut,
1042 additionalData, nContacts,
1043 (b3SolverBase::ConstraintCfg&)csCfg);
1044 clFinish(m_data->m_queue);
1045 }
1046
1047 if (1)
1048 {
1049 int numIter = 4;
1050
1051 m_data->m_solverGPU->m_nIterations = numIter; //10
1052 if (!gCpuSolveConstraint)
1053 {
1054 B3_PROFILE("GPU solveContactConstraint");
1055
1056 /*m_data->m_solverGPU->solveContactConstraint(
1057 m_data->m_bodyBufferGPU,
1058 m_data->m_inertiaBufferGPU,
1059 m_data->m_contactCGPU,0,
1060 nContactOut ,
1061 maxNumBatches);
1062 */
1063
1064 //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
1065
1066 if (gUseLargeBatches)
1067 {
1068 solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
1069 m_data->m_inertiaBufferGPU,
1070 m_data->m_contactCGPU, 0,
1071 nContactOut,
1072 maxNumBatches, numIter, &m_data->m_batchSizes);
1073 }
1074 else
1075 {
1076 solveContactConstraint(
1077 m_data->m_bodyBufferGPU,
1078 m_data->m_inertiaBufferGPU,
1079 m_data->m_contactCGPU, 0,
1080 nContactOut,
1081 maxNumBatches, numIter, &m_data->m_batchSizes); //m_data->m_batchSizesGpu);
1082 }
1083 }
1084 else
1085 {
1086 B3_PROFILE("Host solveContactConstraint");
1087
1088 m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
1089 }
1090 }
1091
1092 #if 0
1093 if (0)
1094 {
1095 B3_PROFILE("read body velocities back to CPU");
1096 //read body updated linear/angular velocities back to CPU
1097 m_data->m_bodyBufferGPU->read(
1098 m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
1099 adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
1100 }
1101 #endif
1102 }
1103 }
1104
batchContacts(b3OpenCLArray<b3Contact4> * contacts,int nContacts,b3OpenCLArray<unsigned int> * n,b3OpenCLArray<unsigned int> * offsets,int staticIdx)1105 void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
1106 {
1107 }
1108
1109 b3AlignedObjectArray<unsigned int> idxBuffer;
1110 b3AlignedObjectArray<b3SortData> sortData;
1111 b3AlignedObjectArray<b3Contact4> old;
1112
sortConstraintByBatch(b3Contact4 * cs,int n,int simdWidth,int staticIdx,int numBodies)1113 inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
1114 {
1115 B3_PROFILE("sortConstraintByBatch");
1116 int numIter = 0;
1117
1118 sortData.resize(n);
1119 idxBuffer.resize(n);
1120 old.resize(n);
1121
1122 unsigned int* idxSrc = &idxBuffer[0];
1123 unsigned int* idxDst = &idxBuffer[0];
1124 int nIdxSrc, nIdxDst;
1125
1126 const int N_FLG = 256;
1127 const int FLG_MASK = N_FLG - 1;
1128 unsigned int flg[N_FLG / 32];
1129 #if defined(_DEBUG)
1130 for (int i = 0; i < n; i++)
1131 cs[i].getBatchIdx() = -1;
1132 #endif
1133 for (int i = 0; i < n; i++)
1134 idxSrc[i] = i;
1135 nIdxSrc = n;
1136
1137 int batchIdx = 0;
1138
1139 {
1140 B3_PROFILE("cpu batch innerloop");
1141 while (nIdxSrc)
1142 {
1143 numIter++;
1144 nIdxDst = 0;
1145 int nCurrentBatch = 0;
1146
1147 // clear flag
1148 for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1149
1150 for (int i = 0; i < nIdxSrc; i++)
1151 {
1152 int idx = idxSrc[i];
1153
1154 b3Assert(idx < n);
1155 // check if it can go
1156 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1157 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1158
1159 int bodyA = abs(bodyAS);
1160 int bodyB = abs(bodyBS);
1161
1162 int aIdx = bodyA & FLG_MASK;
1163 int bIdx = bodyB & FLG_MASK;
1164
1165 unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
1166 unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
1167
1168 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1169 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1170
1171 //use inv_mass!
1172 aUnavailable = !aIsStatic ? aUnavailable : 0; //
1173 bUnavailable = !bIsStatic ? bUnavailable : 0;
1174
1175 if (aUnavailable == 0 && bUnavailable == 0) // ok
1176 {
1177 if (!aIsStatic)
1178 flg[aIdx / 32] |= (1 << (aIdx & 31));
1179 if (!bIsStatic)
1180 flg[bIdx / 32] |= (1 << (bIdx & 31));
1181
1182 cs[idx].getBatchIdx() = batchIdx;
1183 sortData[idx].m_key = batchIdx;
1184 sortData[idx].m_value = idx;
1185
1186 {
1187 nCurrentBatch++;
1188 if (nCurrentBatch == simdWidth)
1189 {
1190 nCurrentBatch = 0;
1191 for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1192 }
1193 }
1194 }
1195 else
1196 {
1197 idxDst[nIdxDst++] = idx;
1198 }
1199 }
1200 b3Swap(idxSrc, idxDst);
1201 b3Swap(nIdxSrc, nIdxDst);
1202 batchIdx++;
1203 }
1204 }
1205 {
1206 B3_PROFILE("quickSort");
1207 sortData.quickSort(sortfnc);
1208 }
1209
1210 {
1211 B3_PROFILE("reorder");
1212 // reorder
1213
1214 memcpy(&old[0], cs, sizeof(b3Contact4) * n);
1215 for (int i = 0; i < n; i++)
1216 {
1217 int idx = sortData[i].m_value;
1218 cs[i] = old[idx];
1219 }
1220 }
1221
1222 #if defined(_DEBUG)
1223 // debugPrintf( "nBatches: %d\n", batchIdx );
1224 for (int i = 0; i < n; i++)
1225 {
1226 b3Assert(cs[i].getBatchIdx() != -1);
1227 }
1228 #endif
1229 return batchIdx;
1230 }
1231
1232 b3AlignedObjectArray<int> bodyUsed2;
1233
sortConstraintByBatch2(b3Contact4 * cs,int numConstraints,int simdWidth,int staticIdx,int numBodies)1234 inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
1235 {
1236 B3_PROFILE("sortConstraintByBatch2");
1237
1238 bodyUsed2.resize(2 * simdWidth);
1239
1240 for (int q = 0; q < 2 * simdWidth; q++)
1241 bodyUsed2[q] = 0;
1242
1243 int curBodyUsed = 0;
1244
1245 int numIter = 0;
1246
1247 m_data->m_sortData.resize(numConstraints);
1248 m_data->m_idxBuffer.resize(numConstraints);
1249 m_data->m_old.resize(numConstraints);
1250
1251 unsigned int* idxSrc = &m_data->m_idxBuffer[0];
1252
1253 #if defined(_DEBUG)
1254 for (int i = 0; i < numConstraints; i++)
1255 cs[i].getBatchIdx() = -1;
1256 #endif
1257 for (int i = 0; i < numConstraints; i++)
1258 idxSrc[i] = i;
1259
1260 int numValidConstraints = 0;
1261 // int unprocessedConstraintIndex = 0;
1262
1263 int batchIdx = 0;
1264
1265 {
1266 B3_PROFILE("cpu batch innerloop");
1267
1268 while (numValidConstraints < numConstraints)
1269 {
1270 numIter++;
1271 int nCurrentBatch = 0;
1272 // clear flag
1273 for (int i = 0; i < curBodyUsed; i++)
1274 bodyUsed2[i] = 0;
1275 curBodyUsed = 0;
1276
1277 for (int i = numValidConstraints; i < numConstraints; i++)
1278 {
1279 int idx = idxSrc[i];
1280 b3Assert(idx < numConstraints);
1281 // check if it can go
1282 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1283 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1284 int bodyA = abs(bodyAS);
1285 int bodyB = abs(bodyBS);
1286 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1287 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1288 int aUnavailable = 0;
1289 int bUnavailable = 0;
1290 if (!aIsStatic)
1291 {
1292 for (int j = 0; j < curBodyUsed; j++)
1293 {
1294 if (bodyA == bodyUsed2[j])
1295 {
1296 aUnavailable = 1;
1297 break;
1298 }
1299 }
1300 }
1301 if (!aUnavailable)
1302 if (!bIsStatic)
1303 {
1304 for (int j = 0; j < curBodyUsed; j++)
1305 {
1306 if (bodyB == bodyUsed2[j])
1307 {
1308 bUnavailable = 1;
1309 break;
1310 }
1311 }
1312 }
1313
1314 if (aUnavailable == 0 && bUnavailable == 0) // ok
1315 {
1316 if (!aIsStatic)
1317 {
1318 bodyUsed2[curBodyUsed++] = bodyA;
1319 }
1320 if (!bIsStatic)
1321 {
1322 bodyUsed2[curBodyUsed++] = bodyB;
1323 }
1324
1325 cs[idx].getBatchIdx() = batchIdx;
1326 m_data->m_sortData[idx].m_key = batchIdx;
1327 m_data->m_sortData[idx].m_value = idx;
1328
1329 if (i != numValidConstraints)
1330 {
1331 b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
1332 }
1333
1334 numValidConstraints++;
1335 {
1336 nCurrentBatch++;
1337 if (nCurrentBatch == simdWidth)
1338 {
1339 nCurrentBatch = 0;
1340 for (int i = 0; i < curBodyUsed; i++)
1341 bodyUsed2[i] = 0;
1342
1343 curBodyUsed = 0;
1344 }
1345 }
1346 }
1347 }
1348
1349 batchIdx++;
1350 }
1351 }
1352 {
1353 B3_PROFILE("quickSort");
1354 //m_data->m_sortData.quickSort(sortfnc);
1355 }
1356
1357 {
1358 B3_PROFILE("reorder");
1359 // reorder
1360
1361 memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
1362
1363 for (int i = 0; i < numConstraints; i++)
1364 {
1365 b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
1366 int idx = m_data->m_sortData[idxSrc[i]].m_value;
1367 cs[i] = m_data->m_old[idx];
1368 }
1369 }
1370
1371 #if defined(_DEBUG)
1372 // debugPrintf( "nBatches: %d\n", batchIdx );
1373 for (int i = 0; i < numConstraints; i++)
1374 {
1375 b3Assert(cs[i].getBatchIdx() != -1);
1376 }
1377 #endif
1378
1379 return batchIdx;
1380 }
1381
1382 b3AlignedObjectArray<int> bodyUsed;
1383 b3AlignedObjectArray<int> curUsed;
1384
sortConstraintByBatch3(b3Contact4 * cs,int numConstraints,int simdWidth,int staticIdx,int numBodies,int * batchSizes)1385 inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
1386 {
1387 B3_PROFILE("sortConstraintByBatch3");
1388
1389 static int maxSwaps = 0;
1390 int numSwaps = 0;
1391
1392 curUsed.resize(2 * simdWidth);
1393
1394 static int maxNumConstraints = 0;
1395 if (maxNumConstraints < numConstraints)
1396 {
1397 maxNumConstraints = numConstraints;
1398 //printf("maxNumConstraints = %d\n",maxNumConstraints );
1399 }
1400
1401 int numUsedArray = numBodies / 32 + 1;
1402 bodyUsed.resize(numUsedArray);
1403
1404 for (int q = 0; q < numUsedArray; q++)
1405 bodyUsed[q] = 0;
1406
1407 int curBodyUsed = 0;
1408
1409 int numIter = 0;
1410
1411 m_data->m_sortData.resize(0);
1412 m_data->m_idxBuffer.resize(0);
1413 m_data->m_old.resize(0);
1414
1415 #if defined(_DEBUG)
1416 for (int i = 0; i < numConstraints; i++)
1417 cs[i].getBatchIdx() = -1;
1418 #endif
1419
1420 int numValidConstraints = 0;
1421 // int unprocessedConstraintIndex = 0;
1422
1423 int batchIdx = 0;
1424
1425 {
1426 B3_PROFILE("cpu batch innerloop");
1427
1428 while (numValidConstraints < numConstraints)
1429 {
1430 numIter++;
1431 int nCurrentBatch = 0;
1432 batchSizes[batchIdx] = 0;
1433
1434 // clear flag
1435 for (int i = 0; i < curBodyUsed; i++)
1436 bodyUsed[curUsed[i] / 32] = 0;
1437
1438 curBodyUsed = 0;
1439
1440 for (int i = numValidConstraints; i < numConstraints; i++)
1441 {
1442 int idx = i;
1443 b3Assert(idx < numConstraints);
1444 // check if it can go
1445 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1446 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1447 int bodyA = abs(bodyAS);
1448 int bodyB = abs(bodyBS);
1449 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1450 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1451 int aUnavailable = 0;
1452 int bUnavailable = 0;
1453 if (!aIsStatic)
1454 {
1455 aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
1456 }
1457 if (!aUnavailable)
1458 if (!bIsStatic)
1459 {
1460 bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
1461 }
1462
1463 if (aUnavailable == 0 && bUnavailable == 0) // ok
1464 {
1465 if (!aIsStatic)
1466 {
1467 bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
1468 curUsed[curBodyUsed++] = bodyA;
1469 }
1470 if (!bIsStatic)
1471 {
1472 bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
1473 curUsed[curBodyUsed++] = bodyB;
1474 }
1475
1476 cs[idx].getBatchIdx() = batchIdx;
1477
1478 if (i != numValidConstraints)
1479 {
1480 b3Swap(cs[i], cs[numValidConstraints]);
1481 numSwaps++;
1482 }
1483
1484 numValidConstraints++;
1485 {
1486 nCurrentBatch++;
1487 if (nCurrentBatch == simdWidth)
1488 {
1489 batchSizes[batchIdx] += simdWidth;
1490 nCurrentBatch = 0;
1491 for (int i = 0; i < curBodyUsed; i++)
1492 bodyUsed[curUsed[i] / 32] = 0;
1493 curBodyUsed = 0;
1494 }
1495 }
1496 }
1497 }
1498
1499 if (batchIdx >= B3_MAX_NUM_BATCHES)
1500 {
1501 b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
1502 b3Assert(0);
1503 break;
1504 }
1505
1506 batchSizes[batchIdx] += nCurrentBatch;
1507
1508 batchIdx++;
1509 }
1510 }
1511
1512 #if defined(_DEBUG)
1513 // debugPrintf( "nBatches: %d\n", batchIdx );
1514 for (int i = 0; i < numConstraints; i++)
1515 {
1516 b3Assert(cs[i].getBatchIdx() != -1);
1517 }
1518 #endif
1519
1520 batchSizes[batchIdx] = 0;
1521
1522 if (maxSwaps < numSwaps)
1523 {
1524 maxSwaps = numSwaps;
1525 //printf("maxSwaps = %d\n", maxSwaps);
1526 }
1527
1528 return batchIdx;
1529 }
1530