1 /*
2 Copyright (c) 2012 Advanced Micro Devices, Inc.
3
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 //Originally written by Takahiro Harada
15
16 #include "b3Solver.h"
17
/// useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments.
/// NOTE(review): the flag is not read in the part of the file visible here; confirm where it is consumed.
bool useNewBatchingKernel = true;
/// When true, b3Solver::convertToConstraints copies bodies, contacts and shapes to the host
/// and converts the contacts in a CPU loop.
bool gConvertConstraintOnCpu = false;

// Source-file paths of the OpenCL programs. They are passed as the last argument of
// b3OpenCLUtils::compileCLProgramFromString, next to the embedded source strings.
#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
28
29 #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
30
31 #include "kernels/solverSetup.h"
32 #include "kernels/solverSetup2.h"
33
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36
37 #include "kernels/batchingKernels.h"
38 #include "kernels/batchingKernelsNew.h"
39
40 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
41 #include "Bullet3Common/b3Vector3.h"
42
/// Per-work-item debug record: 16 integer slots followed by 4 float slots.
/// In this file it is only touched inside DEBUG_ME sections of
/// b3Solver::solveContactConstraint, where an array with one entry per work
/// item is zeroed, written to the device, read back after the launch, and
/// m_valInt2 / m_valInt3 are printed when positive.
/// NOTE(review): the field order presumably has to match a struct on the
/// kernel side - confirm before reordering.
struct SolverDebugInfo
{
	int m_valInt0;
	int m_valInt1;
	int m_valInt2;  // printed by the DEBUG_ME read-back when > 0
	int m_valInt3;  // printed by the DEBUG_ME read-back when > 0

	int m_valInt4;
	int m_valInt5;
	int m_valInt6;
	int m_valInt7;

	int m_valInt8;
	int m_valInt9;
	int m_valInt10;
	int m_valInt11;

	int m_valInt12;
	int m_valInt13;
	int m_valInt14;
	int m_valInt15;

	float m_val0;
	float m_val1;
	float m_val2;
	float m_val3;
};
70
/// Namespace-like class that only declares ParallelSolveData.
/// NOTE(review): nothing in the visible part of this file references it.
class SolverDeviceInl
{
public:
	/// Pair of device-array pointers with the same names and element type as
	/// b3Solver's m_numConstraints / m_offsets members.
	struct ParallelSolveData
	{
		b3OpenCLArray<unsigned int>* m_numConstraints;
		b3OpenCLArray<unsigned int>* m_offsets;
	};
};
80
b3Solver(cl_context ctx,cl_device_id device,cl_command_queue queue,int pairCapacity)81 b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
82 : m_context(ctx),
83 m_device(device),
84 m_queue(queue),
85 m_batchSizes(ctx, queue),
86 m_nIterations(4)
87 {
88 m_sort32 = new b3RadixSort32CL(ctx, device, queue);
89 m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
90 m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
91
92 const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
93
94 m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
95 m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
96
97 m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
98 m_numConstraints->resize(B3_SOLVER_N_CELLS);
99
100 m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
101 m_offsets->resize(B3_SOLVER_N_CELLS);
102 const char* additionalMacros = "";
103 // const char* srcFileNameForCaching="";
104
105 cl_int pErrNum;
106 const char* batchKernelSource = batchingKernelsCL;
107 const char* batchKernelNewSource = batchingKernelsNewCL;
108
109 const char* solverSetupSource = solverSetupCL;
110 const char* solverSetup2Source = solverSetup2CL;
111 const char* solveContactSource = solveContactCL;
112 const char* solveFrictionSource = solveFrictionCL;
113
114 {
115 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
116 b3Assert(solveContactProg);
117
118 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
119 b3Assert(solveFrictionProg);
120
121 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
122 b3Assert(solverSetup2Prog);
123
124 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
125 b3Assert(solverSetupProg);
126
127 m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
128 b3Assert(m_solveFrictionKernel);
129
130 m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
131 b3Assert(m_solveContactKernel);
132
133 m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
134 b3Assert(m_contactToConstraintKernel);
135
136 m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
137 b3Assert(m_setSortDataKernel);
138
139 m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
140 b3Assert(m_reorderContactKernel);
141
142 m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
143 b3Assert(m_copyConstraintKernel);
144 }
145
146 {
147 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
148 //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
149 b3Assert(batchingProg);
150
151 m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
152 b3Assert(m_batchingKernel);
153 }
154 {
155 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
156 b3Assert(batchingNewProg);
157
158 m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
159 //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
160 b3Assert(m_batchingKernelNew);
161 }
162 }
163
~b3Solver()164 b3Solver::~b3Solver()
165 {
166 delete m_offsets;
167 delete m_numConstraints;
168 delete m_sortDataBuffer;
169 delete m_contactBuffer2;
170
171 delete m_sort32;
172 delete m_scan;
173 delete m_search;
174
175 clReleaseKernel(m_batchingKernel);
176 clReleaseKernel(m_batchingKernelNew);
177
178 clReleaseKernel(m_solveContactKernel);
179 clReleaseKernel(m_solveFrictionKernel);
180
181 clReleaseKernel(m_contactToConstraintKernel);
182 clReleaseKernel(m_setSortDataKernel);
183 clReleaseKernel(m_reorderContactKernel);
184 clReleaseKernel(m_copyConstraintKernel);
185 }
186
187 template <bool JACOBI>
solveContact(b3GpuConstraint4 & cs,const b3Vector3 & posA,b3Vector3 & linVelA,b3Vector3 & angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,b3Vector3 & linVelB,b3Vector3 & angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4])188 static __inline void solveContact(b3GpuConstraint4& cs,
189 const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
190 const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
191 float maxRambdaDt[4], float minRambdaDt[4])
192 {
193 b3Vector3 dLinVelA;
194 dLinVelA.setZero();
195 b3Vector3 dAngVelA;
196 dAngVelA.setZero();
197 b3Vector3 dLinVelB;
198 dLinVelB.setZero();
199 b3Vector3 dAngVelB;
200 dAngVelB.setZero();
201
202 for (int ic = 0; ic < 4; ic++)
203 {
204 // dont necessary because this makes change to 0
205 if (cs.m_jacCoeffInv[ic] == 0.f) continue;
206
207 {
208 b3Vector3 angular0, angular1, linear;
209 b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
210 b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
211 setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
212
213 float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
214 linVelA, angVelA, linVelB, angVelB) +
215 cs.m_b[ic];
216 rambdaDt *= cs.m_jacCoeffInv[ic];
217
218 {
219 float prevSum = cs.m_appliedRambdaDt[ic];
220 float updated = prevSum;
221 updated += rambdaDt;
222 updated = b3Max(updated, minRambdaDt[ic]);
223 updated = b3Min(updated, maxRambdaDt[ic]);
224 rambdaDt = updated - prevSum;
225 cs.m_appliedRambdaDt[ic] = updated;
226 }
227
228 b3Vector3 linImp0 = invMassA * linear * rambdaDt;
229 b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
230 b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
231 b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
232 #ifdef _WIN32
233 b3Assert(_finite(linImp0.getX()));
234 b3Assert(_finite(linImp1.getX()));
235 #endif
236 if (JACOBI)
237 {
238 dLinVelA += linImp0;
239 dAngVelA += angImp0;
240 dLinVelB += linImp1;
241 dAngVelB += angImp1;
242 }
243 else
244 {
245 linVelA += linImp0;
246 angVelA += angImp0;
247 linVelB += linImp1;
248 angVelB += angImp1;
249 }
250 }
251 }
252
253 if (JACOBI)
254 {
255 linVelA += dLinVelA;
256 angVelA += dAngVelA;
257 linVelB += dLinVelB;
258 angVelB += dAngVelB;
259 }
260 }
261
solveFriction(b3GpuConstraint4 & cs,const b3Vector3 & posA,b3Vector3 & linVelA,b3Vector3 & angVelA,float invMassA,const b3Matrix3x3 & invInertiaA,const b3Vector3 & posB,b3Vector3 & linVelB,b3Vector3 & angVelB,float invMassB,const b3Matrix3x3 & invInertiaB,float maxRambdaDt[4],float minRambdaDt[4])262 static __inline void solveFriction(b3GpuConstraint4& cs,
263 const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
264 const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
265 float maxRambdaDt[4], float minRambdaDt[4])
266 {
267 if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
268 const b3Vector3& center = (const b3Vector3&)cs.m_center;
269
270 b3Vector3 n = -(const b3Vector3&)cs.m_linear;
271
272 b3Vector3 tangent[2];
273 #if 1
274 b3PlaneSpace1(n, tangent[0], tangent[1]);
275 #else
276 b3Vector3 r = cs.m_worldPos[0] - center;
277 tangent[0] = cross3(n, r);
278 tangent[1] = cross3(tangent[0], n);
279 tangent[0] = normalize3(tangent[0]);
280 tangent[1] = normalize3(tangent[1]);
281 #endif
282
283 b3Vector3 angular0, angular1, linear;
284 b3Vector3 r0 = center - posA;
285 b3Vector3 r1 = center - posB;
286 for (int i = 0; i < 2; i++)
287 {
288 setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
289 float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
290 linVelA, angVelA, linVelB, angVelB);
291 rambdaDt *= cs.m_fJacCoeffInv[i];
292
293 {
294 float prevSum = cs.m_fAppliedRambdaDt[i];
295 float updated = prevSum;
296 updated += rambdaDt;
297 updated = b3Max(updated, minRambdaDt[i]);
298 updated = b3Min(updated, maxRambdaDt[i]);
299 rambdaDt = updated - prevSum;
300 cs.m_fAppliedRambdaDt[i] = updated;
301 }
302
303 b3Vector3 linImp0 = invMassA * linear * rambdaDt;
304 b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
305 b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
306 b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
307 #ifdef _WIN32
308 b3Assert(_finite(linImp0.getX()));
309 b3Assert(_finite(linImp1.getX()));
310 #endif
311 linVelA += linImp0;
312 angVelA += angImp0;
313 linVelB += linImp1;
314 angVelB += angImp1;
315 }
316
317 { // angular damping for point constraint
318 b3Vector3 ab = (posB - posA).normalized();
319 b3Vector3 ac = (center - posA).normalized();
320 if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
321 {
322 float angNA = b3Dot(n, angVelA);
323 float angNB = b3Dot(n, angVelB);
324
325 angVelA -= (angNA * 0.1f) * n;
326 angVelB -= (angNB * 0.1f) * n;
327 }
328 }
329 }
330 /*
331 b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
332 b3AlignedObjectArray<b3InertiaData>& m_shapes;
333 b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
334 b3AlignedObjectArray<int>* m_batchSizes;
335 int m_cellIndex;
336 int m_curWgidx;
337 int m_start;
338 int m_nConstraints;
339 bool m_solveFriction;
340 int m_maxNumBatches;
341 */
342
343 struct SolveTask // : public ThreadPool::Task
344 {
SolveTaskSolveTask345 SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
346 int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
347 : m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
348 {
349 }
350
getTypeSolveTask351 unsigned short int getType() { return 0; }
352
runSolveTask353 void run(int tIdx)
354 {
355 int offset = 0;
356 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
357 {
358 int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
359 if (!numInBatch)
360 break;
361
362 for (int jj = 0; jj < numInBatch; jj++)
363 {
364 int i = m_start + offset + jj;
365 int batchId = m_constraints[i].m_batchIdx;
366 b3Assert(batchId == ii);
367 float frictionCoeff = m_constraints[i].getFrictionCoeff();
368 int aIdx = (int)m_constraints[i].m_bodyA;
369 int bIdx = (int)m_constraints[i].m_bodyB;
370 // int localBatch = m_constraints[i].m_batchIdx;
371 b3RigidBodyData& bodyA = m_bodies[aIdx];
372 b3RigidBodyData& bodyB = m_bodies[bIdx];
373
374 if (!m_solveFriction)
375 {
376 float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
377 float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
378
379 solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
380 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
381 maxRambdaDt, minRambdaDt);
382 }
383 else
384 {
385 float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
386 float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
387 float sum = 0;
388 for (int j = 0; j < 4; j++)
389 {
390 sum += m_constraints[i].m_appliedRambdaDt[j];
391 }
392 frictionCoeff = 0.7f;
393 for (int j = 0; j < 4; j++)
394 {
395 maxRambdaDt[j] = frictionCoeff * sum;
396 minRambdaDt[j] = -maxRambdaDt[j];
397 }
398 solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
399 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
400 maxRambdaDt, minRambdaDt);
401 }
402 }
403 offset += numInBatch;
404 }
405 /* for (int bb=0;bb<m_maxNumBatches;bb++)
406 {
407 //for(int ic=m_nConstraints-1; ic>=0; ic--)
408 for(int ic=0; ic<m_nConstraints; ic++)
409 {
410
411 int i = m_start + ic;
412 if (m_constraints[i].m_batchIdx != bb)
413 continue;
414
415 float frictionCoeff = m_constraints[i].getFrictionCoeff();
416 int aIdx = (int)m_constraints[i].m_bodyA;
417 int bIdx = (int)m_constraints[i].m_bodyB;
418 int localBatch = m_constraints[i].m_batchIdx;
419 b3RigidBodyData& bodyA = m_bodies[aIdx];
420 b3RigidBodyData& bodyB = m_bodies[bIdx];
421
422 if( !m_solveFriction )
423 {
424 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
425 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
426
427 solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
428 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
429 maxRambdaDt, minRambdaDt );
430 }
431 else
432 {
433 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
434 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
435 float sum = 0;
436 for(int j=0; j<4; j++)
437 {
438 sum +=m_constraints[i].m_appliedRambdaDt[j];
439 }
440 frictionCoeff = 0.7f;
441 for(int j=0; j<4; j++)
442 {
443 maxRambdaDt[j] = frictionCoeff*sum;
444 minRambdaDt[j] = -maxRambdaDt[j];
445 }
446 solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
447 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
448 maxRambdaDt, minRambdaDt );
449
450 }
451 }
452 }
453 */
454 }
455
456 b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
457 b3AlignedObjectArray<b3InertiaData>& m_shapes;
458 b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
459 b3AlignedObjectArray<int>* m_batchSizes;
460 int m_cellIndex;
461 int m_curWgidx;
462 int m_start;
463 int m_nConstraints;
464 bool m_solveFriction;
465 int m_maxNumBatches;
466 };
467
solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData> * bodyBuf,b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches,b3AlignedObjectArray<int> * batchSizes)468 void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
469 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
470 {
471 #if 0
472 {
473 int nSplitX = B3_SOLVER_N_SPLIT_X;
474 int nSplitY = B3_SOLVER_N_SPLIT_Y;
475 int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
476 for (int z=0;z<4;z++)
477 {
478 for (int y=0;y<4;y++)
479 {
480 for (int x=0;x<4;x++)
481 {
482 int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
483 // printf("newIndex=%d\n",newIndex);
484
485 int zIdx = newIndex/(nSplitX*nSplitY);
486 int remain = newIndex%(nSplitX*nSplitY);
487 int yIdx = remain/nSplitX;
488 int xIdx = remain%nSplitX;
489 // printf("newIndex=%d\n",newIndex);
490 }
491 }
492 }
493
494 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
495 for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
496 {
497 for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
498 {
499 int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
500 int remain= (wgIdx%((nSplitX*nSplitY)/4));
501 int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
502 int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
503
504 /*int zIdx = newIndex/(nSplitX*nSplitY);
505 int remain = newIndex%(nSplitX*nSplitY);
506 int yIdx = remain/nSplitX;
507 int xIdx = remain%nSplitX;
508 */
509 int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
510 // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
511 }
512 }
513 }
514 #endif
515
516 b3AlignedObjectArray<b3RigidBodyData> bodyNative;
517 bodyBuf->copyToHost(bodyNative);
518 b3AlignedObjectArray<b3InertiaData> shapeNative;
519 shapeBuf->copyToHost(shapeNative);
520 b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
521 constraint->copyToHost(constraintNative);
522
523 b3AlignedObjectArray<unsigned int> numConstraintsHost;
524 m_numConstraints->copyToHost(numConstraintsHost);
525
526 //printf("------------------------\n");
527 b3AlignedObjectArray<unsigned int> offsetsHost;
528 m_offsets->copyToHost(offsetsHost);
529 static int frame = 0;
530 bool useBatches = true;
531 if (useBatches)
532 {
533 for (int iter = 0; iter < m_nIterations; iter++)
534 {
535 for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
536 {
537 int nSplitX = B3_SOLVER_N_SPLIT_X;
538 int nSplitY = B3_SOLVER_N_SPLIT_Y;
539 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
540 //printf("cell Batch %d\n",cellBatch);
541 b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
542 for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
543 {
544 usedBodies[i].resize(0);
545 }
546
547 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
548 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
549 {
550 int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
551 int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
552 int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
553 int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
554 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
555
556 if (numConstraintsHost[cellIdx] == 0)
557 continue;
558
559 //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
560 //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
561 if (zIdx)
562 {
563 //printf("?\n");
564 }
565
566 if (iter == 0)
567 {
568 //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
569 //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
570 }
571 const int start = offsetsHost[cellIdx];
572 int numConstraintsInCell = numConstraintsHost[cellIdx];
573 // const int end = start + numConstraintsInCell;
574
575 SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
576 task.m_solveFriction = false;
577 task.run(0);
578 }
579 }
580 }
581
582 for (int iter = 0; iter < m_nIterations; iter++)
583 {
584 for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
585 {
586 int nSplitX = B3_SOLVER_N_SPLIT_X;
587 int nSplitY = B3_SOLVER_N_SPLIT_Y;
588
589 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
590
591 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
592 {
593 int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
594 int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
595 int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
596 int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
597
598 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
599
600 if (numConstraintsHost[cellIdx] == 0)
601 continue;
602
603 //printf("yIdx=%d\n",yIdx);
604
605 const int start = offsetsHost[cellIdx];
606 int numConstraintsInCell = numConstraintsHost[cellIdx];
607 // const int end = start + numConstraintsInCell;
608
609 SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
610 task.m_solveFriction = true;
611 task.run(0);
612 }
613 }
614 }
615 }
616 else
617 {
618 for (int iter = 0; iter < m_nIterations; iter++)
619 {
620 SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
621 task.m_solveFriction = false;
622 task.run(0);
623 }
624
625 for (int iter = 0; iter < m_nIterations; iter++)
626 {
627 SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
628 task.m_solveFriction = true;
629 task.run(0);
630 }
631 }
632
633 bodyBuf->copyFromHost(bodyNative);
634 shapeBuf->copyFromHost(shapeNative);
635 constraint->copyFromHost(constraintNative);
636 frame++;
637 }
638
checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,b3OpenCLArray<unsigned int> * m_numConstraints,b3OpenCLArray<unsigned int> * m_offsets,int batchId)639 void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
640 const b3OpenCLArray<b3InertiaData>* shapeBuf,
641 b3OpenCLArray<b3GpuConstraint4>* constraint,
642 b3OpenCLArray<unsigned int>* m_numConstraints,
643 b3OpenCLArray<unsigned int>* m_offsets,
644 int batchId)
645 {
646 // b3BufferInfoCL( m_numConstraints->getBufferCL() ),
647 // b3BufferInfoCL( m_offsets->getBufferCL() )
648
649 int cellBatch = batchId;
650 const int nn = B3_SOLVER_N_CELLS;
651 // int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
652
653 b3AlignedObjectArray<unsigned int> gN;
654 m_numConstraints->copyToHost(gN);
655 b3AlignedObjectArray<unsigned int> gOffsets;
656 m_offsets->copyToHost(gOffsets);
657 int nSplitX = B3_SOLVER_N_SPLIT_X;
658 int nSplitY = B3_SOLVER_N_SPLIT_Y;
659
660 // int bIdx = batchId;
661
662 b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
663 constraint->copyToHost(cpuConstraints);
664
665 printf("batch = %d\n", batchId);
666
667 int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
668 b3AlignedObjectArray<int> usedBodies;
669
670 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
671 {
672 printf("wgIdx = %d ", wgIdx);
673
674 int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
675 int remain = wgIdx % ((nSplitX * nSplitY));
676 int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
677 int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
678
679 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
680 printf("cellIdx=%d\n", cellIdx);
681 if (gN[cellIdx] == 0)
682 continue;
683
684 const int start = gOffsets[cellIdx];
685 const int end = start + gN[cellIdx];
686
687 for (int c = start; c < end; c++)
688 {
689 b3GpuConstraint4& constraint = cpuConstraints[c];
690 //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
691 if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
692 {
693 printf("error?\n");
694 }
695 if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
696 {
697 printf("error?\n");
698 }
699 }
700
701 for (int c = start; c < end; c++)
702 {
703 b3GpuConstraint4& constraint = cpuConstraints[c];
704 usedBodies.push_back(constraint.m_bodyA);
705 usedBodies.push_back(constraint.m_bodyB);
706 }
707 }
708 }
709
/// When true, b3Solver::solveContactConstraint calls checkConstraintBatch for every cell batch before launching the contact kernel.
static bool verify = false;
711
solveContactConstraint(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3GpuConstraint4> * constraint,void * additionalData,int n,int maxNumBatches)712 void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
713 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
714 {
715 b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
716 {
717 const int nn = B3_SOLVER_N_CELLS;
718
719 cdata.x = 0;
720 cdata.y = maxNumBatches; //250;
721
722 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
723 #ifdef DEBUG_ME
724 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
725 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
726 #endif
727
728 {
729 B3_PROFILE("m_batchSolveKernel iterations");
730 for (int iter = 0; iter < m_nIterations; iter++)
731 {
732 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
733 {
734 if (verify)
735 {
736 checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
737 }
738
739 #ifdef DEBUG_ME
740 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
741 gpuDebugInfo.write(debugInfo, numWorkItems);
742 #endif
743
744 cdata.z = ib;
745
746 b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
747 #if 1
748
749 b3BufferInfoCL bInfo[] = {
750
751 b3BufferInfoCL(bodyBuf->getBufferCL()),
752 b3BufferInfoCL(shapeBuf->getBufferCL()),
753 b3BufferInfoCL(constraint->getBufferCL()),
754 b3BufferInfoCL(m_numConstraints->getBufferCL()),
755 b3BufferInfoCL(m_offsets->getBufferCL())
756 #ifdef DEBUG_ME
757 ,
758 b3BufferInfoCL(&gpuDebugInfo)
759 #endif
760 };
761
762 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
763 //launcher.setConst( cdata.x );
764 launcher.setConst(cdata.y);
765 launcher.setConst(cdata.z);
766 b3Int4 nSplit;
767 nSplit.x = B3_SOLVER_N_SPLIT_X;
768 nSplit.y = B3_SOLVER_N_SPLIT_Y;
769 nSplit.z = B3_SOLVER_N_SPLIT_Z;
770
771 launcher.setConst(nSplit);
772 launcher.launch1D(numWorkItems, 64);
773
774 #else
775 const char* fileName = "m_batchSolveKernel.bin";
776 FILE* f = fopen(fileName, "rb");
777 if (f)
778 {
779 int sizeInBytes = 0;
780 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
781 {
782 printf("error, cannot get file size\n");
783 exit(0);
784 }
785
786 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
787 fread(buf, sizeInBytes, 1, f);
788 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
789 int num = *(int*)&buf[serializedBytes];
790
791 launcher.launch1D(num);
792
793 //this clFinish is for testing on errors
794 clFinish(m_queue);
795 }
796
797 #endif
798
799 #ifdef DEBUG_ME
800 clFinish(m_queue);
801 gpuDebugInfo.read(debugInfo, numWorkItems);
802 clFinish(m_queue);
803 for (int i = 0; i < numWorkItems; i++)
804 {
805 if (debugInfo[i].m_valInt2 > 0)
806 {
807 printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
808 }
809
810 if (debugInfo[i].m_valInt3 > 0)
811 {
812 printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
813 }
814 }
815 #endif //DEBUG_ME
816 }
817 }
818
819 clFinish(m_queue);
820 }
821
822 cdata.x = 1;
823 bool applyFriction = true;
824 if (applyFriction)
825 {
826 B3_PROFILE("m_batchSolveKernel iterations2");
827 for (int iter = 0; iter < m_nIterations; iter++)
828 {
829 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
830 {
831 cdata.z = ib;
832
833 b3BufferInfoCL bInfo[] = {
834 b3BufferInfoCL(bodyBuf->getBufferCL()),
835 b3BufferInfoCL(shapeBuf->getBufferCL()),
836 b3BufferInfoCL(constraint->getBufferCL()),
837 b3BufferInfoCL(m_numConstraints->getBufferCL()),
838 b3BufferInfoCL(m_offsets->getBufferCL())
839 #ifdef DEBUG_ME
840 ,
841 b3BufferInfoCL(&gpuDebugInfo)
842 #endif //DEBUG_ME
843 };
844 b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
845 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
846 //launcher.setConst( cdata.x );
847 launcher.setConst(cdata.y);
848 launcher.setConst(cdata.z);
849 b3Int4 nSplit;
850 nSplit.x = B3_SOLVER_N_SPLIT_X;
851 nSplit.y = B3_SOLVER_N_SPLIT_Y;
852 nSplit.z = B3_SOLVER_N_SPLIT_Z;
853
854 launcher.setConst(nSplit);
855
856 launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
857 }
858 }
859 clFinish(m_queue);
860 }
861 #ifdef DEBUG_ME
862 delete[] debugInfo;
863 #endif //DEBUG_ME
864 }
865 }
866
convertToConstraints(const b3OpenCLArray<b3RigidBodyData> * bodyBuf,const b3OpenCLArray<b3InertiaData> * shapeBuf,b3OpenCLArray<b3Contact4> * contactsIn,b3OpenCLArray<b3GpuConstraint4> * contactCOut,void * additionalData,int nContacts,const ConstraintCfg & cfg)867 void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
868 const b3OpenCLArray<b3InertiaData>* shapeBuf,
869 b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
870 int nContacts, const ConstraintCfg& cfg)
871 {
872 // b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
873 contactCOut->resize(nContacts);
874 struct CB
875 {
876 int m_nContacts;
877 float m_dt;
878 float m_positionDrift;
879 float m_positionConstraintCoeff;
880 };
881
882 {
883 CB cdata;
884 cdata.m_nContacts = nContacts;
885 cdata.m_dt = cfg.m_dt;
886 cdata.m_positionDrift = cfg.m_positionDrift;
887 cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
888
889 if (gConvertConstraintOnCpu)
890 {
891 b3AlignedObjectArray<b3RigidBodyData> gBodies;
892 bodyBuf->copyToHost(gBodies);
893
894 b3AlignedObjectArray<b3Contact4> gContact;
895 contactsIn->copyToHost(gContact);
896
897 b3AlignedObjectArray<b3InertiaData> gShapes;
898 shapeBuf->copyToHost(gShapes);
899
900 b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
901 gConstraintOut.resize(nContacts);
902
903 B3_PROFILE("cpu contactToConstraintKernel");
904 for (int gIdx = 0; gIdx < nContacts; gIdx++)
905 {
906 int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
907 int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
908
909 b3Float4 posA = gBodies[aIdx].m_pos;
910 b3Float4 linVelA = gBodies[aIdx].m_linVel;
911 b3Float4 angVelA = gBodies[aIdx].m_angVel;
912 float invMassA = gBodies[aIdx].m_invMass;
913 b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
914
915 b3Float4 posB = gBodies[bIdx].m_pos;
916 b3Float4 linVelB = gBodies[bIdx].m_linVel;
917 b3Float4 angVelB = gBodies[bIdx].m_angVel;
918 float invMassB = gBodies[bIdx].m_invMass;
919 b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
920
921 b3ContactConstraint4_t cs;
922
923 setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
924 &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
925 &cs);
926
927 cs.m_batchIdx = gContact[gIdx].m_batchIdx;
928
929 gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
930 }
931
932 contactCOut->copyFromHost(gConstraintOut);
933 }
934 else
935 {
936 B3_PROFILE("gpu m_contactToConstraintKernel");
937
938 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
939 b3BufferInfoCL(contactCOut->getBufferCL())};
940 b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
941 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
942 //launcher.setConst( cdata );
943
944 launcher.setConst(cdata.m_nContacts);
945 launcher.setConst(cdata.m_dt);
946 launcher.setConst(cdata.m_positionDrift);
947 launcher.setConst(cdata.m_positionConstraintCoeff);
948
949 launcher.launch1D(nContacts, 64);
950 clFinish(m_queue);
951 }
952 }
953 }
954
955 /*
956 void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
957 b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
958 int nContacts, const b3Solver::ConstraintCfg& cfg )
959 {
960
961
962
963 const int sortAlignment = 512; // todo. get this out of sort
964 if( cfg.m_enableParallelSolve )
965 {
966
967
968 int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
969
970 b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
971 b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
972
973 { // 2. set cell idx
974 struct CB
975 {
976 int m_nContacts;
977 int m_staticIdx;
978 float m_scale;
979 int m_nSplit;
980 };
981
982 b3Assert( sortSize%64 == 0 );
983 CB cdata;
984 cdata.m_nContacts = nContacts;
985 cdata.m_staticIdx = cfg.m_staticIdx;
986 cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
987 cdata.m_nSplit = B3_SOLVER_N_SPLIT;
988
989
990 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
991 b3LauncherCL launcher( m_queue, m_setSortDataKernel );
992 launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
993 launcher.setConst( cdata );
994 launcher.launch1D( sortSize, 64 );
995 }
996
997 { // 3. sort by cell idx
998 int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
999 int sortBit = 32;
1000 //if( n <= 0xffff ) sortBit = 16;
1001 //if( n <= 0xff ) sortBit = 8;
1002 m_sort32->execute(*m_sortDataBuffer,sortSize);
1003 }
1004 { // 4. find entries
1005 m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
1006
1007 m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
1008 }
1009
1010 { // 5. sort constraints by cellIdx
1011 // todo. preallocate this
1012 // b3Assert( contactsIn->getType() == TYPE_HOST );
1013 // b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer
1014
1015 {
1016
1017
1018 b3Int4 cdata; cdata.x = nContacts;
1019 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
1020 b3LauncherCL launcher( m_queue, m_reorderContactKernel );
1021 launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
1022 launcher.setConst( cdata );
1023 launcher.launch1D( nContacts, 64 );
1024 }
1025 // BufferUtils::unmap<true>( out, contactsIn, nContacts );
1026 }
1027 }
1028
1029
1030 }
1031
1032 */
batchContacts(b3OpenCLArray<b3Contact4> * contacts,int nContacts,b3OpenCLArray<unsigned int> * nNative,b3OpenCLArray<unsigned int> * offsetsNative,int staticIdx)1033 void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
1034 {
1035 int numWorkItems = 64 * B3_SOLVER_N_CELLS;
1036 {
1037 B3_PROFILE("batch generation");
1038
1039 b3Int4 cdata;
1040 cdata.x = nContacts;
1041 cdata.y = 0;
1042 cdata.z = staticIdx;
1043
1044 #ifdef BATCH_DEBUG
1045 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
1046 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
1047 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
1048 gpuDebugInfo.write(debugInfo, numWorkItems);
1049 #endif
1050
1051 #if 0
1052 b3BufferInfoCL bInfo[] = {
1053 b3BufferInfoCL( contacts->getBufferCL() ),
1054 b3BufferInfoCL( m_contactBuffer2->getBufferCL()),
1055 b3BufferInfoCL( nNative->getBufferCL() ),
1056 b3BufferInfoCL( offsetsNative->getBufferCL() ),
1057 #ifdef BATCH_DEBUG
1058 , b3BufferInfoCL(&gpuDebugInfo)
1059 #endif
1060 };
1061 #endif
1062
1063 {
1064 m_batchSizes.resize(nNative->size());
1065 B3_PROFILE("batchingKernel");
1066 //b3LauncherCL launcher( m_queue, m_batchingKernel);
1067 cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
1068
1069 b3LauncherCL launcher(m_queue, k, "*batchingKernel");
1070 if (!useNewBatchingKernel)
1071 {
1072 launcher.setBuffer(contacts->getBufferCL());
1073 }
1074 launcher.setBuffer(m_contactBuffer2->getBufferCL());
1075 launcher.setBuffer(nNative->getBufferCL());
1076 launcher.setBuffer(offsetsNative->getBufferCL());
1077
1078 launcher.setBuffer(m_batchSizes.getBufferCL());
1079
1080 //launcher.setConst( cdata );
1081 launcher.setConst(staticIdx);
1082
1083 launcher.launch1D(numWorkItems, 64);
1084 //clFinish(m_queue);
1085 //b3AlignedObjectArray<int> batchSizesCPU;
1086 //m_batchSizes.copyToHost(batchSizesCPU);
1087 //printf(".\n");
1088 }
1089
1090 #ifdef BATCH_DEBUG
1091 aaaa
1092 b3Contact4* hostContacts = new b3Contact4[nContacts];
1093 m_contactBuffer->read(hostContacts, nContacts);
1094 clFinish(m_queue);
1095
1096 gpuDebugInfo.read(debugInfo, numWorkItems);
1097 clFinish(m_queue);
1098
1099 for (int i = 0; i < numWorkItems; i++)
1100 {
1101 if (debugInfo[i].m_valInt1 > 0)
1102 {
1103 printf("catch\n");
1104 }
1105 if (debugInfo[i].m_valInt2 > 0)
1106 {
1107 printf("catch22\n");
1108 }
1109
1110 if (debugInfo[i].m_valInt3 > 0)
1111 {
1112 printf("catch666\n");
1113 }
1114
1115 if (debugInfo[i].m_valInt4 > 0)
1116 {
1117 printf("catch777\n");
1118 }
1119 }
1120 delete[] debugInfo;
1121 #endif //BATCH_DEBUG
1122 }
1123
1124 // copy buffer to buffer
1125 //b3Assert(m_contactBuffer->size()==nContacts);
1126 //contacts->copyFromOpenCLArray( *m_contactBuffer);
1127 //clFinish(m_queue);//needed?
1128 }
1129