1 // =============================================================================
2 // PROJECT CHRONO - http://projectchrono.org
3 //
4 // Copyright (c) 2019 projectchrono.org
5 // All rights reserved.
6 //
7 // Use of this source code is governed by a BSD-style license that can be found
8 // in the LICENSE file at the top level of the distribution and at
9 // http://projectchrono.org/license-chrono.txt.
10 //
11 // =============================================================================
12 // Authors: Conlain Kelly, Nic Olsen, Dan Negrut, Ruochun Zhang
13 // =============================================================================
14 
15 #include "chrono_gpu/cuda/ChGpu_SMC_trimesh.cuh"
16 #include "chrono_gpu/cuda/ChGpu_SMC.cuh"
17 #include "chrono_gpu/physics/ChSystemGpuMesh_impl.h"
18 #include "chrono_gpu/utils/ChGpuUtilities.h"
19 #include <math_constants.h>
20 
21 namespace chrono {
22 namespace gpu {
23 
runTriangleBroadphase()24 __host__ void ChSystemGpuMesh_impl::runTriangleBroadphase() {
25     METRICS_PRINTF("Resetting broadphase info!\n");
26 
27     unsigned int numTriangles = meshSoup->nTrianglesInSoup;
28     unsigned int nblocks = (numTriangles + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
29     determineCountOfSDsTouchedByEachTriangle<<<nblocks, CUDA_THREADS_PER_BLOCK>>>(
30         meshSoup, Triangle_NumSDsTouching.data(), gran_params, tri_params);
31 
32     gpuErrchk(cudaDeviceSynchronize());
33     gpuErrchk(cudaPeekAtLastError());
34 
35     // do prefix scan
36     size_t temp_storage_bytes = 0;
37     unsigned int* out_ptr = Triangle_SDsCompositeOffsets.data();
38     unsigned int* in_ptr = Triangle_NumSDsTouching.data();
39 
40     // copy data into the tmp array
41     gpuErrchk(cudaMemcpy(out_ptr, in_ptr, numTriangles * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
42     cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, in_ptr, out_ptr, numTriangles);
43     gpuErrchk(cudaDeviceSynchronize());
44 
45     // get pointer to device memory; this memory block will be used internally by CUB, for scratch area
46     void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
47     // Run exclusive prefix sum
48     cub::DeviceScan::ExclusiveSum(d_scratch_space, temp_storage_bytes, in_ptr, out_ptr, numTriangles);
49     gpuErrchk(cudaDeviceSynchronize());
50     unsigned int numOfTriangleTouchingSD_instances;  // total number of instances in which a triangle touches an SD
51     numOfTriangleTouchingSD_instances = out_ptr[numTriangles - 1] + in_ptr[numTriangles - 1];
52 
53     // resize, if need be, several dummy vectors that handle in managed memory
54     SDsTouchedByEachTriangle_composite_out.resize(numOfTriangleTouchingSD_instances, NULL_CHGPU_ID);
55     SDsTouchedByEachTriangle_composite.resize(numOfTriangleTouchingSD_instances, NULL_CHGPU_ID);
56     TriangleIDS_ByMultiplicity_out.resize(numOfTriangleTouchingSD_instances, NULL_CHGPU_ID);
57     TriangleIDS_ByMultiplicity.resize(numOfTriangleTouchingSD_instances, NULL_CHGPU_ID);
58 
59     // sort key-value where the key is SD id, value is triangle ID in composite array
60     storeSDsTouchedByEachTriangle<<<nblocks, CUDA_THREADS_PER_BLOCK>>>(
61         meshSoup, Triangle_NumSDsTouching.data(), Triangle_SDsCompositeOffsets.data(),
62         SDsTouchedByEachTriangle_composite.data(), TriangleIDS_ByMultiplicity.data(), gran_params, tri_params);
63     gpuErrchk(cudaDeviceSynchronize());
64 
65     unsigned int* d_keys_in = SDsTouchedByEachTriangle_composite.data();
66     unsigned int* d_keys_out = SDsTouchedByEachTriangle_composite_out.data();
67     unsigned int* d_values_in = TriangleIDS_ByMultiplicity.data();
68     unsigned int* d_values_out = TriangleIDS_ByMultiplicity_out.data();
69 
70     // Run CUB sorting operation, key-value type.
71     // Key: the ID of the SD.
72     // Value: the ID of the triangle that touches the "Key" SD.
73     // The outcome of the sort operation will look like this:
74     // SDs:       23 23 23 89 89  89  89  107 107 107 etc.
75     // Triangle:   5  9 17 43 67 108 221    6  12 298 etc.
76     // First, determine temporary device storage requirements; pass null, CUB tells us what it needs
77     cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out,
78                                     numOfTriangleTouchingSD_instances);
79     gpuErrchk(cudaDeviceSynchronize());
80 
81     // get pointer to device memory; this memory block will be used internally by CUB
82     d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
83     cub::DeviceRadixSort::SortPairs(d_scratch_space, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in,
84                                     d_values_out, numOfTriangleTouchingSD_instances);
85     gpuErrchk(cudaDeviceSynchronize());
86 
87     // We started with SDs touching a triangle; we just flipped this through the key-value sort. That is, we now
88     // know the collection of triangles that touch each SD; SD by SD.
89     SD_trianglesInEachSD_composite.resize(TriangleIDS_ByMultiplicity_out.size());
90     gpuErrchk(cudaDeviceSynchronize());
91     gpuErrchk(cudaMemcpy(SD_trianglesInEachSD_composite.data(), TriangleIDS_ByMultiplicity_out.data(),
92                          numOfTriangleTouchingSD_instances * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
93 
94     // The CUB encode operation below will tell us what SDs are actually touched by triangles, and how many triangles
95     // touch each SD.
96     //
97     // "d_in" is SDsTouchedByEachTriangle_composite_out; contains the IDs of the SDs that have triangles in them; if an
98     // SD is touched by "t" triangles, it'll show up "t" times in this array
99     unsigned int* d_in = d_keys_out;
100     // d_unique_out stores a list of *unique* SDs with the following property: each SD in this list has at least one
101     // triangle touching it. In terms of memory, this is pretty wasteful since it's unilkely that all SDs are touched by
102     // at least one triangle; perhaps revisit later.
103     unsigned int* d_unique_out =
104         (unsigned int*)stateOfSolver_resources.pDeviceMemoryScratchSpace(nSDs * sizeof(unsigned int));
105     // squatting on SD_TrianglesCompositeOffsets device vector; its size is nSDs. Works in tandem with d_unique_out.
106     // If d_unique_out[4]=72, d_counts_out[4] says how many triangles touch SD 72.
107     unsigned int* d_counts_out = SD_TrianglesCompositeOffsets.data();
108     // squatting on TriangleIDS_ByMultiplicity, which is not needed anymore. We're using only *one* entry in this array.
109     // Output value represents the number of SDs that have at last one triangle touching the SD
110     unsigned int* d_num_runs_out = Triangle_SDsCompositeOffsets.data();
111     // dry run, figure out the number of bytes that will be used in the actual run
112     cub::DeviceRunLengthEncode::Encode(NULL, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out,
113                                        numOfTriangleTouchingSD_instances);
114     gpuErrchk(cudaDeviceSynchronize());
115 
116     d_scratch_space = TriangleIDS_ByMultiplicity.data();
117     // Run the actual encoding operation
118     cub::DeviceRunLengthEncode::Encode(d_scratch_space, temp_storage_bytes, d_in, d_unique_out, d_counts_out,
119                                        d_num_runs_out, numOfTriangleTouchingSD_instances);
120     gpuErrchk(cudaDeviceSynchronize());
121 
122     // SD_numTrianglesTouching contains only zeros
123     // compute offsets in SD_trianglesInEachSD_composite and also counts for how many triangles touch each SD.
124     // Start by zeroing out, it's important since not all entries will be touched in
125     gpuErrchk(cudaMemset(SD_numTrianglesTouching.data(), 0, nSDs * sizeof(unsigned int)));
126     nblocks = ((*d_num_runs_out) + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
127     if (nblocks > 0) {
128         finalizeSD_numTrianglesTouching<<<nblocks, CUDA_THREADS_PER_BLOCK>>>(d_unique_out, d_counts_out, d_num_runs_out,
129                                                                              SD_numTrianglesTouching.data());
130         gpuErrchk(cudaDeviceSynchronize());
131     }
132 
133     // Now assert that no SD has over max amount of triangles
134     // If there is one, exit graciously
135     in_ptr = SD_numTrianglesTouching.data();
136     // Just borrow the first element of SD_TrianglesCompositeOffsets to store the max value
137     unsigned int* maxTriCount = SD_TrianglesCompositeOffsets.data();
138     cub::DeviceReduce::Max(NULL, temp_storage_bytes, in_ptr, maxTriCount, nSDs);
139     gpuErrchk(cudaDeviceSynchronize());
140     d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
141     cub::DeviceReduce::Max(d_scratch_space, temp_storage_bytes, in_ptr, maxTriCount, nSDs);
142     gpuErrchk(cudaDeviceSynchronize());
143     if (*maxTriCount > MAX_TRIANGLE_COUNT_PER_SD)
144         CHGPU_ERROR("ERROR! %u triangles are found in one of the SDs! The max allowance is %u.\n", *maxTriCount,
145                     MAX_TRIANGLE_COUNT_PER_SD);
146 
147     // Lastly, we need to do a CUB prefix scan to get the offsets in the big composite array
148     in_ptr = SD_numTrianglesTouching.data();
149     out_ptr = SD_TrianglesCompositeOffsets.data();
150     cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, in_ptr, out_ptr, nSDs);
151     gpuErrchk(cudaDeviceSynchronize());
152     d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
153     // Run CUB exclusive prefix sum
154     cub::DeviceScan::ExclusiveSum(d_scratch_space, temp_storage_bytes, in_ptr, out_ptr, nSDs);
155     gpuErrchk(cudaDeviceSynchronize());
156 }
157 
interactionGranMat_TriangleSoup_matBased(ChSystemGpuMesh_impl::TriangleSoupPtr d_triangleSoup,ChSystemGpu_impl::GranSphereDataPtr sphere_data,const unsigned int * SD_trianglesInEachSD_composite,const unsigned int * SD_numTrianglesTouching,const unsigned int * SD_TrianglesCompositeOffsets,ChSystemGpu_impl::GranParamsPtr gran_params,ChSystemGpuMesh_impl::MeshParamsPtr mesh_params,unsigned int triangleFamilyHistmapOffset)158 __global__ void interactionGranMat_TriangleSoup_matBased(ChSystemGpuMesh_impl::TriangleSoupPtr d_triangleSoup,
159                                                          ChSystemGpu_impl::GranSphereDataPtr sphere_data,
160                                                          const unsigned int* SD_trianglesInEachSD_composite,
161                                                          const unsigned int* SD_numTrianglesTouching,
162                                                          const unsigned int* SD_TrianglesCompositeOffsets,
163                                                          ChSystemGpu_impl::GranParamsPtr gran_params,
164                                                          ChSystemGpuMesh_impl::MeshParamsPtr mesh_params,
165                                                          unsigned int triangleFamilyHistmapOffset) {
166     __shared__ unsigned int triangleIDs[MAX_TRIANGLE_COUNT_PER_SD];  //!< global ID of the triangles touching this SD
167 
168     __shared__ int3 sphere_pos_local[MAX_COUNT_OF_SPHERES_PER_SD];  //!< local coordinate of the sphere
169 
170     __shared__ float3 sphere_vel[MAX_COUNT_OF_SPHERES_PER_SD];
171 
172     // TODO figure out how we can do this better with no friction
173     __shared__ float3 omega[MAX_COUNT_OF_SPHERES_PER_SD];
174 
175     __shared__ double3 node1[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 1st node of the triangle
176     __shared__ double3 node2[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 2nd node of the triangle
177     __shared__ double3 node3[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 3rd node of the triangle
178 
179     // define an alias first
180     unsigned int thisSD = blockIdx.x;
181 
182     if (SD_numTrianglesTouching[thisSD] == 0) {
183         return;  // no triangle touches this block's SD
184     }
185     unsigned int spheresTouchingThisSD = sphere_data->SD_NumSpheresTouching[thisSD];
186     if (spheresTouchingThisSD == 0) {
187         return;  // no sphere touches this block's SD
188     }
189 
190     // Getting here means that there are both triangles and DEs in this SD.
191     unsigned int numSDTriangles = SD_numTrianglesTouching[thisSD];
192     unsigned int sphereIDLocal = threadIdx.x;
193     unsigned int sphereIDGlobal = NULL_CHGPU_ID;
194 
195     // Bring in data from global into shmem. Only a subset of threads get to do this.
196     // Note that we're not using shared memory very heavily, so our bandwidth is pretty low
197     if (sphereIDLocal < spheresTouchingThisSD) {
198         size_t SD_composite_offset = sphere_data->SD_SphereCompositeOffsets[thisSD];
199 
200         // TODO standardize this
201         size_t offset_in_composite_Array = SD_composite_offset + sphereIDLocal;
202         sphereIDGlobal = sphere_data->spheres_in_SD_composite[offset_in_composite_Array];
203 
204         sphere_pos_local[sphereIDLocal] =
205             make_int3(sphere_data->sphere_local_pos_X[sphereIDGlobal], sphere_data->sphere_local_pos_Y[sphereIDGlobal],
206                       sphere_data->sphere_local_pos_Z[sphereIDGlobal]);
207 
208         unsigned int sphere_owner_SD = sphere_data->sphere_owner_SDs[sphereIDGlobal];
209         // if this SD doesn't own that sphere, add an offset to account
210         if (sphere_owner_SD != thisSD) {
211             sphere_pos_local[sphereIDLocal] =
212                 sphere_pos_local[sphereIDLocal] + getOffsetFromSDs(thisSD, sphere_owner_SD, gran_params);
213         }
214 
215         if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
216             omega[sphereIDLocal] =
217                 make_float3(sphere_data->sphere_Omega_X[sphereIDGlobal], sphere_data->sphere_Omega_Y[sphereIDGlobal],
218                             sphere_data->sphere_Omega_Z[sphereIDGlobal]);
219         }
220         sphere_vel[sphereIDLocal] =
221             make_float3(sphere_data->pos_X_dt[sphereIDGlobal], sphere_data->pos_Y_dt[sphereIDGlobal],
222                         sphere_data->pos_Z_dt[sphereIDGlobal]);
223     }
224     // Populate the shared memory with mesh triangle data
225     unsigned int tripsToCoverTriangles = (numSDTriangles + blockDim.x - 1) / blockDim.x;
226     unsigned int local_ID = threadIdx.x;
227     for (unsigned int triangTrip = 0; triangTrip < tripsToCoverTriangles; triangTrip++) {
228         if (local_ID < numSDTriangles) {
229             size_t SD_composite_offset = SD_TrianglesCompositeOffsets[thisSD];
230             if (SD_composite_offset == NULL_CHGPU_ID) {
231                 ABORTABORTABORT("Invalid composite offset %lu for SD %u, touching %u triangles\n", NULL_CHGPU_ID,
232                                 thisSD, numSDTriangles);
233             }
234             size_t offset_in_composite_Array = SD_composite_offset + local_ID;
235 
236             unsigned int globalID = SD_trianglesInEachSD_composite[offset_in_composite_Array];
237             triangleIDs[local_ID] = globalID;
238 
239             // Read node positions from global memory into shared memory
240             // NOTE implicit cast from float to double here
241             unsigned int fam = d_triangleSoup->triangleFamily_ID[globalID];
242             node1[local_ID] = apply_frame_transform<double, float3, double3>(
243                 d_triangleSoup->node1[globalID], mesh_params->fam_frame_narrow[fam].pos,
244                 mesh_params->fam_frame_narrow[fam].rot_mat);
245 
246             node2[local_ID] = apply_frame_transform<double, float3, double3>(
247                 d_triangleSoup->node2[globalID], mesh_params->fam_frame_narrow[fam].pos,
248                 mesh_params->fam_frame_narrow[fam].rot_mat);
249 
250             node3[local_ID] = apply_frame_transform<double, float3, double3>(
251                 d_triangleSoup->node3[globalID], mesh_params->fam_frame_narrow[fam].pos,
252                 mesh_params->fam_frame_narrow[fam].rot_mat);
253 
254             convert_pos_UU2SU<double3>(node1[local_ID], gran_params);
255             convert_pos_UU2SU<double3>(node2[local_ID], gran_params);
256             convert_pos_UU2SU<double3>(node3[local_ID], gran_params);
257         }
258         local_ID += blockDim.x;
259     }
260 
261     __syncthreads();  // this call ensures data is in its place in shared memory
262 
263     float3 sphere_force = {0.f, 0.f, 0.f};
264     float3 sphere_AngAcc = {0.f, 0.f, 0.f};
265 
266     if (sphereIDLocal < spheresTouchingThisSD) {
267         // loop over each triangle in the SD and compute the force this sphere (thread) exerts on it
268         for (unsigned int triangleLocalID = 0; triangleLocalID < numSDTriangles; triangleLocalID++) {
269             /// we have a valid sphere and a valid triganle; check if in contact
270             float3 normal;  // Unit normal from pt2 to pt1 (triangle contact point to sphere contact point)
271             float depth;    // Negative in overlap
272             float3 pt1_float;
273 
274             // Transform LRF to GRF
275             const unsigned int fam = d_triangleSoup->triangleFamily_ID[triangleIDs[triangleLocalID]];
276             bool valid_contact = false;
277 
278             // vector from center of mesh body to contact point, assume this can be held in a float
279             float3 fromCenter;
280 
281             {
282                 double3 pt1;  // Contact point on triangle
283                 // NOTE sphere_pos_local is relative to THIS SD, not its owner SD
284                 double3 sphCntr =
285                     int64_t3_to_double3(convertPosLocalToGlobal(thisSD, sphere_pos_local[sphereIDLocal], gran_params));
286                 valid_contact = face_sphere_cd(node1[triangleLocalID], node2[triangleLocalID], node3[triangleLocalID],
287                                                sphCntr, gran_params->sphereRadius_SU, normal, depth, pt1);
288 
289                 valid_contact = valid_contact &&
290                                 SDTripletID(pointSDTriplet(pt1.x, pt1.y, pt1.z, gran_params), gran_params) == thisSD;
291                 pt1_float = make_float3(pt1.x, pt1.y, pt1.z);
292 
293                 double3 meshCenter_double =
294                     make_double3(mesh_params->fam_frame_narrow[fam].pos[0], mesh_params->fam_frame_narrow[fam].pos[1],
295                                  mesh_params->fam_frame_narrow[fam].pos[2]);
296                 convert_pos_UU2SU<double3>(meshCenter_double, gran_params);
297 
298                 double3 fromCenter_double = pt1 - meshCenter_double;
299 
300                 fromCenter = make_float3(fromCenter_double.x, fromCenter_double.y, fromCenter_double.z);
301             }
302 
303             // If there is a collision, add an impulse to the sphere
304             if (valid_contact) {
305                 // TODO contact models
306                 // Use the CD information to compute the force on the grElement
307                 // normal points from triangle to sphere
308                 float3 delta = -depth * normal;
309 
310                 // effective radius is just sphere radius -- assume meshes are locally flat (a safe assumption?)
311                 // float hertz_force_factor = sqrt(abs(depth) / gran_params->sphereRadius_SU);
312 
313                 // helper variables
314                 float sqrt_Rd = sqrt(abs(depth) * gran_params->sphereRadius_SU);
315                 float Sn = 2. * mesh_params->E_eff_s2m_SU * sqrt_Rd;
316 
317                 float loge = (mesh_params->COR_s2m_SU < EPSILON) ? log(EPSILON) : log(mesh_params->COR_s2m_SU);
318                 float beta = loge / sqrt(loge * loge + CUDART_PI_F * CUDART_PI_F);
319 
320                 // effective mass = mass_mesh * mass_sphere / (m_mesh + mass_sphere)
321                 float fam_mass_SU = d_triangleSoup->familyMass_SU[fam];
322                 const float sphere_mass_SU = gran_params->sphere_mass_SU;
323                 float m_eff = sphere_mass_SU * fam_mass_SU / (sphere_mass_SU + fam_mass_SU);
324 
325                 // stiffness and damping coefficient
326                 float kn = (2.0 / 3.0) * Sn;
327                 float gn = 2 * sqrt(5.0 / 6.0) * beta * sqrt(Sn * m_eff);
328                 // relative velocity = v_sphere - v_mesh
329                 float3 v_rel = sphere_vel[sphereIDLocal] - d_triangleSoup->vel[fam];
330 
331                 // assumes pos is the center of mass of the mesh
332                 float3 meshCenter =
333                     make_float3(mesh_params->fam_frame_broad[fam].pos[0], mesh_params->fam_frame_broad[fam].pos[1],
334                                 mesh_params->fam_frame_broad[fam].pos[2]);
335                 convert_pos_UU2SU<float3>(meshCenter, gran_params);
336 
337                 // NOTE depth is negative and normal points from triangle to sphere center
338                 float3 r = pt1_float + normal * (depth / 2) - meshCenter;
339 
340                 // Add angular velocity contribution from mesh
341                 v_rel = v_rel - Cross(d_triangleSoup->omega[fam], r);
342 
343                 // add tangential components if they exist
344                 if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
345                     // Vector from the center of sphere to center of contact volume
346                     float3 r_A = -(gran_params->sphereRadius_SU + depth / 2.f) * normal;
347                     v_rel = v_rel + Cross(omega[sphereIDLocal], r_A);
348                 }
349 
350                 // normal component of relative velocity
351                 float projection = Dot(v_rel, normal);
352 
353                 // tangential component of relative velocity
354                 float3 vrel_t = v_rel - projection * normal;
355 
356                 // normal force magnitude
357                 float forceN_mag = -kn * depth + gn * projection;
358 
359                 float3 force_accum = forceN_mag * normal;
360 
361                 // Compute force updates for adhesion term, opposite the spring term
362                 // NOTE ratio is wrt the weight of a sphere of mass 1
363                 // NOTE the cancelation of two negatives
364                 force_accum = force_accum + gran_params->sphere_mass_SU * mesh_params->adhesionAcc_s2m * delta / depth;
365 
366                 // tangential component
367                 if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
368                     // radius pointing from the contact point to the center of particle
369                     float3 Rc = (gran_params->sphereRadius_SU + depth / 2.f) * normal;
370                     float3 roll_ang_acc = computeRollingAngAcc(
371                         sphere_data, gran_params, mesh_params->rolling_coeff_s2m_SU, mesh_params->spinning_coeff_s2m_SU,
372                         force_accum, omega[sphereIDLocal], d_triangleSoup->omega[fam], Rc);
373 
374                     sphere_AngAcc = sphere_AngAcc + roll_ang_acc;
375 
376                     unsigned int BC_histmap_label = triangleFamilyHistmapOffset + fam;
377 
378                     // compute tangent force
379                     float3 tangent_force = computeFrictionForces_matBased(
380                         gran_params, sphere_data, sphereIDGlobal, BC_histmap_label,
381                         mesh_params->static_friction_coeff_s2m, mesh_params->E_eff_s2m_SU, mesh_params->G_eff_s2m_SU,
382                         sqrt_Rd, beta, force_accum, vrel_t, normal, m_eff);
383 
384                     float force_unit = gran_params->MASS_UNIT * gran_params->LENGTH_UNIT /
385                                        (gran_params->TIME_UNIT * gran_params->TIME_UNIT);
386 
387                     float velocity_unit = gran_params->LENGTH_UNIT / gran_params->TIME_UNIT;
388 
389                     force_accum = force_accum + tangent_force;
390                     sphere_AngAcc =
391                         sphere_AngAcc + Cross(-1.f * normal, tangent_force) / gran_params->sphereInertia_by_r;
392                 }
393 
394                 // Use the CD information to compute the force and torque on the family of this triangle
395                 sphere_force = sphere_force + force_accum;
396 
397                 // Force on the mesh is opposite the force on the sphere
398                 float3 force_total = -1.f * force_accum;
399 
400                 float3 torque = Cross(fromCenter, force_total);
401                 // TODO we could be much smarter about reducing this atomic write
402                 unsigned int fam = d_triangleSoup->triangleFamily_ID[triangleIDs[triangleLocalID]];
403                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 0, force_total.x);
404                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 1, force_total.y);
405                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 2, force_total.z);
406 
407                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 3, torque.x);
408                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 4, torque.y);
409                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 5, torque.z);
410             }
411         }  // end of per-triangle loop
412         // write back sphere forces
413         atomicAdd(sphere_data->sphere_acc_X + sphereIDGlobal, sphere_force.x / gran_params->sphere_mass_SU);
414         atomicAdd(sphere_data->sphere_acc_Y + sphereIDGlobal, sphere_force.y / gran_params->sphere_mass_SU);
415         atomicAdd(sphere_data->sphere_acc_Z + sphereIDGlobal, sphere_force.z / gran_params->sphere_mass_SU);
416 
417         if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
418             // write back torques for later
419             atomicAdd(sphere_data->sphere_ang_acc_X + sphereIDGlobal, sphere_AngAcc.x);
420             atomicAdd(sphere_data->sphere_ang_acc_Y + sphereIDGlobal, sphere_AngAcc.y);
421             atomicAdd(sphere_data->sphere_ang_acc_Z + sphereIDGlobal, sphere_AngAcc.z);
422         }
423     }  // end sphere id check
424 }  // end kernel
425 
426 /// <summary>
427 /// Kernel accounts for the interaction between the granular material and the triangles making up the triangle soup
428 /// </summary>
429 /// <param name="d_triangleSoup">- information about triangle soup (in device mem.)</param>
430 /// <param name="sphere_data">- data structure containing pointers to granular-material related info</param>
431 /// <param name="SD_trianglesInEachSD_composite">- array saying which triangles touch an SD; has information for each
432 /// SD</param> <param name="SD_numTrianglesTouching">- number of triangles touching each SD</param> <param
433 /// name="SD_TrianglesCompositeOffsets">- offsets in the composite array for each SD; where each SD starts storing its
434 /// triangles</param> <param name="gran_params">- parameters associated with the granular material</param> <param
435 /// name="mesh_params">- parameters associated with the triangle soup</param> <param
436 /// name="triangleFamilyHistmapOffset">- offset in the array of friction history (?)</param> <returns></returns>
interactionGranMat_TriangleSoup(ChSystemGpuMesh_impl::TriangleSoupPtr d_triangleSoup,ChSystemGpu_impl::GranSphereDataPtr sphere_data,const unsigned int * SD_trianglesInEachSD_composite,const unsigned int * SD_numTrianglesTouching,const unsigned int * SD_TrianglesCompositeOffsets,ChSystemGpu_impl::GranParamsPtr gran_params,ChSystemGpuMesh_impl::MeshParamsPtr mesh_params,unsigned int triangleFamilyHistmapOffset)437 __global__ void interactionGranMat_TriangleSoup(ChSystemGpuMesh_impl::TriangleSoupPtr d_triangleSoup,
438                                                 ChSystemGpu_impl::GranSphereDataPtr sphere_data,
439                                                 const unsigned int* SD_trianglesInEachSD_composite,
440                                                 const unsigned int* SD_numTrianglesTouching,
441                                                 const unsigned int* SD_TrianglesCompositeOffsets,
442                                                 ChSystemGpu_impl::GranParamsPtr gran_params,
443                                                 ChSystemGpuMesh_impl::MeshParamsPtr mesh_params,
444                                                 unsigned int triangleFamilyHistmapOffset) {
445     __shared__ unsigned int triangleIDs[MAX_TRIANGLE_COUNT_PER_SD];  //!< global ID of the triangles touching this SD
446 
447     __shared__ int3 sphere_pos_local[MAX_COUNT_OF_SPHERES_PER_SD];  //!< local coordinate of the sphere
448 
449     __shared__ float3 sphere_vel[MAX_COUNT_OF_SPHERES_PER_SD];
450 
451     // TODO figure out how we can do this better with no friction
452     __shared__ float3 omega[MAX_COUNT_OF_SPHERES_PER_SD];
453 
454     __shared__ double3 node1[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 1st node of the triangle
455     __shared__ double3 node2[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 2nd node of the triangle
456     __shared__ double3 node3[MAX_TRIANGLE_COUNT_PER_SD];  //!< Coordinates of the 3rd node of the triangle
457 
458     // define an alias first
459     unsigned int thisSD = blockIdx.x;
460 
461     if (SD_numTrianglesTouching[thisSD] == 0) {
462         return;  // no triangle touches this block's SD
463     }
464     unsigned int spheresTouchingThisSD = sphere_data->SD_NumSpheresTouching[thisSD];
465     if (spheresTouchingThisSD == 0) {
466         return;  // no sphere touches this block's SD
467     }
468 
469     // Getting here means that there are both triangles and DEs in this SD.
470     unsigned int numSDTriangles = SD_numTrianglesTouching[thisSD];
471     unsigned int sphereIDLocal = threadIdx.x;
472     unsigned int sphereIDGlobal = NULL_CHGPU_ID;
473 
474     // Bring in data from global into shmem. Only a subset of threads get to do this.
475     // Note that we're not using shared memory very heavily, so our bandwidth is pretty low
476     if (sphereIDLocal < spheresTouchingThisSD) {
477         size_t SD_composite_offset = sphere_data->SD_SphereCompositeOffsets[thisSD];
478 
479         // TODO standardize this
480         size_t offset_in_composite_Array = SD_composite_offset + sphereIDLocal;
481         sphereIDGlobal = sphere_data->spheres_in_SD_composite[offset_in_composite_Array];
482 
483         sphere_pos_local[sphereIDLocal] =
484             make_int3(sphere_data->sphere_local_pos_X[sphereIDGlobal], sphere_data->sphere_local_pos_Y[sphereIDGlobal],
485                       sphere_data->sphere_local_pos_Z[sphereIDGlobal]);
486 
487         unsigned int sphere_owner_SD = sphere_data->sphere_owner_SDs[sphereIDGlobal];
488         // if this SD doesn't own that sphere, add an offset to account
489         if (sphere_owner_SD != thisSD) {
490             sphere_pos_local[sphereIDLocal] =
491                 sphere_pos_local[sphereIDLocal] + getOffsetFromSDs(thisSD, sphere_owner_SD, gran_params);
492         }
493 
494         if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
495             omega[sphereIDLocal] =
496                 make_float3(sphere_data->sphere_Omega_X[sphereIDGlobal], sphere_data->sphere_Omega_Y[sphereIDGlobal],
497                             sphere_data->sphere_Omega_Z[sphereIDGlobal]);
498         }
499         sphere_vel[sphereIDLocal] =
500             make_float3(sphere_data->pos_X_dt[sphereIDGlobal], sphere_data->pos_Y_dt[sphereIDGlobal],
501                         sphere_data->pos_Z_dt[sphereIDGlobal]);
502     }
503     // Populate the shared memory with mesh triangle data
504     unsigned int tripsToCoverTriangles = (numSDTriangles + blockDim.x - 1) / blockDim.x;
505     unsigned int local_ID = threadIdx.x;
506     for (unsigned int triangTrip = 0; triangTrip < tripsToCoverTriangles; triangTrip++) {
507         if (local_ID < numSDTriangles) {
508             size_t SD_composite_offset = SD_TrianglesCompositeOffsets[thisSD];
509             if (SD_composite_offset == NULL_CHGPU_ID) {
510                 ABORTABORTABORT("Invalid composite offset %lu for SD %u, touching %u triangles\n", NULL_CHGPU_ID,
511                                 thisSD, numSDTriangles);
512             }
513             size_t offset_in_composite_Array = SD_composite_offset + local_ID;
514 
515             unsigned int globalID = SD_trianglesInEachSD_composite[offset_in_composite_Array];
516             triangleIDs[local_ID] = globalID;
517 
518             // Read node positions from global memory into shared memory
519             // NOTE implicit cast from float to double here
520             unsigned int fam = d_triangleSoup->triangleFamily_ID[globalID];
521             node1[local_ID] = apply_frame_transform<double, float3, double3>(
522                 d_triangleSoup->node1[globalID], mesh_params->fam_frame_narrow[fam].pos,
523                 mesh_params->fam_frame_narrow[fam].rot_mat);
524 
525             node2[local_ID] = apply_frame_transform<double, float3, double3>(
526                 d_triangleSoup->node2[globalID], mesh_params->fam_frame_narrow[fam].pos,
527                 mesh_params->fam_frame_narrow[fam].rot_mat);
528 
529             node3[local_ID] = apply_frame_transform<double, float3, double3>(
530                 d_triangleSoup->node3[globalID], mesh_params->fam_frame_narrow[fam].pos,
531                 mesh_params->fam_frame_narrow[fam].rot_mat);
532 
533             convert_pos_UU2SU<double3>(node1[local_ID], gran_params);
534             convert_pos_UU2SU<double3>(node2[local_ID], gran_params);
535             convert_pos_UU2SU<double3>(node3[local_ID], gran_params);
536         }
537         local_ID += blockDim.x;
538     }
539 
540     __syncthreads();  // this call ensures data is in its place in shared memory
541 
542     float3 sphere_force = {0.f, 0.f, 0.f};
543     float3 sphere_AngAcc = {0.f, 0.f, 0.f};
544     if (sphereIDLocal < spheresTouchingThisSD) {
545         // loop over each triangle in the SD and compute the force this sphere (thread) exerts on it
546         for (unsigned int triangleLocalID = 0; triangleLocalID < numSDTriangles; triangleLocalID++) {
547             /// we have a valid sphere and a valid triganle; check if in contact
548             float3 normal;  // Unit normal from pt2 to pt1 (triangle contact point to sphere contact point)
549             float depth;    // Negative in overlap
550             float3 pt1_float;
551 
552             // Transform LRF to GRF
553             const unsigned int fam = d_triangleSoup->triangleFamily_ID[triangleIDs[triangleLocalID]];
554             bool valid_contact = false;
555 
556             // vector from center of mesh body to contact point, assume this can be held in a float
557             float3 fromCenter;
558 
559             {
560                 double3 pt1;  // Contact point on triangle
561                 // NOTE sphere_pos_local is relative to THIS SD, not its owner SD
562                 double3 sphCntr =
563                     int64_t3_to_double3(convertPosLocalToGlobal(thisSD, sphere_pos_local[sphereIDLocal], gran_params));
564                 valid_contact = face_sphere_cd(node1[triangleLocalID], node2[triangleLocalID], node3[triangleLocalID],
565                                                sphCntr, gran_params->sphereRadius_SU, normal, depth, pt1);
566 
567                 valid_contact = valid_contact &&
568                                 SDTripletID(pointSDTriplet(pt1.x, pt1.y, pt1.z, gran_params), gran_params) == thisSD;
569                 pt1_float = make_float3(pt1.x, pt1.y, pt1.z);
570 
571                 double3 meshCenter_double =
572                     make_double3(mesh_params->fam_frame_narrow[fam].pos[0], mesh_params->fam_frame_narrow[fam].pos[1],
573                                  mesh_params->fam_frame_narrow[fam].pos[2]);
574                 convert_pos_UU2SU<double3>(meshCenter_double, gran_params);
575 
576                 double3 fromCenter_double = pt1 - meshCenter_double;
577 
578                 fromCenter = make_float3(fromCenter_double.x, fromCenter_double.y, fromCenter_double.z);
579             }
580 
581             // If there is a collision, add an impulse to the sphere
582             if (valid_contact) {
583                 // TODO contact models
584                 // Use the CD information to compute the force on the grElement
585                 float3 delta = -depth * normal;
586 
587                 // effective radius is just sphere radius -- assume meshes are locally flat (a safe assumption?)
588                 float hertz_force_factor = sqrt(abs(depth) / gran_params->sphereRadius_SU);
589 
590                 float3 force_accum = hertz_force_factor * mesh_params->K_n_s2m_SU * delta;
591 
592                 // Compute force updates for adhesion term, opposite the spring term
593                 // NOTE ratio is wrt the weight of a sphere of mass 1
594                 // NOTE the cancelation of two negatives
595                 force_accum = force_accum + gran_params->sphere_mass_SU * mesh_params->adhesionAcc_s2m * delta / depth;
596 
597                 // Velocity difference, it's better to do a coalesced access here than a fragmented access
598                 // inside
599                 float3 v_rel = sphere_vel[sphereIDLocal] - d_triangleSoup->vel[fam];
600 
601                 // TODO assumes pos is the center of mass of the mesh
602                 // TODO can this be float?
603                 float3 meshCenter =
604                     make_float3(mesh_params->fam_frame_broad[fam].pos[0], mesh_params->fam_frame_broad[fam].pos[1],
605                                 mesh_params->fam_frame_broad[fam].pos[2]);
606                 convert_pos_UU2SU<float3>(meshCenter, gran_params);
607 
608                 // NOTE depth is negative and normal points from triangle to sphere center
609                 float3 r = pt1_float + normal * (depth / 2) - meshCenter;
610 
611                 // Add angular velocity contribution from mesh
612                 v_rel = v_rel - Cross(d_triangleSoup->omega[fam], r);
613 
614                 // add tangential components if they exist
615                 if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
616                     // Vector from the center of sphere to center of contact volume
617                     float3 r_A = -(gran_params->sphereRadius_SU + depth / 2.f) * normal;
618                     v_rel = v_rel + Cross(omega[sphereIDLocal], r_A);
619                 }
620 
621                 // Force accumulator on sphere for this sphere-triangle collision
622                 // Compute force updates for normal spring term
623 
624                 // Compute force updates for damping term
625                 // NOTE assumes sphere mass of 1
626                 float fam_mass_SU = d_triangleSoup->familyMass_SU[fam];
627                 const float sphere_mass_SU = gran_params->sphere_mass_SU;
628                 float m_eff = sphere_mass_SU * fam_mass_SU / (sphere_mass_SU + fam_mass_SU);
629                 float3 vrel_n = Dot(v_rel, normal) * normal;
630                 v_rel = v_rel - vrel_n;  // v_rel is now tangential relative velocity
631 
632                 // Add normal damping term
633                 force_accum = force_accum - hertz_force_factor * mesh_params->Gamma_n_s2m_SU * m_eff * vrel_n;
634 
635                 if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
636                     // radius pointing from the contact point to the center of particle
637                     float3 Rc = (gran_params->sphereRadius_SU + depth / 2.f) * normal;
638                     float3 roll_ang_acc = computeRollingAngAcc(
639                         sphere_data, gran_params, mesh_params->rolling_coeff_s2m_SU, mesh_params->spinning_coeff_s2m_SU,
640                         force_accum, omega[sphereIDLocal], d_triangleSoup->omega[fam], Rc);
641 
642                     sphere_AngAcc = sphere_AngAcc + roll_ang_acc;
643 
644                     unsigned int BC_histmap_label = triangleFamilyHistmapOffset + fam;
645 
646                     // compute tangent force
647                     float3 tangent_force = computeFrictionForces(
648                         gran_params, sphere_data, sphereIDGlobal, BC_histmap_label,
649                         mesh_params->static_friction_coeff_s2m, mesh_params->K_t_s2m_SU, mesh_params->Gamma_t_s2m_SU,
650                         hertz_force_factor, m_eff, force_accum, v_rel, normal);
651 
652                     float force_unit = gran_params->MASS_UNIT * gran_params->LENGTH_UNIT /
653                                        (gran_params->TIME_UNIT * gran_params->TIME_UNIT);
654 
655                     float velocity_unit = gran_params->LENGTH_UNIT / gran_params->TIME_UNIT;
656 
657                     force_accum = force_accum + tangent_force;
658                     sphere_AngAcc =
659                         sphere_AngAcc + Cross(-1.f * normal, tangent_force) / gran_params->sphereInertia_by_r;
660                 }
661 
662                 // Use the CD information to compute the force and torque on the family of this triangle
663                 sphere_force = sphere_force + force_accum;
664 
665                 // Force on the mesh is opposite the force on the sphere
666                 float3 force_total = -1.f * force_accum;
667 
668                 float3 torque = Cross(fromCenter, force_total);
669                 // TODO we could be much smarter about reducing this atomic write
670                 unsigned int fam = d_triangleSoup->triangleFamily_ID[triangleIDs[triangleLocalID]];
671                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 0, force_total.x);
672                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 1, force_total.y);
673                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 2, force_total.z);
674 
675                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 3, torque.x);
676                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 4, torque.y);
677                 atomicAdd(d_triangleSoup->generalizedForcesPerFamily + fam * 6 + 5, torque.z);
678             }
679         }  // end of per-triangle loop
680         // write back sphere forces
681         atomicAdd(sphere_data->sphere_acc_X + sphereIDGlobal, sphere_force.x / gran_params->sphere_mass_SU);
682         atomicAdd(sphere_data->sphere_acc_Y + sphereIDGlobal, sphere_force.y / gran_params->sphere_mass_SU);
683         atomicAdd(sphere_data->sphere_acc_Z + sphereIDGlobal, sphere_force.z / gran_params->sphere_mass_SU);
684 
685         if (gran_params->friction_mode != chrono::gpu::CHGPU_FRICTION_MODE::FRICTIONLESS) {
686             // write back torques for later
687             atomicAdd(sphere_data->sphere_ang_acc_X + sphereIDGlobal, sphere_AngAcc.x);
688             atomicAdd(sphere_data->sphere_ang_acc_Y + sphereIDGlobal, sphere_AngAcc.y);
689             atomicAdd(sphere_data->sphere_ang_acc_Z + sphereIDGlobal, sphere_AngAcc.z);
690         }
691     }  // end sphere id check
692 }  // end kernel
693 
AdvanceSimulation(float duration)694 __host__ double ChSystemGpuMesh_impl::AdvanceSimulation(float duration) {
695     // Figure our the number of blocks that need to be launched to cover the box
696     unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
697 
698     // Settling simulation loop.
699     float duration_SU = (float)(duration / TIME_SU2UU);
700     unsigned int nsteps = (unsigned int)std::round(duration_SU / stepSize_SU);
701 
702     packSphereDataPointers();
703     // cudaMemAdvise(gran_params, sizeof(*gran_params), cudaMemAdviseSetReadMostly, dev_ID);
704 
705     METRICS_PRINTF("advancing by %f at timestep %f, %u timesteps at approx user timestep %f\n", duration_SU,
706                    stepSize_SU, nsteps, duration / nsteps);
707 
708     METRICS_PRINTF("Starting Main Simulation loop!\n");
709 
710     float time_elapsed_SU = 0.f;  // time elapsed in this call (SU)
711     // Run the simulation, there are aggressive synchronizations because we want to have no race conditions
712     for (; time_elapsed_SU < stepSize_SU * nsteps; time_elapsed_SU += stepSize_SU) {
713         updateBCPositions();
714         runSphereBroadphase();
715 
716         resetSphereAccelerations();
717         resetBCForces();
718         if (meshSoup->nTrianglesInSoup != 0 && mesh_collision_enabled) {
719             gpuErrchk(
720                 cudaMemset(meshSoup->generalizedForcesPerFamily, 0, 6 * meshSoup->numTriangleFamilies * sizeof(float)));
721         }
722         gpuErrchk(cudaPeekAtLastError());
723         gpuErrchk(cudaDeviceSynchronize());
724 
725         if (meshSoup->nTrianglesInSoup != 0 && mesh_collision_enabled) {
726             runTriangleBroadphase();
727         }
728 
729         METRICS_PRINTF("Starting computeSphereForces!\n");
730 
731         if (gran_params->friction_mode == CHGPU_FRICTION_MODE::FRICTIONLESS) {
732             // Compute sphere-sphere forces
733             if (gran_params->use_mat_based == true) {
734                 METRICS_PRINTF("use material based model\n");
735                 computeSphereForces_frictionless_matBased<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
736                     sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
737                     (unsigned int)BC_params_list_SU.size());
738 
739             } else {
740                 METRICS_PRINTF("use user defined model\n");
741                 computeSphereForces_frictionless<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
742                     sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
743                     (unsigned int)BC_params_list_SU.size());
744             }
745             gpuErrchk(cudaPeekAtLastError());
746             gpuErrchk(cudaDeviceSynchronize());
747         }
748         // frictional contact
749         else if (gran_params->friction_mode == CHGPU_FRICTION_MODE::SINGLE_STEP ||
750                  gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP) {
751             // figure out who is contacting
752             determineContactPairs<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(sphere_data, gran_params);
753             gpuErrchk(cudaPeekAtLastError());
754             gpuErrchk(cudaDeviceSynchronize());
755             METRICS_PRINTF("Frictional case.\n");
756             if (gran_params->use_mat_based == true) {
757                 METRICS_PRINTF("compute sphere-sphere and sphere-bc mat based\n");
758                 computeSphereContactForces_matBased<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
759                     sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
760                     (unsigned int)BC_params_list_SU.size(), nSpheres);
761             } else {
762                 METRICS_PRINTF("compute sphere-sphere and sphere-bc user defined\n");
763                 computeSphereContactForces<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
764                     sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
765                     (unsigned int)BC_params_list_SU.size(), nSpheres);
766             }
767         }
768         gpuErrchk(cudaPeekAtLastError());
769         gpuErrchk(cudaDeviceSynchronize());
770 
771         if (meshSoup->numTriangleFamilies != 0 && mesh_collision_enabled) {
772             // TODO please do not use a template here
773             // triangle labels come after BC labels numerically
774             unsigned int triangleFamilyHistmapOffset =
775                 gran_params->nSpheres + 1 + (unsigned int)BC_params_list_SU.size() + 1;
776             // compute sphere-triangle forces
777             if (tri_params->use_mat_based == true) {
778                 interactionGranMat_TriangleSoup_matBased<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
779                     meshSoup, sphere_data, SD_trianglesInEachSD_composite.data(), SD_numTrianglesTouching.data(),
780                     SD_TrianglesCompositeOffsets.data(), gran_params, tri_params, triangleFamilyHistmapOffset);
781             } else {
782                 //   //              printf("compute sphere-mesh user defined\n");
783 
784                 interactionGranMat_TriangleSoup<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
785                     meshSoup, sphere_data, SD_trianglesInEachSD_composite.data(), SD_numTrianglesTouching.data(),
786                     SD_TrianglesCompositeOffsets.data(), gran_params, tri_params, triangleFamilyHistmapOffset);
787             }
788         }
789 
790         gpuErrchk(cudaPeekAtLastError());
791         gpuErrchk(cudaDeviceSynchronize());
792 
793         METRICS_PRINTF("Starting integrateSpheres!\n");
794         integrateSpheres<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
795         gpuErrchk(cudaPeekAtLastError());
796         gpuErrchk(cudaDeviceSynchronize());
797 
798         if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
799             const unsigned int nThreadsUpdateHist = 2 * CUDA_THREADS_PER_BLOCK;
800             unsigned int fricMapSize = nSpheres * MAX_SPHERES_TOUCHED_BY_SPHERE;
801             unsigned int nBlocksFricHistoryPostProcess = (fricMapSize + nThreadsUpdateHist - 1) / nThreadsUpdateHist;
802             updateFrictionData<<<nBlocksFricHistoryPostProcess, nThreadsUpdateHist>>>(fricMapSize, sphere_data,
803                                                                                       gran_params);
804             gpuErrchk(cudaPeekAtLastError());
805             gpuErrchk(cudaDeviceSynchronize());
806             updateAngVels<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
807             gpuErrchk(cudaPeekAtLastError());
808             gpuErrchk(cudaDeviceSynchronize());
809         }
810 
811         elapsedSimTime += (float)(stepSize_SU * TIME_SU2UU);  // Advance current time
812     }
813 
814     return time_elapsed_SU * TIME_SU2UU;  // return elapsed UU time
815 }
816 }  // namespace gpu
817 }  // namespace chrono
818