// =============================================================================
// PROJECT CHRONO - http://projectchrono.org
//
// Copyright (c) 2019 projectchrono.org
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be found
// in the LICENSE file at the top level of the distribution and at
// http://projectchrono.org/license-chrono.txt.
//
// =============================================================================
// Authors: Conlain Kelly, Nic Olsen, Dan Negrut, Ruochun Zhang
// =============================================================================

#include <cmath>
#include <numeric>
#include <fstream>

#include "chrono_gpu/cuda/ChGpu_SMC.cuh"
#include "chrono_gpu/utils/ChGpuUtilities.h"

namespace chrono {
namespace gpu {

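// Compute the sum over all spheres of (x^2 + y^2 + z^2) for the three input arrays. Each element is squared and
// summed on the device; the CUB-reduced result is stored in (and returned from) sphere_stats_buffer[nSpheres].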
__host__ float ChSystemGpu_impl::computeArray3SquaredSum(std::vector<float, cudallocator<float>>& arrX,
                                                         std::vector<float, cudallocator<float>>& arrY,
                                                         std::vector<float, cudallocator<float>>& arrZ,
                                                         size_t nSpheres) {
    const unsigned int threadsPerBlock = 1024;
    unsigned int nBlocks = (nSpheres + threadsPerBlock - 1) / threadsPerBlock;
    elementalArray3Squared<float><<<nBlocks, threadsPerBlock>>>(sphere_data->sphere_stats_buffer, arrX.data(),
                                                                arrY.data(), arrZ.data(), nSpheres);
    gpuErrchk(cudaDeviceSynchronize());

    // Use CUB to reduce, and store the reduced result in the last element of the sphere_stats_buffer array.
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                           sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
    cub::DeviceReduce::Sum(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                           sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
    return *(sphere_data->sphere_stats_buffer + nSpheres);
}

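// Return the maximum (getMax = true) or minimum (getMax = false) particle Z coordinate. A kernel first converts
// each sphere's SD-local Z position into a global value in sphere_stats_buffer, which is then reduced with CUB.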
__host__ double ChSystemGpu_impl::GetMaxParticleZ(bool getMax) {
    size_t nSpheres = sphere_local_pos_Z.size();
    if (nSpheres == 0)
        CHGPU_ERROR("ERROR! 0 particles in system! Please call this method after Initialize().\n");

    const unsigned int threadsPerBlock = 1024;
    unsigned int nBlocks = (nSpheres + threadsPerBlock - 1) / threadsPerBlock;
    elementalZLocalToGlobal<<<nBlocks, threadsPerBlock>>>(sphere_data->sphere_stats_buffer, sphere_data, nSpheres,
                                                          gran_params);
    gpuErrchk(cudaDeviceSynchronize());

    // Use CUB to find the max or min Z.
    size_t temp_storage_bytes = 0;
    if (getMax) {
        cub::DeviceReduce::Max(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
        void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
        cub::DeviceReduce::Max(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    } else {
        cub::DeviceReduce::Min(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
        void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
        cub::DeviceReduce::Min(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    }
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
    return *(sphere_data->sphere_stats_buffer + nSpheres);
}

// Reset broadphase data structures
void ChSystemGpu_impl::resetBroadphaseInformation() {
    // Set all the offsets to zero
    gpuErrchk(cudaMemset(SD_NumSpheresTouching.data(), 0, SD_NumSpheresTouching.size() * sizeof(unsigned int)));
    gpuErrchk(cudaMemset(SD_SphereCompositeOffsets.data(), 0, SD_SphereCompositeOffsets.size() * sizeof(unsigned int)));
    // For each SD, all the spheres touching that SD should have their ID be NULL_CHGPU_ID
    gpuErrchk(cudaMemset(spheres_in_SD_composite.data(), NULL_CHGPU_ID,
                         spheres_in_SD_composite.size() * sizeof(unsigned int)));
    gpuErrchk(cudaDeviceSynchronize());
}

// Reset sphere acceleration data structures
void ChSystemGpu_impl::resetSphereAccelerations() {
    // cache past acceleration data
    if (time_integrator == CHGPU_TIME_INTEGRATOR::CHUNG) {
        gpuErrchk(cudaMemcpy(sphere_acc_X_old.data(), sphere_acc_X.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(sphere_acc_Y_old.data(), sphere_acc_Y.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(sphere_acc_Z_old.data(), sphere_acc_Z.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        // if we have multistep AND friction, cache old alphas
        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            gpuErrchk(cudaMemcpy(sphere_ang_acc_X_old.data(), sphere_ang_acc_X.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
            gpuErrchk(cudaMemcpy(sphere_ang_acc_Y_old.data(), sphere_ang_acc_Y.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
            gpuErrchk(cudaMemcpy(sphere_ang_acc_Z_old.data(), sphere_ang_acc_Z.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
        }
        gpuErrchk(cudaDeviceSynchronize());
    }

    // reset current accelerations to zero
    gpuErrchk(cudaMemset(sphere_acc_X.data(), 0, nSpheres * sizeof(float)));
    gpuErrchk(cudaMemset(sphere_acc_Y.data(), 0, nSpheres * sizeof(float)));
    gpuErrchk(cudaMemset(sphere_acc_Z.data(), 0, nSpheres * sizeof(float)));

    // reset torques to zero, if applicable
    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        gpuErrchk(cudaMemset(sphere_ang_acc_X.data(), 0, nSpheres * sizeof(float)));
        gpuErrchk(cudaMemset(sphere_ang_acc_Y.data(), 0, nSpheres * sizeof(float)));
        gpuErrchk(cudaMemset(sphere_ang_acc_Z.data(), 0, nSpheres * sizeof(float)));
    }
}

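// Kernel: compute the magnitude of each sphere's translational velocity, one thread per sphere.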
__global__ void compute_absv(const unsigned int nSpheres,
                             const float* velX,
                             const float* velY,
                             const float* velZ,
                             float* d_absv) {
    unsigned int my_sphere = blockIdx.x * blockDim.x + threadIdx.x;
    if (my_sphere < nSpheres) {
        float v[3] = {velX[my_sphere], velY[my_sphere], velZ[my_sphere]};
        d_absv[my_sphere] = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    }
}

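// Return the maximum particle speed: compute per-sphere velocity magnitudes with compute_absv, then reduce them
// with a CUB device-wide max.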
__host__ float ChSystemGpu_impl::get_max_vel() const {
    float* d_absv;
    float* d_max_vel;
    float h_max_vel;
    gpuErrchk(cudaMalloc(&d_absv, nSpheres * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_max_vel, sizeof(float)));

    compute_absv<<<(nSpheres + 255) / 256, 256>>>(nSpheres, pos_X_dt.data(), pos_Y_dt.data(), pos_Z_dt.data(), d_absv);

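    // The first CUB call (null temp-storage pointer) only queries the required scratch size; allocate that much,
    // then run the actual max reduction.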
    void* d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_absv, d_max_vel, nSpheres);
    gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_absv, d_max_vel, nSpheres);
    gpuErrchk(cudaMemcpy(&h_max_vel, d_max_vel, sizeof(float), cudaMemcpyDeviceToHost));

    // free temporary device storage
    gpuErrchk(cudaFree(d_temp_storage));
    gpuErrchk(cudaFree(d_absv));
    gpuErrchk(cudaFree(d_max_vel));

    return h_max_vel;
}

__host__ int3 ChSystemGpu_impl::getSDTripletFromID(unsigned int SD_ID) const {
    return SDIDTriplet(SD_ID, gran_params);
}
/// Sort sphere positions by subdomain id
/// Occurs entirely on host, not intended to be efficient
/// ONLY DO AT BEGINNING OF SIMULATION
__host__ void ChSystemGpu_impl::defragment_initial_positions() {
    // key and value pointers
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_ids;

    // load sphere indices
    sphere_ids.resize(nSpheres);
    std::iota(sphere_ids.begin(), sphere_ids.end(), 0);

    // sort sphere ids by owner SD
    std::sort(sphere_ids.begin(), sphere_ids.end(),
              [&](std::size_t i, std::size_t j) { return sphere_owner_SDs.at(i) < sphere_owner_SDs.at(j); });

    std::vector<int, cudallocator<int>> sphere_pos_x_tmp;
    std::vector<int, cudallocator<int>> sphere_pos_y_tmp;
    std::vector<int, cudallocator<int>> sphere_pos_z_tmp;

    std::vector<float, cudallocator<float>> sphere_vel_x_tmp;
    std::vector<float, cudallocator<float>> sphere_vel_y_tmp;
    std::vector<float, cudallocator<float>> sphere_vel_z_tmp;

    std::vector<float, cudallocator<float>> sphere_angv_x_tmp;
    std::vector<float, cudallocator<float>> sphere_angv_y_tmp;
    std::vector<float, cudallocator<float>> sphere_angv_z_tmp;

    std::vector<not_stupid_bool, cudallocator<not_stupid_bool>> sphere_fixed_tmp;
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_owner_SDs_tmp;

    sphere_pos_x_tmp.resize(nSpheres);
    sphere_pos_y_tmp.resize(nSpheres);
    sphere_pos_z_tmp.resize(nSpheres);

    sphere_vel_x_tmp.resize(nSpheres);
    sphere_vel_y_tmp.resize(nSpheres);
    sphere_vel_z_tmp.resize(nSpheres);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        sphere_angv_x_tmp.resize(nSpheres);
        sphere_angv_y_tmp.resize(nSpheres);
        sphere_angv_z_tmp.resize(nSpheres);
    }

    sphere_fixed_tmp.resize(nSpheres);
    sphere_owner_SDs_tmp.resize(nSpheres);

    // reorder values into the new sorted order
    for (unsigned int i = 0; i < nSpheres; i++) {
        sphere_pos_x_tmp.at(i) = sphere_local_pos_X.at(sphere_ids.at(i));
        sphere_pos_y_tmp.at(i) = sphere_local_pos_Y.at(sphere_ids.at(i));
        sphere_pos_z_tmp.at(i) = sphere_local_pos_Z.at(sphere_ids.at(i));

        sphere_vel_x_tmp.at(i) = (float)pos_X_dt.at(sphere_ids.at(i));
        sphere_vel_y_tmp.at(i) = (float)pos_Y_dt.at(sphere_ids.at(i));
        sphere_vel_z_tmp.at(i) = (float)pos_Z_dt.at(sphere_ids.at(i));

        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            sphere_angv_x_tmp.at(i) = (float)sphere_Omega_X.at(sphere_ids.at(i));
            sphere_angv_y_tmp.at(i) = (float)sphere_Omega_Y.at(sphere_ids.at(i));
            sphere_angv_z_tmp.at(i) = (float)sphere_Omega_Z.at(sphere_ids.at(i));
        }

        sphere_fixed_tmp.at(i) = sphere_fixed.at(sphere_ids.at(i));
        sphere_owner_SDs_tmp.at(i) = sphere_owner_SDs.at(sphere_ids.at(i));
    }

    // swap into the correct data structures
    sphere_local_pos_X.swap(sphere_pos_x_tmp);
    sphere_local_pos_Y.swap(sphere_pos_y_tmp);
    sphere_local_pos_Z.swap(sphere_pos_z_tmp);

    pos_X_dt.swap(sphere_vel_x_tmp);
    pos_Y_dt.swap(sphere_vel_y_tmp);
    pos_Z_dt.swap(sphere_vel_z_tmp);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        sphere_Omega_X.swap(sphere_angv_x_tmp);
        sphere_Omega_Y.swap(sphere_angv_y_tmp);
        sphere_Omega_Z.swap(sphere_angv_z_tmp);
    }

    sphere_fixed.swap(sphere_fixed_tmp);
    sphere_owner_SDs.swap(sphere_owner_SDs_tmp);
}

/// Same defragment function, but this time for the contact friction history arrays.
/// It is stand-alone because it should rarely be needed, so let us save some time by
/// not calling it in most of our simulations.
__host__ void ChSystemGpu_impl::defragment_friction_history(unsigned int history_offset) {
    // key and value pointers
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_ids;

    // load sphere indices
    sphere_ids.resize(nSpheres);
    std::iota(sphere_ids.begin(), sphere_ids.end(), 0);

    // sort sphere ids by owner SD
    std::sort(sphere_ids.begin(), sphere_ids.end(),
              [&](std::size_t i, std::size_t j) { return sphere_owner_SDs.at(i) < sphere_owner_SDs.at(j); });

    std::vector<float3, cudallocator<float3>> history_tmp;
    std::vector<unsigned int, cudallocator<unsigned int>> partners_tmp;

    history_tmp.resize(history_offset * nSpheres);
    partners_tmp.resize(history_offset * nSpheres);

    // reorder values into the new sorted order
    for (unsigned int i = 0; i < nSpheres; i++) {
        for (unsigned int j = 0; j < history_offset; j++) {
            history_tmp.at(history_offset * i + j) = contact_history_map.at(history_offset * sphere_ids.at(i) + j);
            partners_tmp.at(history_offset * i + j) = contact_partners_map.at(history_offset * sphere_ids.at(i) + j);
        }
    }

    contact_history_map.swap(history_tmp);
    contact_partners_map.swap(partners_tmp);
}

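// Allocate and initialize all per-sphere device arrays: convert the user-provided positions/velocities from user
// units (UU) to simulation units (SU), set up friction/contact-history storage as needed, and optionally
// defragment the arrays by owner SD.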
__host__ void ChSystemGpu_impl::setupSphereDataStructures() {
    // The user fills user_sphere_positions with the positions to be copied
    if (user_sphere_positions.size() == 0) {
        CHGPU_ERROR("ERROR! no sphere positions given!\n");
    }

    nSpheres = (unsigned int)user_sphere_positions.size();
    INFO_PRINTF("%u balls added!\n", nSpheres);
    gran_params->nSpheres = nSpheres;

    TRACK_VECTOR_RESIZE(sphere_owner_SDs, nSpheres, "sphere_owner_SDs", NULL_CHGPU_ID);

    // Allocate space for new bodies
    TRACK_VECTOR_RESIZE(sphere_local_pos_X, nSpheres, "sphere_local_pos_X", 0);
    TRACK_VECTOR_RESIZE(sphere_local_pos_Y, nSpheres, "sphere_local_pos_Y", 0);
    TRACK_VECTOR_RESIZE(sphere_local_pos_Z, nSpheres, "sphere_local_pos_Z", 0);

    TRACK_VECTOR_RESIZE(sphere_fixed, nSpheres, "sphere_fixed", 0);

    TRACK_VECTOR_RESIZE(pos_X_dt, nSpheres, "pos_X_dt", 0);
    TRACK_VECTOR_RESIZE(pos_Y_dt, nSpheres, "pos_Y_dt", 0);
    TRACK_VECTOR_RESIZE(pos_Z_dt, nSpheres, "pos_Z_dt", 0);

    // temporarily store global positions as 64-bit, discard as soon as local positions are loaded
    {
        bool user_provided_fixed = user_sphere_fixed.size() != 0;
        bool user_provided_vel = user_sphere_vel.size() != 0;
        if (user_provided_fixed && user_sphere_fixed.size() != nSpheres)
            CHGPU_ERROR("Provided fixity array has length %zu, but there are %u spheres!\n", user_sphere_fixed.size(),
                        nSpheres);
        if (user_provided_vel && user_sphere_vel.size() != nSpheres)
            CHGPU_ERROR("Provided velocity array has length %zu, but there are %u spheres!\n", user_sphere_vel.size(),
                        nSpheres);

        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_X;
        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_Y;
        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_Z;

        sphere_global_pos_X.resize(nSpheres);
        sphere_global_pos_Y.resize(nSpheres);
        sphere_global_pos_Z.resize(nSpheres);

        // Copy from array of structs to 3 arrays
        for (unsigned int i = 0; i < nSpheres; i++) {
            float3 vec = user_sphere_positions.at(i);
            // cast to double, convert to SU, then cast to int64_t
            sphere_global_pos_X.at(i) = (int64_t)((double)vec.x / LENGTH_SU2UU);
            sphere_global_pos_Y.at(i) = (int64_t)((double)vec.y / LENGTH_SU2UU);
            sphere_global_pos_Z.at(i) = (int64_t)((double)vec.z / LENGTH_SU2UU);

            // Convert to not_stupid_bool
            sphere_fixed.at(i) = (not_stupid_bool)((user_provided_fixed) ? user_sphere_fixed[i] : false);
            if (user_provided_vel) {
                auto vel = user_sphere_vel.at(i);
                pos_X_dt.at(i) = (float)(vel.x / VEL_SU2UU);
                pos_Y_dt.at(i) = (float)(vel.y / VEL_SU2UU);
                pos_Z_dt.at(i) = (float)(vel.z / VEL_SU2UU);
            }
        }

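        // Make sure sphere_data points at the freshly allocated arrays before launching the kernel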
        packSphereDataPointers();
        // Figure out the number of blocks that need to be launched to cover the box
        unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
        initializeLocalPositions<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
            sphere_data, sphere_global_pos_X.data(), sphere_global_pos_Y.data(), sphere_global_pos_Z.data(), nSpheres,
            gran_params);

        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaPeekAtLastError());
    }

    TRACK_VECTOR_RESIZE(sphere_acc_X, nSpheres, "sphere_acc_X", 0);
    TRACK_VECTOR_RESIZE(sphere_acc_Y, nSpheres, "sphere_acc_Y", 0);
    TRACK_VECTOR_RESIZE(sphere_acc_Z, nSpheres, "sphere_acc_Z", 0);

    // The buffer array that stores any quantity the user may wish to query. We resize it here once, instead of
    // resizing on each call, to save time in case a query function is called with high frequency. The last
    // element in this array stores the reduced value.
    TRACK_VECTOR_RESIZE(sphere_stats_buffer, nSpheres + 1, "sphere_stats_buffer", 0);

    // NOTE that this will get resized again later, this is just the first estimate
    TRACK_VECTOR_RESIZE(spheres_in_SD_composite, 2 * nSpheres, "spheres_in_SD_composite", NULL_CHGPU_ID);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        // add rotational DOFs
        TRACK_VECTOR_RESIZE(sphere_Omega_X, nSpheres, "sphere_Omega_X", 0);
        TRACK_VECTOR_RESIZE(sphere_Omega_Y, nSpheres, "sphere_Omega_Y", 0);
        TRACK_VECTOR_RESIZE(sphere_Omega_Z, nSpheres, "sphere_Omega_Z", 0);

        // add torques
        TRACK_VECTOR_RESIZE(sphere_ang_acc_X, nSpheres, "sphere_ang_acc_X", 0);
        TRACK_VECTOR_RESIZE(sphere_ang_acc_Y, nSpheres, "sphere_ang_acc_Y", 0);
        TRACK_VECTOR_RESIZE(sphere_ang_acc_Z, nSpheres, "sphere_ang_acc_Z", 0);

        {
            bool user_provided_ang_vel = user_sphere_ang_vel.size() != 0;
            if (user_provided_ang_vel && user_sphere_ang_vel.size() != nSpheres)
                CHGPU_ERROR("Provided angular velocity array has length %zu, but there are %u spheres!\n",
                            user_sphere_ang_vel.size(), nSpheres);
            if (user_provided_ang_vel) {
                for (unsigned int i = 0; i < nSpheres; i++) {
                    auto ang_vel = user_sphere_ang_vel.at(i);
                    sphere_Omega_X.at(i) = (float)(ang_vel.x * TIME_SU2UU);
                    sphere_Omega_Y.at(i) = (float)(ang_vel.y * TIME_SU2UU);
                    sphere_Omega_Z.at(i) = (float)(ang_vel.z * TIME_SU2UU);
                }
            }
        }
    }

    if (time_integrator == CHGPU_TIME_INTEGRATOR::CHUNG) {
        TRACK_VECTOR_RESIZE(sphere_acc_X_old, nSpheres, "sphere_acc_X_old", 0);
        TRACK_VECTOR_RESIZE(sphere_acc_Y_old, nSpheres, "sphere_acc_Y_old", 0);
        TRACK_VECTOR_RESIZE(sphere_acc_Z_old, nSpheres, "sphere_acc_Z_old", 0);

        // friction and multistep means keep old ang acc
        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            TRACK_VECTOR_RESIZE(sphere_ang_acc_X_old, nSpheres, "sphere_ang_acc_X_old", 0);
            TRACK_VECTOR_RESIZE(sphere_ang_acc_Y_old, nSpheres, "sphere_ang_acc_Y_old", 0);
            TRACK_VECTOR_RESIZE(sphere_ang_acc_Z_old, nSpheres, "sphere_ang_acc_Z_old", 0);
        }
    }

    // On a fresh start we usually want to do this defragment.
    // On a restart, however, we probably do not: completely reordering the particles every time the simulation
    // restarts may be bad for visualization or debugging.
    if (defragment_on_start) {
        defragment_initial_positions();
    }

    bool user_provided_internal_data = false;
    if (gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP ||
        gran_params->friction_mode == CHGPU_FRICTION_MODE::SINGLE_STEP) {
        TRACK_VECTOR_RESIZE(contact_partners_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_partners_map",
                            NULL_CHGPU_ID);
        TRACK_VECTOR_RESIZE(contact_active_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_active_map", false);

        // If the user provides a checkpointed history array, we load it here
        bool user_provided_partner_map = user_partner_map.size() != 0;
        if (user_provided_partner_map && user_partner_map.size() != MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres)
            CHGPU_ERROR("ERROR! The user provided contact partner map has size %zu. It needs to be %u * %u!\n",
                        user_partner_map.size(), MAX_SPHERES_TOUCHED_BY_SPHERE, nSpheres);

        // Using .at (instead of []) should give better error messages when things go wrong,
        // at the cost of some speed, which is not important in I/O
        if (user_provided_partner_map) {
            for (unsigned int i = 0; i < nSpheres; i++) {
                for (unsigned int j = 0; j < MAX_SPHERES_TOUCHED_BY_SPHERE; j++) {
                    contact_partners_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j) =
                        user_partner_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j);
                }
            }
        }

        user_provided_internal_data = user_provided_internal_data || user_provided_partner_map;
    }

    if (gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP) {
        float3 null_history = {0., 0., 0.};
        TRACK_VECTOR_RESIZE(contact_history_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_history_map",
                            null_history);
        TRACK_VECTOR_RESIZE(contact_duration, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_duration", 0);

        // If the user provides a checkpointed history array, we load it here
        bool user_provided_friction_history = user_friction_history.size() != 0;
        if (user_provided_friction_history && user_friction_history.size() != MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres)
            CHGPU_ERROR("ERROR! The user provided contact friction history has size %zu. It needs to be %u * %u!\n",
                        user_friction_history.size(), MAX_SPHERES_TOUCHED_BY_SPHERE, nSpheres);

        if (user_provided_friction_history) {
            for (unsigned int i = 0; i < nSpheres; i++) {
                for (unsigned int j = 0; j < MAX_SPHERES_TOUCHED_BY_SPHERE; j++) {
                    float3 history_UU = user_friction_history[MAX_SPHERES_TOUCHED_BY_SPHERE * i + j];
                    float3 history_SU = make_float3(history_UU.x / LENGTH_SU2UU, history_UU.y / LENGTH_SU2UU,
                                                    history_UU.z / LENGTH_SU2UU);
                    contact_history_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j) = history_SU;
                }
            }
        }

        user_provided_internal_data = user_provided_internal_data || user_provided_friction_history;
    }

    // The content of this if block should be executed rarely, if at all.
    // If the user provides Chrono::Gpu internal data from a file then this is a restart,
    // in which case defragment_on_start should typically be set to false. It is supported here regardless.
    if (user_provided_internal_data && defragment_on_start) {
        defragment_friction_history(MAX_SPHERES_TOUCHED_BY_SPHERE);
    }

    // record normal contact force
    if (gran_params->recording_contactInfo == true) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(normal_contact_force, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "normal contact force",
                            null_force);
    }

    // record friction force
    if (gran_params->recording_contactInfo == true && gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(tangential_friction_force, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "tangential contact force", null_force);
    }

    // record rolling friction torque
    if (gran_params->recording_contactInfo == true && gran_params->rolling_mode != CHGPU_ROLLING_MODE::NO_RESISTANCE) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(rolling_friction_torque, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "rolling friction torque", null_force);
        TRACK_VECTOR_RESIZE(char_collision_time, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "characteristic collision time", 0);
        TRACK_VECTOR_RESIZE(v_rot_array, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "v rot", null_force);
    }

    // make sure the right pointers are packed
    packSphereDataPointers();
}

/// <summary>
/// runSphereBroadphase goes through three stages. First, a kernel figures out, for each SD, how many spheres touch it.
/// Then, a prefix scan (which requires two CUB function calls) computes the offsets into the big fat array that
/// stores, SD after SD, which spheres touch each SD. Finally, a kernel call fills that composite array.
///
/// CAVEAT: in this approach, the outcome of the prefix scan operation will be cannibalized during the kernel call that
/// updates the big fat composite array. As such, there is a "scratch-pad" version that is used along the way.
/// </summary>
/// <returns></returns>
__host__ void ChSystemGpu_impl::runSphereBroadphase() {
    METRICS_PRINTF("Resetting broadphase info!\n");

    // reset the number of spheres per SD, the offsets in the big composite array, and the big fat composite array
    resetBroadphaseInformation();

    // First stage of the computation in this function: figure out how many spheres touch each SD.
    unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
    getNumberOfSpheresTouchingEachSD<CUDA_THREADS_PER_BLOCK>
        <<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());

    // Starting the second stage of this function call - the prefix scan operation
    unsigned int* out_ptr = SD_SphereCompositeOffsets.data();
    unsigned int* in_ptr = SD_NumSpheresTouching.data();
    gpuErrchk(cudaMemcpy(out_ptr, in_ptr, nSDs * sizeof(unsigned int), cudaMemcpyDeviceToDevice));

    // cold run; CUB determines the amount of storage it needs (since first argument is NULL pointer)
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, in_ptr, out_ptr, nSDs);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());

    // give CUB needed temporary storage on the device
    void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
    // Run the actual exclusive prefix sum
    cub::DeviceScan::ExclusiveSum(d_scratch_space, temp_storage_bytes, in_ptr, out_ptr, nSDs);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());

    // Beginning of the last stage of computation in this function: assembling the big composite array.
    // num_entries: total number of sphere entries to record in the big fat composite array
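    // (e.g., with per-SD counts [3, 1, 2], the exclusive scan gives offsets [0, 3, 4] and the total is 4 + 2 = 6)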
    unsigned int num_entries = out_ptr[nSDs - 1] + in_ptr[nSDs - 1];
    spheres_in_SD_composite.resize(num_entries, NULL_CHGPU_ID);
    sphere_data->spheres_in_SD_composite = spheres_in_SD_composite.data();

    // Copy the offsets into the scratch pad; the subsequent kernel call would step on the outcome of the prefix scan
    gpuErrchk(cudaMemcpy(SD_SphereCompositeOffsets_ScratchPad.data(), SD_SphereCompositeOffsets.data(),
                         nSDs * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
    // Populate the composite array; in the process, the content of the scratch pad will be modified
    // nBlocks = (MAX_SDs_TOUCHED_BY_SPHERE * nSpheres + 2*CUDA_THREADS_PER_BLOCK - 1) / (2*CUDA_THREADS_PER_BLOCK);
    // populateSpheresInEachSD<<<nBlocks, 2*CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / (CUDA_THREADS_PER_BLOCK);
    populateSpheresInEachSD<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
}

__host__ void ChSystemGpu_impl::updateBCPositions() {
    for (unsigned int i = 0; i < BC_params_list_UU.size(); i++) {
        auto bc_type = BC_type_list.at(i);
        const BC_params_t<float, float3>& params_UU = BC_params_list_UU.at(i);
        BC_params_t<int64_t, int64_t3>& params_SU = BC_params_list_SU.at(i);
        auto offset_function = BC_offset_function_list.at(i);
        setBCOffset(bc_type, params_UU, params_SU, offset_function(elapsedSimTime));
    }

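    // If the domain (BD) frame is not fixed, move it according to the prescribed offset function and shift every
    // sphere's local coordinates by the negative of the frame displacement.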
    if (!BD_is_fixed) {
        double3 new_BD_offset = BDOffsetFunction(elapsedSimTime);

        int64_t3 bd_offset_SU = {0, 0, 0};
        bd_offset_SU.x = (int64_t)(new_BD_offset.x / LENGTH_SU2UU);
        bd_offset_SU.y = (int64_t)(new_BD_offset.y / LENGTH_SU2UU);
        bd_offset_SU.z = (int64_t)(new_BD_offset.z / LENGTH_SU2UU);

        int64_t old_frame_X = gran_params->BD_frame_X;
        int64_t old_frame_Y = gran_params->BD_frame_Y;
        int64_t old_frame_Z = gran_params->BD_frame_Z;

        gran_params->BD_frame_X = bd_offset_SU.x + BD_rest_frame_SU.x;
        gran_params->BD_frame_Y = bd_offset_SU.y + BD_rest_frame_SU.y;
        gran_params->BD_frame_Z = bd_offset_SU.z + BD_rest_frame_SU.z;

        unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;

        int64_t3 offset_delta = {0, 0, 0};

        // if the frame X increases, the local X should decrease
        offset_delta.x = old_frame_X - gran_params->BD_frame_X;
        offset_delta.y = old_frame_Y - gran_params->BD_frame_Y;
        offset_delta.z = old_frame_Z - gran_params->BD_frame_Z;

        // printf("offset is %lld, %lld, %lld\n", offset_delta.x, offset_delta.y, offset_delta.z);

        packSphereDataPointers();

        applyBDFrameChange<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(offset_delta, sphere_data, nSpheres, gran_params);

        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
}

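// Advance the simulation by (approximately) 'duration', given in user units. The duration is converted to SU and
// split into fixed-size steps; the actual advanced time is returned in user units.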
__host__ double ChSystemGpu_impl::AdvanceSimulation(float duration) {
    // Figure out the number of blocks that need to be launched to cover the box
    unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
    // Settling simulation loop.
    float duration_SU = (float)(duration / TIME_SU2UU);
    unsigned int nsteps = (unsigned int)std::round(duration_SU / stepSize_SU);
    METRICS_PRINTF("advancing by %f at timestep %f, %u timesteps at approx user timestep %f\n", duration_SU,
                   stepSize_SU, nsteps, duration / nsteps);
    float time_elapsed_SU = 0;  // time elapsed in this advance call

    packSphereDataPointers();

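    // Each step: update BC positions, rebuild broadphase data, reset accelerations and BC forces, compute contact
    // forces, integrate positions, then (with friction) update friction history and angular velocities.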
    // Run the simulation; there are aggressive synchronizations because we want to have no race conditions
    for (unsigned int n = 0; n < nsteps; n++) {
        updateBCPositions();
        runSphereBroadphase();
        resetSphereAccelerations();
        resetBCForces();

        METRICS_PRINTF("Starting computeSphereForces!\n");

        if (gran_params->friction_mode == CHGPU_FRICTION_MODE::FRICTIONLESS) {
            // Compute sphere-sphere forces
            computeSphereForces_frictionless_matBased<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
                sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                (unsigned int)BC_params_list_SU.size());
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        } else if (gran_params->friction_mode == CHGPU_FRICTION_MODE::SINGLE_STEP ||
                   gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP) {
            // figure out who is contacting
            determineContactPairs<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(sphere_data, gran_params);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            if (gran_params->use_mat_based == true) {
                computeSphereContactForces_matBased<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
                    sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                    (unsigned int)BC_params_list_SU.size(), nSpheres);

            } else {
                computeSphereContactForces<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
                    sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                    (unsigned int)BC_params_list_SU.size(), nSpheres);
            }

            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        METRICS_PRINTF("Starting integrateSpheres!\n");
        integrateSpheres<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            const unsigned int nThreadsUpdateHist = 2 * CUDA_THREADS_PER_BLOCK;
            unsigned int fricMapSize = nSpheres * MAX_SPHERES_TOUCHED_BY_SPHERE;
            unsigned int nBlocksFricHistoryPostProcess = (fricMapSize + nThreadsUpdateHist - 1) / nThreadsUpdateHist;

            METRICS_PRINTF("Update Friction Data!\n");

            updateFrictionData<<<nBlocksFricHistoryPostProcess, nThreadsUpdateHist>>>(fricMapSize, sphere_data,
                                                                                      gran_params);

            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
            METRICS_PRINTF("Update angular velocity.\n");
            updateAngVels<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        elapsedSimTime += (float)(stepSize_SU * TIME_SU2UU);  // Advance current time
        time_elapsed_SU += stepSize_SU;
    }

    return time_elapsed_SU * TIME_SU2UU;  // return elapsed UU time
}

}  // namespace gpu
}  // namespace chrono