// =============================================================================
// PROJECT CHRONO - http://projectchrono.org
//
// Copyright (c) 2019 projectchrono.org
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be found
// in the LICENSE file at the top level of the distribution and at
// http://projectchrono.org/license-chrono.txt.
//
// =============================================================================
// Authors: Conlain Kelly, Nic Olsen, Dan Negrut, Ruochun Zhang
// =============================================================================

#include <algorithm>
#include <cmath>
#include <numeric>
#include <fstream>

#include "chrono_gpu/cuda/ChGpu_SMC.cuh"
#include "chrono_gpu/utils/ChGpuUtilities.h"

namespace chrono {
namespace gpu {

__host__ float ChSystemGpu_impl::computeArray3SquaredSum(std::vector<float, cudallocator<float>>& arrX,
                                                         std::vector<float, cudallocator<float>>& arrY,
                                                         std::vector<float, cudallocator<float>>& arrZ,
                                                         size_t nSpheres) {
    const unsigned int threadsPerBlock = 1024;
    unsigned int nBlocks = (nSpheres + threadsPerBlock - 1) / threadsPerBlock;
    elementalArray3Squared<float><<<nBlocks, threadsPerBlock>>>(sphere_data->sphere_stats_buffer, arrX.data(),
                                                                arrY.data(), arrZ.data(), nSpheres);
    gpuErrchk(cudaDeviceSynchronize());

    // Use CUB to reduce, and put the reduced result in the last element of the sphere_stats_buffer array.
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                           sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
    cub::DeviceReduce::Sum(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                           sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
    return *(sphere_data->sphere_stats_buffer + nSpheres);
}
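
// Note on the CUB idiom used above (and in the reductions below): the first call with a NULL
// temporary-storage pointer only computes temp_storage_bytes; the second call, given real scratch
// space, performs the reduction. A minimal sketch of the same pattern on a generic device array
// (illustrative only; the buffer and helper names here are hypothetical):
//
//     size_t temp_bytes = 0;
//     cub::DeviceReduce::Sum(NULL, temp_bytes, d_in, d_out, num_items);    // size query only
//     void* d_temp = scratch_provider(temp_bytes);                         // e.g. pDeviceMemoryScratchSpace
//     cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items);  // actual reduction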

__host__ double ChSystemGpu_impl::GetMaxParticleZ(bool getMax) {
    size_t nSpheres = sphere_local_pos_Z.size();
    if (nSpheres == 0)
        CHGPU_ERROR("ERROR! 0 particles in system! Please call this method after Initialize().\n");

    const unsigned int threadsPerBlock = 1024;
    unsigned int nBlocks = (nSpheres + threadsPerBlock - 1) / threadsPerBlock;
    elementalZLocalToGlobal<<<nBlocks, threadsPerBlock>>>(sphere_data->sphere_stats_buffer, sphere_data, nSpheres,
                                                          gran_params);
    gpuErrchk(cudaDeviceSynchronize());

    // Use CUB to find the max or min Z.
    size_t temp_storage_bytes = 0;
    if (getMax) {
        cub::DeviceReduce::Max(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
        void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
        cub::DeviceReduce::Max(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    } else {
        cub::DeviceReduce::Min(NULL, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
        void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
        cub::DeviceReduce::Min(d_scratch_space, temp_storage_bytes, sphere_data->sphere_stats_buffer,
                               sphere_data->sphere_stats_buffer + nSpheres, nSpheres);
    }
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
    return *(sphere_data->sphere_stats_buffer + nSpheres);
}
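
// Note (assumption): the host-side dereference of sphere_stats_buffer above relies on the buffer
// being host-accessible (cudallocator allocates unified/managed memory), so no explicit cudaMemcpy
// back to the host is needed after the reduction.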

// Reset broadphase data structures
void ChSystemGpu_impl::resetBroadphaseInformation() {
    // Set all the offsets to zero
    gpuErrchk(cudaMemset(SD_NumSpheresTouching.data(), 0, SD_NumSpheresTouching.size() * sizeof(unsigned int)));
    gpuErrchk(cudaMemset(SD_SphereCompositeOffsets.data(), 0, SD_SphereCompositeOffsets.size() * sizeof(unsigned int)));
    // For each SD, all the spheres touching that SD should have their ID be NULL_CHGPU_ID
    gpuErrchk(cudaMemset(spheres_in_SD_composite.data(), NULL_CHGPU_ID,
                         spheres_in_SD_composite.size() * sizeof(unsigned int)));
    gpuErrchk(cudaDeviceSynchronize());
}

// Reset sphere acceleration data structures
void ChSystemGpu_impl::resetSphereAccelerations() {
    // cache past acceleration data
    if (time_integrator == CHGPU_TIME_INTEGRATOR::CHUNG) {
        gpuErrchk(cudaMemcpy(sphere_acc_X_old.data(), sphere_acc_X.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(sphere_acc_Y_old.data(), sphere_acc_Y.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        gpuErrchk(cudaMemcpy(sphere_acc_Z_old.data(), sphere_acc_Z.data(), nSpheres * sizeof(float),
                             cudaMemcpyDeviceToDevice));
        // if we have multistep AND friction, cache old alphas
        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            gpuErrchk(cudaMemcpy(sphere_ang_acc_X_old.data(), sphere_ang_acc_X.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
            gpuErrchk(cudaMemcpy(sphere_ang_acc_Y_old.data(), sphere_ang_acc_Y.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
            gpuErrchk(cudaMemcpy(sphere_ang_acc_Z_old.data(), sphere_ang_acc_Z.data(), nSpheres * sizeof(float),
                                 cudaMemcpyDeviceToDevice));
        }
        gpuErrchk(cudaDeviceSynchronize());
    }

    // reset current accelerations to zero
    gpuErrchk(cudaMemset(sphere_acc_X.data(), 0, nSpheres * sizeof(float)));
    gpuErrchk(cudaMemset(sphere_acc_Y.data(), 0, nSpheres * sizeof(float)));
    gpuErrchk(cudaMemset(sphere_acc_Z.data(), 0, nSpheres * sizeof(float)));

    // reset torques to zero, if applicable
    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        gpuErrchk(cudaMemset(sphere_ang_acc_X.data(), 0, nSpheres * sizeof(float)));
        gpuErrchk(cudaMemset(sphere_ang_acc_Y.data(), 0, nSpheres * sizeof(float)));
        gpuErrchk(cudaMemset(sphere_ang_acc_Z.data(), 0, nSpheres * sizeof(float)));
    }
}

__global__ void compute_absv(const unsigned int nSpheres,
                             const float* velX,
                             const float* velY,
                             const float* velZ,
                             float* d_absv) {
    unsigned int my_sphere = blockIdx.x * blockDim.x + threadIdx.x;
    if (my_sphere < nSpheres) {
        float v[3] = {velX[my_sphere], velY[my_sphere], velZ[my_sphere]};
        d_absv[my_sphere] = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    }
}

__host__ float ChSystemGpu_impl::get_max_vel() const {
    float* d_absv;
    float* d_max_vel;
    float h_max_vel;
    gpuErrchk(cudaMalloc(&d_absv, nSpheres * sizeof(float)));
    gpuErrchk(cudaMalloc(&d_max_vel, sizeof(float)));

    compute_absv<<<(nSpheres + 255) / 256, 256>>>(nSpheres, pos_X_dt.data(), pos_Y_dt.data(), pos_Z_dt.data(), d_absv);

    void* d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_absv, d_max_vel, nSpheres);
    gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_absv, d_max_vel, nSpheres);
    gpuErrchk(cudaMemcpy(&h_max_vel, d_max_vel, sizeof(float), cudaMemcpyDeviceToHost));

    gpuErrchk(cudaFree(d_absv));
    gpuErrchk(cudaFree(d_max_vel));
    // free the CUB scratch space as well, so repeated queries do not leak device memory
    gpuErrchk(cudaFree(d_temp_storage));

    return h_max_vel;
}
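
// Design note: unlike the reductions above, get_max_vel() allocates its own temporary buffers with
// cudaMalloc on every call instead of reusing the solver's pDeviceMemoryScratchSpace pool. That is
// simpler, but it costs an allocation/free pair per query; callers polling the maximum velocity at
// high frequency may want to keep that in mind.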

__host__ int3 ChSystemGpu_impl::getSDTripletFromID(unsigned int SD_ID) const {
    return SDIDTriplet(SD_ID, gran_params);
}
/// Sort sphere positions by subdomain id
/// Occurs entirely on host, not intended to be efficient
/// ONLY DO AT BEGINNING OF SIMULATION
__host__ void ChSystemGpu_impl::defragment_initial_positions() {
    // key and value pointers
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_ids;

    // load sphere indices
    sphere_ids.resize(nSpheres);
    std::iota(sphere_ids.begin(), sphere_ids.end(), 0);

    // sort sphere ids by owner SD
    std::sort(sphere_ids.begin(), sphere_ids.end(),
              [&](std::size_t i, std::size_t j) { return sphere_owner_SDs.at(i) < sphere_owner_SDs.at(j); });

    std::vector<int, cudallocator<int>> sphere_pos_x_tmp;
    std::vector<int, cudallocator<int>> sphere_pos_y_tmp;
    std::vector<int, cudallocator<int>> sphere_pos_z_tmp;

    std::vector<float, cudallocator<float>> sphere_vel_x_tmp;
    std::vector<float, cudallocator<float>> sphere_vel_y_tmp;
    std::vector<float, cudallocator<float>> sphere_vel_z_tmp;

    std::vector<float, cudallocator<float>> sphere_angv_x_tmp;
    std::vector<float, cudallocator<float>> sphere_angv_y_tmp;
    std::vector<float, cudallocator<float>> sphere_angv_z_tmp;

    std::vector<not_stupid_bool, cudallocator<not_stupid_bool>> sphere_fixed_tmp;
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_owner_SDs_tmp;

    sphere_pos_x_tmp.resize(nSpheres);
    sphere_pos_y_tmp.resize(nSpheres);
    sphere_pos_z_tmp.resize(nSpheres);

    sphere_vel_x_tmp.resize(nSpheres);
    sphere_vel_y_tmp.resize(nSpheres);
    sphere_vel_z_tmp.resize(nSpheres);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        sphere_angv_x_tmp.resize(nSpheres);
        sphere_angv_y_tmp.resize(nSpheres);
        sphere_angv_z_tmp.resize(nSpheres);
    }

    sphere_fixed_tmp.resize(nSpheres);
    sphere_owner_SDs_tmp.resize(nSpheres);

    // reorder values into the new sorted order
    for (unsigned int i = 0; i < nSpheres; i++) {
        sphere_pos_x_tmp.at(i) = sphere_local_pos_X.at(sphere_ids.at(i));
        sphere_pos_y_tmp.at(i) = sphere_local_pos_Y.at(sphere_ids.at(i));
        sphere_pos_z_tmp.at(i) = sphere_local_pos_Z.at(sphere_ids.at(i));

        sphere_vel_x_tmp.at(i) = (float)pos_X_dt.at(sphere_ids.at(i));
        sphere_vel_y_tmp.at(i) = (float)pos_Y_dt.at(sphere_ids.at(i));
        sphere_vel_z_tmp.at(i) = (float)pos_Z_dt.at(sphere_ids.at(i));

        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            sphere_angv_x_tmp.at(i) = (float)sphere_Omega_X.at(sphere_ids.at(i));
            sphere_angv_y_tmp.at(i) = (float)sphere_Omega_Y.at(sphere_ids.at(i));
            sphere_angv_z_tmp.at(i) = (float)sphere_Omega_Z.at(sphere_ids.at(i));
        }

        sphere_fixed_tmp.at(i) = sphere_fixed.at(sphere_ids.at(i));
        sphere_owner_SDs_tmp.at(i) = sphere_owner_SDs.at(sphere_ids.at(i));
    }

    // swap into the correct data structures
    sphere_local_pos_X.swap(sphere_pos_x_tmp);
    sphere_local_pos_Y.swap(sphere_pos_y_tmp);
    sphere_local_pos_Z.swap(sphere_pos_z_tmp);

    pos_X_dt.swap(sphere_vel_x_tmp);
    pos_Y_dt.swap(sphere_vel_y_tmp);
    pos_Z_dt.swap(sphere_vel_z_tmp);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        sphere_Omega_X.swap(sphere_angv_x_tmp);
        sphere_Omega_Y.swap(sphere_angv_y_tmp);
        sphere_Omega_Z.swap(sphere_angv_z_tmp);
    }

    sphere_fixed.swap(sphere_fixed_tmp);
    sphere_owner_SDs.swap(sphere_owner_SDs_tmp);
}
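
// Illustrative example of the reorder above (values are made up): if sphere_owner_SDs = {7, 2, 5, 0},
// sorting the ids {0, 1, 2, 3} by owner SD yields sphere_ids = {3, 1, 2, 0}, and every per-sphere array
// is gathered through that permutation, e.g. new_pos_x[i] = old_pos_x[sphere_ids[i]]. After the swaps,
// spheres that live in the same SD sit next to each other in memory, which improves locality for the
// SD-based kernels.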

/// Same defragment function, but this time for the contact friction history arrays.
/// It is stand-alone because it should rarely be needed, so let us save some time by
/// not calling it in most of our simulations.
__host__ void ChSystemGpu_impl::defragment_friction_history(unsigned int history_offset) {
    // key and value pointers
    std::vector<unsigned int, cudallocator<unsigned int>> sphere_ids;

    // load sphere indices
    sphere_ids.resize(nSpheres);
    std::iota(sphere_ids.begin(), sphere_ids.end(), 0);

    // sort sphere ids by owner SD
    std::sort(sphere_ids.begin(), sphere_ids.end(),
              [&](std::size_t i, std::size_t j) { return sphere_owner_SDs.at(i) < sphere_owner_SDs.at(j); });

    std::vector<float3, cudallocator<float3>> history_tmp;
    std::vector<unsigned int, cudallocator<unsigned int>> partners_tmp;

    history_tmp.resize(history_offset * nSpheres);
    partners_tmp.resize(history_offset * nSpheres);

    // reorder values into the new sorted order
    for (unsigned int i = 0; i < nSpheres; i++) {
        for (unsigned int j = 0; j < history_offset; j++) {
            history_tmp.at(history_offset * i + j) = contact_history_map.at(history_offset * sphere_ids.at(i) + j);
            partners_tmp.at(history_offset * i + j) = contact_partners_map.at(history_offset * sphere_ids.at(i) + j);
        }
    }

    contact_history_map.swap(history_tmp);
    contact_partners_map.swap(partners_tmp);
}

__host__ void ChSystemGpu_impl::setupSphereDataStructures() {
    // The user should have filled user_sphere_positions with the positions to be copied into the system
    if (user_sphere_positions.size() == 0) {
        CHGPU_ERROR("ERROR! no sphere positions given!\n");
    }

    nSpheres = (unsigned int)user_sphere_positions.size();
    INFO_PRINTF("%u balls added!\n", nSpheres);
    gran_params->nSpheres = nSpheres;

    TRACK_VECTOR_RESIZE(sphere_owner_SDs, nSpheres, "sphere_owner_SDs", NULL_CHGPU_ID);

    // Allocate space for new bodies
    TRACK_VECTOR_RESIZE(sphere_local_pos_X, nSpheres, "sphere_local_pos_X", 0);
    TRACK_VECTOR_RESIZE(sphere_local_pos_Y, nSpheres, "sphere_local_pos_Y", 0);
    TRACK_VECTOR_RESIZE(sphere_local_pos_Z, nSpheres, "sphere_local_pos_Z", 0);

    TRACK_VECTOR_RESIZE(sphere_fixed, nSpheres, "sphere_fixed", 0);

    TRACK_VECTOR_RESIZE(pos_X_dt, nSpheres, "pos_X_dt", 0);
    TRACK_VECTOR_RESIZE(pos_Y_dt, nSpheres, "pos_Y_dt", 0);
    TRACK_VECTOR_RESIZE(pos_Z_dt, nSpheres, "pos_Z_dt", 0);

    // temporarily store global positions as 64-bit, discard as soon as local positions are loaded
    {
        bool user_provided_fixed = user_sphere_fixed.size() != 0;
        bool user_provided_vel = user_sphere_vel.size() != 0;
        if (user_provided_fixed && user_sphere_fixed.size() != nSpheres)
            CHGPU_ERROR("Provided fixity array has length %zu, but there are %u spheres!\n", user_sphere_fixed.size(),
                        nSpheres);
        if (user_provided_vel && user_sphere_vel.size() != nSpheres)
            CHGPU_ERROR("Provided velocity array has length %zu, but there are %u spheres!\n", user_sphere_vel.size(),
                        nSpheres);

        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_X;
        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_Y;
        std::vector<int64_t, cudallocator<int64_t>> sphere_global_pos_Z;

        sphere_global_pos_X.resize(nSpheres);
        sphere_global_pos_Y.resize(nSpheres);
        sphere_global_pos_Z.resize(nSpheres);

        // Copy from array of structs to 3 arrays
        for (unsigned int i = 0; i < nSpheres; i++) {
            float3 vec = user_sphere_positions.at(i);
            // cast to double, convert to SU, then cast to int64_t
            sphere_global_pos_X.at(i) = (int64_t)((double)vec.x / LENGTH_SU2UU);
            sphere_global_pos_Y.at(i) = (int64_t)((double)vec.y / LENGTH_SU2UU);
            sphere_global_pos_Z.at(i) = (int64_t)((double)vec.z / LENGTH_SU2UU);

            // Convert to not_stupid_bool
            sphere_fixed.at(i) = (not_stupid_bool)((user_provided_fixed) ? user_sphere_fixed[i] : false);
            if (user_provided_vel) {
                auto vel = user_sphere_vel.at(i);
                pos_X_dt.at(i) = (float)(vel.x / VEL_SU2UU);
                pos_Y_dt.at(i) = (float)(vel.y / VEL_SU2UU);
                pos_Z_dt.at(i) = (float)(vel.z / VEL_SU2UU);
            }
        }

        packSphereDataPointers();
        // Figure out the number of blocks that need to be launched to cover the box
        unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
        initializeLocalPositions<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
            sphere_data, sphere_global_pos_X.data(), sphere_global_pos_Y.data(), sphere_global_pos_Z.data(), nSpheres,
            gran_params);

        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaPeekAtLastError());
    }
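
    // Worked example of the unit conversion above (numbers are hypothetical): with
    // LENGTH_SU2UU = 1e-4 (i.e. one simulation length unit equals 1e-4 user units), a user-unit
    // position component of 0.25 becomes 0.25 / 1e-4 = 2500 simulation units, stored as int64_t.
    // Velocities are converted the same way through VEL_SU2UU before being stored as float.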

    TRACK_VECTOR_RESIZE(sphere_acc_X, nSpheres, "sphere_acc_X", 0);
    TRACK_VECTOR_RESIZE(sphere_acc_Y, nSpheres, "sphere_acc_Y", 0);
    TRACK_VECTOR_RESIZE(sphere_acc_Z, nSpheres, "sphere_acc_Z", 0);

    // The buffer array that stores any quantity the user may wish to query. We resize it here once,
    // rather than on each call, to save time in case a query function is called at high frequency.
    // The last element of this array stores the reduced value.
    TRACK_VECTOR_RESIZE(sphere_stats_buffer, nSpheres + 1, "sphere_stats_buffer", 0);

    // NOTE that this will get resized again later, this is just the first estimate
    TRACK_VECTOR_RESIZE(spheres_in_SD_composite, 2 * nSpheres, "spheres_in_SD_composite", NULL_CHGPU_ID);

    if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        // add rotational DOFs
        TRACK_VECTOR_RESIZE(sphere_Omega_X, nSpheres, "sphere_Omega_X", 0);
        TRACK_VECTOR_RESIZE(sphere_Omega_Y, nSpheres, "sphere_Omega_Y", 0);
        TRACK_VECTOR_RESIZE(sphere_Omega_Z, nSpheres, "sphere_Omega_Z", 0);

        // add torques
        TRACK_VECTOR_RESIZE(sphere_ang_acc_X, nSpheres, "sphere_ang_acc_X", 0);
        TRACK_VECTOR_RESIZE(sphere_ang_acc_Y, nSpheres, "sphere_ang_acc_Y", 0);
        TRACK_VECTOR_RESIZE(sphere_ang_acc_Z, nSpheres, "sphere_ang_acc_Z", 0);

        {
            bool user_provided_ang_vel = user_sphere_ang_vel.size() != 0;
            if (user_provided_ang_vel && user_sphere_ang_vel.size() != nSpheres)
                CHGPU_ERROR("Provided angular velocity array has length %zu, but there are %u spheres!\n",
                            user_sphere_ang_vel.size(), nSpheres);
            if (user_provided_ang_vel) {
                for (unsigned int i = 0; i < nSpheres; i++) {
                    auto ang_vel = user_sphere_ang_vel.at(i);
                    sphere_Omega_X.at(i) = (float)(ang_vel.x * TIME_SU2UU);
                    sphere_Omega_Y.at(i) = (float)(ang_vel.y * TIME_SU2UU);
                    sphere_Omega_Z.at(i) = (float)(ang_vel.z * TIME_SU2UU);
                }
            }
        }
    }

    if (time_integrator == CHGPU_TIME_INTEGRATOR::CHUNG) {
        TRACK_VECTOR_RESIZE(sphere_acc_X_old, nSpheres, "sphere_acc_X_old", 0);
        TRACK_VECTOR_RESIZE(sphere_acc_Y_old, nSpheres, "sphere_acc_Y_old", 0);
        TRACK_VECTOR_RESIZE(sphere_acc_Z_old, nSpheres, "sphere_acc_Z_old", 0);

        // friction and multistep means keep old ang acc
        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            TRACK_VECTOR_RESIZE(sphere_ang_acc_X_old, nSpheres, "sphere_ang_acc_X_old", 0);
            TRACK_VECTOR_RESIZE(sphere_ang_acc_Y_old, nSpheres, "sphere_ang_acc_Y_old", 0);
            TRACK_VECTOR_RESIZE(sphere_ang_acc_Z_old, nSpheres, "sphere_ang_acc_Z_old", 0);
        }
    }

    // On a fresh start we usually want this defragment. On a restart we usually do not:
    // we do not want the particle order to change completely every time the simulation restarts,
    // since that can be bad for visualization or debugging.
    if (defragment_on_start) {
        defragment_initial_positions();
    }

    bool user_provided_internal_data = false;
    if (gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP ||
        gran_params->friction_mode == CHGPU_FRICTION_MODE::SINGLE_STEP) {
        TRACK_VECTOR_RESIZE(contact_partners_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_partners_map",
                            NULL_CHGPU_ID);
        TRACK_VECTOR_RESIZE(contact_active_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_active_map", false);

        // If the user provides a checkpointed history array, we load it here
        bool user_provided_partner_map = user_partner_map.size() != 0;
        if (user_provided_partner_map && user_partner_map.size() != MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres)
            CHGPU_ERROR("ERROR! The user provided contact partner map has size %zu. It needs to be %u * %u!\n",
                        user_partner_map.size(), MAX_SPHERES_TOUCHED_BY_SPHERE, nSpheres);

        // Using .at (instead of []) should give a better error message when things go wrong,
        // at the cost of some speed, which does not matter for I/O
        if (user_provided_partner_map) {
            for (unsigned int i = 0; i < nSpheres; i++) {
                for (unsigned int j = 0; j < MAX_SPHERES_TOUCHED_BY_SPHERE; j++) {
                    contact_partners_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j) =
                        user_partner_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j);
                }
            }
        }

        user_provided_internal_data = user_provided_internal_data || user_provided_partner_map;
    }
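
    // Layout note: contact_partners_map (and the history arrays below) are flat arrays with
    // MAX_SPHERES_TOUCHED_BY_SPHERE consecutive slots per sphere, so the j-th contact slot of
    // sphere i lives at index MAX_SPHERES_TOUCHED_BY_SPHERE * i + j. For example, if
    // MAX_SPHERES_TOUCHED_BY_SPHERE were 12 (the actual value is defined elsewhere), sphere 3's
    // slots would occupy indices 36..47.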

    if (gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP) {
        float3 null_history = {0., 0., 0.};
        TRACK_VECTOR_RESIZE(contact_history_map, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_history_map",
                            null_history);
        TRACK_VECTOR_RESIZE(contact_duration, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "contact_duration", 0);

        // If the user provides a checkpointed history array, we load it here
        bool user_provided_friction_history = user_friction_history.size() != 0;
        if (user_provided_friction_history && user_friction_history.size() != MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres)
            CHGPU_ERROR("ERROR! The user provided contact friction history has size %zu. It needs to be %u * %u!\n",
                        user_friction_history.size(), MAX_SPHERES_TOUCHED_BY_SPHERE, nSpheres);

        if (user_provided_friction_history) {
            for (unsigned int i = 0; i < nSpheres; i++) {
                for (unsigned int j = 0; j < MAX_SPHERES_TOUCHED_BY_SPHERE; j++) {
                    float3 history_UU = user_friction_history[MAX_SPHERES_TOUCHED_BY_SPHERE * i + j];
                    float3 history_SU = make_float3(history_UU.x / LENGTH_SU2UU, history_UU.y / LENGTH_SU2UU,
                                                    history_UU.z / LENGTH_SU2UU);
                    contact_history_map.at(MAX_SPHERES_TOUCHED_BY_SPHERE * i + j) = history_SU;
                }
            }
        }

        user_provided_internal_data = user_provided_internal_data || user_provided_friction_history;
    }

    // This branch should execute rarely, if at all. If the user supplies Chrono::Gpu internal data
    // from a file, this is a restart, and defragment_on_start should then be set to false.
    // It is implemented anyway, for completeness.
    if (user_provided_internal_data && defragment_on_start) {
        defragment_friction_history(MAX_SPHERES_TOUCHED_BY_SPHERE);
    }

    // record normal contact force
    if (gran_params->recording_contactInfo == true) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(normal_contact_force, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "normal contact force",
                            null_force);
    }

    // record friction force
    if (gran_params->recording_contactInfo == true && gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(tangential_friction_force, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "tangential contact force", null_force);
    }

    // record rolling friction torque
    if (gran_params->recording_contactInfo == true && gran_params->rolling_mode != CHGPU_ROLLING_MODE::NO_RESISTANCE) {
        float3 null_force = {0.0f, 0.0f, 0.0f};
        TRACK_VECTOR_RESIZE(rolling_friction_torque, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "rolling friction torque", null_force);
        TRACK_VECTOR_RESIZE(char_collision_time, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres,
                            "characteristic collision time", 0);
        TRACK_VECTOR_RESIZE(v_rot_array, MAX_SPHERES_TOUCHED_BY_SPHERE * nSpheres, "v rot", null_force);
    }

    // make sure the right pointers are packed
    packSphereDataPointers();
}

/// <summary>
/// runSphereBroadphase goes through three stages. First, a kernel figures out, for each SD, how many spheres touch it.
/// Then, a prefix scan is done (which requires two CUB function calls) to figure out the offsets into the big fat
/// array that contains, for SD after SD, which spheres touch the SD. This last part is accomplished by a kernel call.
///
/// CAVEAT: in this approach, the outcome of the prefix scan operation will be cannibalized during the kernel call that
/// updates the big fat composite array. As such, there is a "scratch-pad" version that is used along the way.
/// </summary>
/// <returns></returns>
__host__ void ChSystemGpu_impl::runSphereBroadphase() {
    METRICS_PRINTF("Resetting broadphase info!\n");

    // reset the number of spheres per SD, the offsets in the big composite array, and the big fat composite array
    resetBroadphaseInformation();

    // First stage of the computation in this function: figure out how many spheres touch each SD.
    unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
    getNumberOfSpheresTouchingEachSD<CUDA_THREADS_PER_BLOCK>
        <<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());

    // Starting the second stage of this function call - the prefix scan operation
    unsigned int* out_ptr = SD_SphereCompositeOffsets.data();
    unsigned int* in_ptr = SD_NumSpheresTouching.data();
    gpuErrchk(cudaMemcpy(out_ptr, in_ptr, nSDs * sizeof(unsigned int), cudaMemcpyDeviceToDevice));

    // cold run; CUB determines the amount of storage it needs (since first argument is NULL pointer)
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, in_ptr, out_ptr, nSDs);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());

    // give CUB needed temporary storage on the device
    void* d_scratch_space = (void*)stateOfSolver_resources.pDeviceMemoryScratchSpace(temp_storage_bytes);
    // Run the actual exclusive prefix sum
    cub::DeviceScan::ExclusiveSum(d_scratch_space, temp_storage_bytes, in_ptr, out_ptr, nSDs);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
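
    // Worked example of the scan (values are made up): if nSDs == 4 and SD_NumSpheresTouching
    // (in_ptr) holds {3, 0, 2, 5}, the exclusive prefix sum leaves SD_SphereCompositeOffsets
    // (out_ptr) as {0, 3, 3, 5}; SD k's spheres then occupy indices [out_ptr[k], out_ptr[k] + in_ptr[k])
    // of the composite array, and the total entry count is out_ptr[nSDs - 1] + in_ptr[nSDs - 1] = 10.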

    // Beginning of the last stage of computation in this function: assembling the big composite array.
    // num_entries: total number of sphere entries to record in the big fat composite array
    unsigned int num_entries = out_ptr[nSDs - 1] + in_ptr[nSDs - 1];
    spheres_in_SD_composite.resize(num_entries, NULL_CHGPU_ID);
    sphere_data->spheres_in_SD_composite = spheres_in_SD_composite.data();

    // Copy the offsets into the scratch pad; the subsequent kernel call would step on the outcome of the prefix scan
    gpuErrchk(cudaMemcpy(SD_SphereCompositeOffsets_ScratchPad.data(), SD_SphereCompositeOffsets.data(),
                         nSDs * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
    // Populate the composite array; in the process, the content of the scratch pad will be modified
    // nBlocks = (MAX_SDs_TOUCHED_BY_SPHERE * nSpheres + 2*CUDA_THREADS_PER_BLOCK - 1) / (2*CUDA_THREADS_PER_BLOCK);
    // populateSpheresInEachSD<<<nBlocks, 2*CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / (CUDA_THREADS_PER_BLOCK);
    populateSpheresInEachSD<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(sphere_data, nSpheres, gran_params);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaPeekAtLastError());
}

__host__ void ChSystemGpu_impl::updateBCPositions() {
    for (unsigned int i = 0; i < BC_params_list_UU.size(); i++) {
        auto bc_type = BC_type_list.at(i);
        const BC_params_t<float, float3>& params_UU = BC_params_list_UU.at(i);
        BC_params_t<int64_t, int64_t3>& params_SU = BC_params_list_SU.at(i);
        auto offset_function = BC_offset_function_list.at(i);
        setBCOffset(bc_type, params_UU, params_SU, offset_function(elapsedSimTime));
    }

    if (!BD_is_fixed) {
        double3 new_BD_offset = BDOffsetFunction(elapsedSimTime);

        int64_t3 bd_offset_SU = {0, 0, 0};
        bd_offset_SU.x = (int64_t)(new_BD_offset.x / LENGTH_SU2UU);
        bd_offset_SU.y = (int64_t)(new_BD_offset.y / LENGTH_SU2UU);
        bd_offset_SU.z = (int64_t)(new_BD_offset.z / LENGTH_SU2UU);

        int64_t old_frame_X = gran_params->BD_frame_X;
        int64_t old_frame_Y = gran_params->BD_frame_Y;
        int64_t old_frame_Z = gran_params->BD_frame_Z;

        gran_params->BD_frame_X = bd_offset_SU.x + BD_rest_frame_SU.x;
        gran_params->BD_frame_Y = bd_offset_SU.y + BD_rest_frame_SU.y;
        gran_params->BD_frame_Z = bd_offset_SU.z + BD_rest_frame_SU.z;

        unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;

        int64_t3 offset_delta = {0, 0, 0};

        // if the frame X increases, the local X should decrease
        offset_delta.x = old_frame_X - gran_params->BD_frame_X;
        offset_delta.y = old_frame_Y - gran_params->BD_frame_Y;
        offset_delta.z = old_frame_Z - gran_params->BD_frame_Z;

        // printf("offset is %lld, %lld, %lld\n", offset_delta.x, offset_delta.y, offset_delta.z);

        packSphereDataPointers();

        applyBDFrameChange<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(offset_delta, sphere_data, nSpheres, gran_params);

        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
}
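
// Sign convention example for the frame update above (numbers are made up): if the big domain frame
// moves by +10 SU in X between two calls, offset_delta.x = old_frame_X - new_frame_X = -10, so
// applyBDFrameChange shifts every sphere's frame-local X coordinate by -10 SU, keeping each sphere's
// global position unchanged while the frame moves underneath it.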

__host__ double ChSystemGpu_impl::AdvanceSimulation(float duration) {
    // Figure out the number of blocks that need to be launched to cover the box
    unsigned int nBlocks = (nSpheres + CUDA_THREADS_PER_BLOCK - 1) / CUDA_THREADS_PER_BLOCK;
    // Settling simulation loop.
    float duration_SU = (float)(duration / TIME_SU2UU);
    unsigned int nsteps = (unsigned int)std::round(duration_SU / stepSize_SU);
    METRICS_PRINTF("advancing by %f at timestep %f, %u timesteps at approx user timestep %f\n", duration_SU,
                   stepSize_SU, nsteps, duration / nsteps);
    float time_elapsed_SU = 0;  // time elapsed in this advance call

    packSphereDataPointers();

    // Run the simulation; there are aggressive synchronizations because we want no race conditions
    for (unsigned int n = 0; n < nsteps; n++) {
        updateBCPositions();
        runSphereBroadphase();
        resetSphereAccelerations();
        resetBCForces();

        METRICS_PRINTF("Starting computeSphereForces!\n");

        if (gran_params->friction_mode == CHGPU_FRICTION_MODE::FRICTIONLESS) {
            // Compute sphere-sphere forces
            computeSphereForces_frictionless_matBased<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(
                sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                (unsigned int)BC_params_list_SU.size());
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        } else if (gran_params->friction_mode == CHGPU_FRICTION_MODE::SINGLE_STEP ||
                   gran_params->friction_mode == CHGPU_FRICTION_MODE::MULTI_STEP) {
            // figure out who is contacting
            determineContactPairs<<<nSDs, MAX_COUNT_OF_SPHERES_PER_SD>>>(sphere_data, gran_params);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            if (gran_params->use_mat_based == true) {
                computeSphereContactForces_matBased<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
                    sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                    (unsigned int)BC_params_list_SU.size(), nSpheres);

            } else {
                computeSphereContactForces<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(
                    sphere_data, gran_params, BC_type_list.data(), BC_params_list_SU.data(),
                    (unsigned int)BC_params_list_SU.size(), nSpheres);
            }

            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        METRICS_PRINTF("Starting integrateSpheres!\n");
        integrateSpheres<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        if (gran_params->friction_mode != CHGPU_FRICTION_MODE::FRICTIONLESS) {
            const unsigned int nThreadsUpdateHist = 2 * CUDA_THREADS_PER_BLOCK;
            unsigned int fricMapSize = nSpheres * MAX_SPHERES_TOUCHED_BY_SPHERE;
            unsigned int nBlocksFricHistoryPostProcess = (fricMapSize + nThreadsUpdateHist - 1) / nThreadsUpdateHist;

            METRICS_PRINTF("Update Friction Data!\n");

            updateFrictionData<<<nBlocksFricHistoryPostProcess, nThreadsUpdateHist>>>(fricMapSize, sphere_data,
                                                                                      gran_params);

            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
            METRICS_PRINTF("Update angular velocity.\n");
            updateAngVels<<<nBlocks, CUDA_THREADS_PER_BLOCK>>>(stepSize_SU, sphere_data, nSpheres, gran_params);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        elapsedSimTime += (float)(stepSize_SU * TIME_SU2UU);  // Advance current time
        time_elapsed_SU += stepSize_SU;
    }

    return time_elapsed_SU * TIME_SU2UU;  // return elapsed UU time
}
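
// Step-count arithmetic example for AdvanceSimulation (numbers are hypothetical): with a user-unit
// duration of 0.01 s, TIME_SU2UU = 2e-6 s per simulation time unit, and stepSize_SU = 1, the call
// performs duration_SU = 0.01 / 2e-6 = 5000 SU of simulated time in nsteps = round(5000 / 1) = 5000
// integration steps, and returns 5000 * 2e-6 = 0.01 s of elapsed user-unit time.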
}  // namespace gpu
}  // namespace chrono