1 // =============================================================================
2 // PROJECT CHRONO - http://projectchrono.org
3 //
4 // Copyright (c) 2014 projectchrono.org
5 // All rights reserved.
6 //
7 // Use of this source code is governed by a BSD-style license that can be found
8 // in the LICENSE file at the top level of the distribution and at
9 // http://projectchrono.org/license-chrono.txt.
10 //
11 // =============================================================================
12 // Author: Milad Rakhsha
13 // =============================================================================
14 #include <thrust/extrema.h>
15 #include <thrust/sort.h>
16 #include "chrono_fsi/physics/ChFsiForceIISPH.cuh"
17 #include "chrono_fsi/physics/ChSphGeneral.cuh"
18 #define RESOLUTION_LENGTH_MULT_IISPH 2.0
19 
20 //==========================================================================================================================================
21 namespace chrono {
22 namespace fsi {
23 // double precision atomic add function
datomicAdd(double * address,double val)24 __device__ inline double datomicAdd(double* address, double val) {
25     unsigned long long int* address_as_ull = (unsigned long long int*)address;
26 
27     unsigned long long int old = *address_as_ull, assumed;
28 
29     do {
30         assumed = old;
31         old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
32     } while (assumed != old);
33 
34     return __longlong_as_double(old);
35 }
36 
ChFsiForceIISPH(std::shared_ptr<ChBce> otherBceWorker,std::shared_ptr<SphMarkerDataD> otherSortedSphMarkersD,std::shared_ptr<ProximityDataD> otherMarkersProximityD,std::shared_ptr<FsiGeneralData> otherFsiGeneralData,std::shared_ptr<SimParams> otherParamsH,std::shared_ptr<NumberOfObjects> otherNumObjects)37 ChFsiForceIISPH::ChFsiForceIISPH(std::shared_ptr<ChBce> otherBceWorker,
38                                  std::shared_ptr<SphMarkerDataD> otherSortedSphMarkersD,
39                                  std::shared_ptr<ProximityDataD> otherMarkersProximityD,
40                                  std::shared_ptr<FsiGeneralData> otherFsiGeneralData,
41                                  std::shared_ptr<SimParams> otherParamsH,
42                                  std::shared_ptr<NumberOfObjects> otherNumObjects)
43     : ChFsiForce(otherBceWorker,
44                  otherSortedSphMarkersD,
45                  otherMarkersProximityD,
46                  otherFsiGeneralData,
47                  otherParamsH,
48                  otherNumObjects) {}
49 //--------------------------------------------------------------------------------------------------------------------------------
~ChFsiForceIISPH()50 ChFsiForceIISPH::~ChFsiForceIISPH() {}
51 //--------------------------------------------------------------------------------------------------------------------------------
Finalize()52 void ChFsiForceIISPH::Finalize() {
53     ChFsiForce::Finalize();
54     cudaMemcpyToSymbolAsync(paramsD, paramsH.get(), sizeof(SimParams));
55     cudaMemcpyToSymbolAsync(numObjectsD, numObjectsH.get(), sizeof(NumberOfObjects));
56     cudaMemcpyFromSymbol(paramsH.get(), paramsD, sizeof(SimParams));
57     cudaDeviceSynchronize();
58 }
59 //--------------------------------------------------------------------------------------------------------------------------------
V_i_np__AND__d_ii_kernel(Real4 * sortedPosRad,Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real3 * d_ii,Real3 * V_i_np,Real * sumWij_inv,Real * G_tensor,uint * cellStart,uint * cellEnd,Real delta_t,const size_t numAllMarkers,volatile bool * isErrorD)60 __global__ void V_i_np__AND__d_ii_kernel(Real4* sortedPosRad,  // input: sorted positions
61                                          Real3* sortedVelMas,
62                                          Real4* sortedRhoPreMu,
63                                          Real3* d_ii,
64                                          Real3* V_i_np,
65                                          Real* sumWij_inv,
66                                          Real* G_tensor,
67                                          uint* cellStart,
68                                          uint* cellEnd,
69                                          Real delta_t,
70                                          const size_t numAllMarkers,
71                                          volatile bool* isErrorD) {
72     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
73     if (i_idx >= numAllMarkers || sortedRhoPreMu[i_idx].w <= -2) {
74         return;
75     }
76     //    sortedRhoPreMu[i_idx].x = sortedRhoPreMu[i_idx].x / sumWij_inv[i_idx];
77     Real h_i = sortedPosRad[i_idx].w;
78     Real m_i = h_i * h_i * h_i * paramsD.rho0;
79 
80     Real mu_0 = paramsD.mu0;
81     Real epsilon = paramsD.epsMinMarkersDis;
82     Real dT = delta_t;
83     Real3 source_term = paramsD.gravity + paramsD.bodyForce3;
84     Real RHO_0 = paramsD.rho0;
85     if (sortedRhoPreMu[i_idx].x < EPSILON) {
86         printf("density is %f,ref density= %f\n", sortedRhoPreMu[i_idx].x, RHO_0);
87     }
88 
89     Real3 posi = mR3(sortedPosRad[i_idx]);
90     Real3 Veli = sortedVelMas[i_idx];
91     Real Rhoi = sortedRhoPreMu[i_idx].x;
92     Real3 My_d_ii = mR3(0);
93     Real3 My_F_i_np = mR3(0);
94 
95     // get address in grid
96     int3 gridPos = calcGridPos(posi);
97 
98     for (int z = -1; z <= 1; z++) {
99         for (int y = -1; y <= 1; y++) {
100             for (int x = -1; x <= 1; x++) {
101                 int3 neighbourPos = gridPos + mI3(x, y, z);
102                 uint gridHash = calcGridHash(neighbourPos);
103                 // get start of bucket for this cell
104                 uint startIndex = cellStart[gridHash];
105                 if (startIndex != 0xffffffff) {  // cell is not empty
106                     uint endIndex = cellEnd[gridHash];
107                     for (uint j = startIndex; j < endIndex; j++) {
108                         Real3 posj = mR3(sortedPosRad[j]);
109                         Real3 rij = Distance(posi, posj);
110                         Real d = length(rij);
111 
112                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
113                             continue;
114                         Real3 eij = rij / d;
115 
116                         Real3 Velj = sortedVelMas[j];
117                         Real Rhoj = sortedRhoPreMu[j].x;
118                         Real h_j = sortedPosRad[j].w;
119 
120                         if (Rhoj == 0) {
121                             printf("Bug F_i_np__AND__d_ii_kernel i=%d j=%d, hi=%f, hj=%f\n", i_idx, j, h_i, h_j);
122                         }
123 
124                         Real m_j = h_j * h_j * h_j * paramsD.rho0;
125                         Real h_ij = 0.5 * (h_j + h_i);
126                         Real3 grad_ij = GradWh(rij, h_ij);
127                         My_d_ii += m_j * (-(dT * dT) / (Rhoi * Rhoi)) * grad_ij;
128 
129                         Real Rho_bar = (Rhoj + Rhoi) * 0.5;
130                         Real3 V_ij = (Veli - Velj);
131                         //                        Real nu = mu_0 * paramsD.HSML * 320 / Rho_bar;
132                         //                        Real3 muNumerator = nu * fmin(0.0, dot(rij, V_ij)) * grad_ij;
133                         Real3 muNumerator = 2 * mu_0 * dot(rij, grad_ij) * V_ij;
134                         Real muDenominator = (Rho_bar * Rho_bar) * (d * d + h_ij * h_ij * epsilon);
135                         //                        if ((sortedRhoPreMu[i_idx].w < 0 && sortedRhoPreMu[j].w < 0))
136                         //                        if (sortedRhoPreMu[i_idx].w < 0 || (sortedRhoPreMu[i_idx].w >= 0 &&
137                         //                        sortedRhoPreMu[j].w < 0))
138                         My_F_i_np += m_j * muNumerator / muDenominator;
139 
140                         Real Wd = W3h(d, h_ij);
141                         My_F_i_np -= paramsD.kappa / m_i * m_j * Wd * rij;
142                     }
143                 }
144             }
145         }
146     }
147 
148     //    if (!paramsD.Conservative_Form)
149     //        My_F_i_np = mu_0 * LaplacainVi;
150 
151     My_F_i_np *= m_i;
152 
153     My_F_i_np += m_i * source_term;
154     d_ii[i_idx] = My_d_ii;
155     V_i_np[i_idx] = (My_F_i_np * dT + Veli);  // This does not contain m_0?
156 }
157 //--------------------------------------------------------------------------------------------------------------------------------
158 //--------------------------------------------------------------------------------------------------------------------------------
Rho_np_AND_a_ii_AND_sum_m_GradW(Real4 * sortedPosRad,Real4 * sortedRhoPreMu,Real * rho_np,Real * a_ii,Real * p_old,Real3 * V_np,Real3 * d_ii,Real3 * sum_m_GradW,uint * cellStart,uint * cellEnd,Real delta_t,const size_t numAllMarkers,volatile bool * isErrorD)159 __global__ void Rho_np_AND_a_ii_AND_sum_m_GradW(Real4* sortedPosRad,
160                                                 Real4* sortedRhoPreMu,
161                                                 Real* rho_np,  // Write
162                                                 Real* a_ii,    // Write
163                                                 Real* p_old,   // Write
164                                                 Real3* V_np,   // Read
165                                                 Real3* d_ii,   // Read
166                                                 Real3* sum_m_GradW,
167                                                 uint* cellStart,
168                                                 uint* cellEnd,
169                                                 Real delta_t,
170                                                 const size_t numAllMarkers,
171                                                 volatile bool* isErrorD) {
172     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
173     if (i_idx >= numAllMarkers || sortedRhoPreMu[i_idx].w <= -2) {
174         return;
175     }
176 
177     Real h_i = sortedPosRad[i_idx].w;
178     Real m_i = h_i * h_i * h_i * paramsD.rho0;
179 
180     Real3 posi = mR3(sortedPosRad[i_idx]);
181     Real3 Veli_np = V_np[i_idx];
182     Real Rho_i = sortedRhoPreMu[i_idx].x;
183     Real3 my_d_ii = d_ii[i_idx];
184     Real rho_temp = 0;
185     Real my_a_ii = 0;
186     Real3 My_sum_m_gradW = mR3(0);
187     Real dT = delta_t;
188     // get address in gridj
189     int3 gridPos = calcGridPos(posi);
190 
191     //
192     // examine neighbouring cells
193     for (int z = -1; z <= 1; z++) {
194         for (int y = -1; y <= 1; y++) {
195             for (int x = -1; x <= 1; x++) {
196                 int3 neighbourPos = gridPos + mI3(x, y, z);
197                 uint gridHash = calcGridHash(neighbourPos);
198                 // get start of bucket for this cell
199                 uint startIndex = cellStart[gridHash];
200                 if (startIndex != 0xffffffff) {  // cell is not empty
201                     // iterate over particles in this cell
202                     uint endIndex = cellEnd[gridHash];
203 
204                     for (uint j = startIndex; j < endIndex; j++) {
205                         Real3 posj = mR3(sortedPosRad[j]);
206                         Real3 dist3 = Distance(posi, posj);
207                         Real d = length(dist3);
208                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
209                             continue;
210                         Real h_j = sortedPosRad[j].w;
211                         Real m_j = h_j * h_j * h_j * paramsD.rho0;
212                         Real h_ij = 0.5 * (h_j + h_i);
213                         Real3 Velj_np = V_np[j];
214                         Real3 grad_i_wij = GradWh(dist3, h_ij);
215                         rho_temp += m_j * dot((Veli_np - Velj_np), grad_i_wij);
216                         Real3 d_ji = m_i * (-(dT * dT) / (Rho_i * Rho_i)) * (-grad_i_wij);
217                         my_a_ii += m_j * dot((my_d_ii - d_ji), grad_i_wij);
218                         My_sum_m_gradW += m_j * grad_i_wij;
219                     }
220                 }
221             }
222         }
223     }
224     rho_np[i_idx] = dT * rho_temp + sortedRhoPreMu[i_idx].x;
225 
226     // Note: a_ii can become zero and when this can cause divide by 0 issues for free particles
227     a_ii[i_idx] = abs(my_a_ii) > EPSILON ? my_a_ii : 1.0;
228     sum_m_GradW[i_idx] = My_sum_m_gradW;
229 
230     p_old[i_idx] = sortedRhoPreMu[i_idx].y;  // = 1000;  // Note that this is outside of the for loop
231 }
232 //--------------------------------------------------------------------------------------------------------------------------------
233 
Calc_dij_pj(Real3 * dij_pj,Real3 * F_p,Real3 * d_ii,Real4 * sortedPosRad,Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real * p_old,uint * cellStart,uint * cellEnd,Real delta_t,const size_t numAllMarkers,volatile bool * isErrorD)234 __global__ void Calc_dij_pj(Real3* dij_pj,  // write
235                             Real3* F_p,     // Write
236                             Real3* d_ii,    // Read
237                             Real4* sortedPosRad,
238                             Real3* sortedVelMas,
239                             Real4* sortedRhoPreMu,
240                             Real* p_old,
241                             uint* cellStart,
242                             uint* cellEnd,
243                             Real delta_t,
244                             const size_t numAllMarkers,
245                             volatile bool* isErrorD) {
246     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
247     if (i_idx >= numAllMarkers || sortedRhoPreMu[i_idx].w <= -2) {
248         return;
249     }
250     Real h_i = sortedPosRad[i_idx].w;
251     Real m_i = h_i * h_i * h_i * paramsD.rho0;
252 
253     Real3 my_F_p = mR3(0);
254     Real p_i_old = p_old[i_idx];
255     Real3 pos_i = mR3(sortedPosRad[i_idx]);
256     Real Rho_i = sortedRhoPreMu[i_idx].x;
257     if (sortedRhoPreMu[i_idx].x < EPSILON) {
258         printf("(Calc_dij_pj) My density is %f in Calc_dij_pj\n", sortedRhoPreMu[i_idx].x);
259     }
260     Real dT = delta_t;
261 
262     Real3 My_dij_pj = mR3(0);
263     int3 gridPos = calcGridPos(pos_i);
264     for (int z = -1; z <= 1; z++) {
265         for (int y = -1; y <= 1; y++) {
266             for (int x = -1; x <= 1; x++) {
267                 int3 neighbourPos = gridPos + mI3(x, y, z);
268                 uint gridHash = calcGridHash(neighbourPos);
269                 // get start of bucket for this cell
270                 uint startIndex = cellStart[gridHash];
271                 if (startIndex != 0xffffffff) {  // cell is not empty
272                     // iterate over particles in this cell
273                     uint endIndex = cellEnd[gridHash];
274 
275                     for (uint j = startIndex; j < endIndex; j++) {
276                         Real3 pos_j = mR3(sortedPosRad[j]);
277                         Real3 dist3 = Distance(pos_i, pos_j);
278                         Real d = length(dist3);
279                         ////CHECK THIS CONDITION!!!
280                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
281                             continue;
282                         Real h_j = sortedPosRad[j].w;
283                         Real m_j = h_j * h_j * h_j * paramsD.rho0;
284                         Real h_ij = 0.5 * (h_j + h_i);
285                         Real3 grad_i_wij = GradWh(dist3, h_ij);
286                         Real Rho_j = sortedRhoPreMu[j].x;
287                         Real p_j_old = p_old[j];
288                         My_dij_pj += m_j * (-(dT * dT) / (Rho_j * Rho_j)) * grad_i_wij * p_j_old;
289                         my_F_p += m_j * ((p_i_old / (Rho_i * Rho_i)) + (p_j_old / (Rho_j * Rho_j))) * grad_i_wij;
290                     }
291                 }
292             }
293         }
294     }
295     dij_pj[i_idx] = My_dij_pj;
296     F_p[i_idx] = -m_i * my_F_p;
297 }
298 
299 //--------------------------------------------------------------------------------------------------------------------------------
CalcNumber_Contacts(uint * numContacts,Real4 * sortedPosRad,Real4 * sortedRhoPreMu,uint * cellStart,uint * cellEnd,const size_t numAllMarkers,volatile bool * isErrorD)300 __global__ void CalcNumber_Contacts(uint* numContacts,
301                                     Real4* sortedPosRad,
302                                     Real4* sortedRhoPreMu,
303                                     uint* cellStart,
304                                     uint* cellEnd,
305                                     const size_t numAllMarkers,
306                                     volatile bool* isErrorD) {
307     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
308     if (i_idx >= numAllMarkers) {
309         return;
310     }
311     if (sortedRhoPreMu[i_idx].w <= -2) {
312         numContacts[i_idx] = 1;
313         return;
314     }
315 
316     Real h_i = sortedPosRad[i_idx].w;
317     //    Real m_i = h_i * h_i * h_i * paramsD.rho0;
318 
319     int myType = sortedRhoPreMu[i_idx].w;
320     Real3 pos_i = mR3(sortedPosRad[i_idx]);
321 
322     uint numCol[400];
323     int counter = 1;
324     numCol[0] = i_idx;  // The first one is always the idx of the marker itself
325 
326     int3 gridPos = calcGridPos(pos_i);
327     for (int z = -1; z <= 1; z++) {
328         for (int y = -1; y <= 1; y++) {
329             for (int x = -1; x <= 1; x++) {
330                 int3 neighbourPos = gridPos + mI3(x, y, z);
331                 uint gridHash = calcGridHash(neighbourPos);
332                 // get start of bucket for this cell
333                 uint startIndex = cellStart[gridHash];
334                 if (startIndex != 0xffffffff) {
335                     // iterate over particles in this cell
336                     uint endIndex = cellEnd[gridHash];
337                     for (uint j = startIndex; j < endIndex; j++) {
338                         Real3 pos_j = mR3(sortedPosRad[j]);
339                         Real3 dist3 = Distance(pos_i, pos_j);
340                         Real d = length(dist3);
341                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
342                             continue;
343                         bool AlreadyHave = false;
344                         for (uint findCol = 1; findCol <= counter; findCol++) {
345                             if (numCol[findCol] == j) {
346                                 AlreadyHave = true;
347                                 continue;
348                             }
349                         }
350 
351                         // Room for improvment ...
352                         if (!AlreadyHave) {
353                             numCol[counter] = j;
354                             counter++;
355                             // Do not count BCE-BCE interactions...
356                             if (myType >= 0 && sortedRhoPreMu[j].w >= 0 && paramsD.bceType == BceVersion::ADAMI)
357                                 counter--;
358                         }
359 
360                         if (myType != -1)  // For BCE no need to go deeper than this...
361                             continue;
362 
363                         Real h_j = sortedPosRad[j].w;
364                         int3 gridPosJ = calcGridPos(pos_j);
365 
366                         for (int zz = -1; zz <= 1; zz++) {
367                             for (int yy = -1; yy <= 1; yy++) {
368                                 for (int xx = -1; xx <= 1; xx++) {
369                                     int3 neighbourPosJ = gridPosJ + mI3(xx, yy, zz);
370                                     uint gridHashJ = calcGridHash(neighbourPosJ);
371                                     uint startIndexJ = cellStart[gridHashJ];
372                                     if (startIndexJ != 0xffffffff) {  // cell is not empty
373                                         uint endIndexJ = cellEnd[gridHashJ];
374                                         for (uint k = startIndexJ; k < endIndexJ; k++) {
375                                             Real3 pos_k = mR3(sortedPosRad[k]);
376                                             Real3 dist3jk = Distance(pos_j, pos_k);
377                                             Real djk = length(dist3jk);
378                                             if (djk > RESOLUTION_LENGTH_MULT * h_j || k == j || k == i_idx ||
379                                                 sortedRhoPreMu[k].w <= -2)
380                                                 continue;
381                                             bool AlreadyHave2 = false;
382                                             for (uint findCol = 1; findCol <= counter; findCol++) {
383                                                 if (numCol[findCol] == k) {
384                                                     AlreadyHave2 = true;
385                                                     continue;
386                                                 }
387                                             }
388                                             if (!AlreadyHave2) {
389                                                 numCol[counter] = k;
390                                                 counter++;
391                                             }
392                                         }
393                                     }
394                                 }
395                             }
396                         }
397 
398                     }
399                 }
400             }
401         }
402     }
403 
404     numContacts[i_idx] = counter + 10;
405 }
406 
407 //--------------------------------------------------------------------------------------------------------------------------------
Calc_summGradW(Real3 * summGradW,Real4 * sortedPosRad,Real4 * sortedRhoPreMu,uint * cellStart,uint * cellEnd,const size_t numAllMarkers,volatile bool * isErrorD)408 __global__ void Calc_summGradW(Real3* summGradW,  // write
409                                Real4* sortedPosRad,
410                                Real4* sortedRhoPreMu,
411                                uint* cellStart,
412                                uint* cellEnd,
413                                const size_t numAllMarkers,
414                                volatile bool* isErrorD) {
415     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
416     if (i_idx >= numAllMarkers) {
417         return;
418     }
419     if (sortedRhoPreMu[i_idx].w <= -2) {
420         return;
421     }
422     Real h_i = sortedPosRad[i_idx].w;
423     //    Real m_i = h_i * h_i * h_i * paramsD.rho0;
424 
425     Real3 pos_i = mR3(sortedPosRad[i_idx]);
426     Real3 My_summgradW = mR3(0);
427     //    Real dT = paramsD.dT;
428     int3 gridPos = calcGridPos(pos_i);
429     for (int z = -1; z <= 1; z++) {
430         for (int y = -1; y <= 1; y++) {
431             for (int x = -1; x <= 1; x++) {
432                 int3 neighbourPos = gridPos + mI3(x, y, z);
433                 uint gridHash = calcGridHash(neighbourPos);
434                 // get start of bucket for this cell
435                 uint startIndex = cellStart[gridHash];
436                 if (startIndex != 0xffffffff) {  // cell is not empty
437                     // iterate over particles in this cell
438                     uint endIndex = cellEnd[gridHash];
439 
440                     for (uint j = startIndex; j < endIndex; j++) {
441                         Real3 pos_j = mR3(sortedPosRad[j]);
442                         Real3 dist3 = Distance(pos_i, pos_j);
443                         Real d = length(dist3);
444                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
445                             continue;
446                         Real h_j = sortedPosRad[j].w;
447                         Real m_j = h_j * h_j * h_j * paramsD.rho0;
448                         Real h_ij = 0.5 * (h_j + h_i);
449                         Real3 grad_i_wij = GradWh(dist3, h_ij);
450                         My_summgradW += m_j * grad_i_wij;
451                     }
452                 }
453             }
454         }
455     }
456     summGradW[i_idx] = My_summgradW;
457 }
458 
459 //--------------------------------------------------------------------------------------------------------------------------------
Calc_BC_aij_Bi(const uint i_idx,Real * csrValA,uint * csrColIndA,unsigned long int * GlobalcsrColIndA,uint * numContacts,Real * a_ii,Real * B_i,Real4 * sortedPosRad,Real3 * sortedVelMas,const Real4 * sortedRhoPreMu,Real3 * V_new,Real * p_old,Real3 * Normals,Real * G_i,Real * sumWij_inv,Real4 * qD,Real3 * rigidSPH_MeshPos_LRF_D,Real3 * posRigid_fsiBodies_D,Real4 * velMassRigid_fsiBodies_D,Real3 * omegaVelLRF_fsiBodies_D,Real3 * accRigid_fsiBodies_D,Real3 * omegaAccLRF_fsiBodies_D,uint * rigidIdentifierD,Real3 * pos_fsi_fea_D,Real3 * vel_fsi_fea_D,Real3 * acc_fsi_fea_D,uint * FlexIdentifierD,const int numFlex1D,uint2 * CableElementsNodes,uint4 * ShellelementsNodes,int4 updatePortion,uint * gridMarkerIndexD,const uint * cellStart,const uint * cellEnd,const size_t numAllMarkers,bool IsSPARSE)460 __device__ void Calc_BC_aij_Bi(const uint i_idx,
461                                Real* csrValA,
462                                uint* csrColIndA,
463                                unsigned long int* GlobalcsrColIndA,
464                                uint* numContacts,
465                                // The above 4 vectors are used for CSR form.
466                                Real* a_ii,  // write
467                                Real* B_i,
468                                Real4* sortedPosRad,
469                                Real3* sortedVelMas,
470                                const Real4* sortedRhoPreMu,
471                                Real3* V_new,
472                                Real* p_old,
473                                Real3* Normals,
474                                Real* G_i,
475                                Real* sumWij_inv,
476 
477                                Real4* qD,
478                                Real3* rigidSPH_MeshPos_LRF_D,
479                                Real3* posRigid_fsiBodies_D,
480                                Real4* velMassRigid_fsiBodies_D,
481                                Real3* omegaVelLRF_fsiBodies_D,
482                                Real3* accRigid_fsiBodies_D,
483                                Real3* omegaAccLRF_fsiBodies_D,
484                                uint* rigidIdentifierD,
485 
486                                Real3* pos_fsi_fea_D,
487                                Real3* vel_fsi_fea_D,
488                                Real3* acc_fsi_fea_D,
489                                uint* FlexIdentifierD,
490                                const int numFlex1D,
491                                uint2* CableElementsNodes,
492                                uint4* ShellelementsNodes,
493 
494                                int4 updatePortion,
495                                uint* gridMarkerIndexD,
496                                const uint* cellStart,
497                                const uint* cellEnd,
498                                const size_t numAllMarkers,
499                                bool IsSPARSE) {
500     uint csrStartIdx = numContacts[i_idx] + 1;
501     uint csrEndIdx = numContacts[i_idx + 1];
502 
503     Real h_i = sortedPosRad[i_idx].w;
504     //    Real m_i = h_i * h_i * h_i * paramsD.rho0;
505     Real3 my_normal = Normals[i_idx];
506 
507     Real3 source_term = paramsD.gravity + paramsD.bodyForce3;
508     //  if (bceIndex >= numObjectsD.numRigid_SphMarkers) {
509     //    return;
510     //  }
511 
512     //  int Original_idx = gridMarkerIndexD[i_idx];
513     Real3 myAcc = mR3(0.0);
514     Real3 V_prescribed = mR3(0.0);
515 
516     //    if (!(sortedRhoPreMu[i_idx].w >= 0 && sortedRhoPreMu[i_idx].w <= 3))
517     //        printf("type of marker is %f\n", sortedRhoPreMu[i_idx].w);
518 
519     BCE_Vel_Acc(i_idx, myAcc, V_prescribed, sortedPosRad, updatePortion, gridMarkerIndexD, qD, rigidSPH_MeshPos_LRF_D,
520                 posRigid_fsiBodies_D, velMassRigid_fsiBodies_D, omegaVelLRF_fsiBodies_D, accRigid_fsiBodies_D,
521                 omegaAccLRF_fsiBodies_D, rigidIdentifierD, pos_fsi_fea_D, vel_fsi_fea_D, acc_fsi_fea_D, FlexIdentifierD,
522                 numFlex1D, CableElementsNodes, ShellelementsNodes);
523 
524     for (int c = csrStartIdx; c < csrEndIdx; c++) {
525         csrValA[c] = 0;
526         csrColIndA[c] = i_idx;
527         GlobalcsrColIndA[c] = i_idx + numAllMarkers * i_idx;
528     }
529 
530     // if ((csrEndIdx - csrStartIdx) != uint(0)) {
531     Real3 numeratorv = mR3(0);
532     Real denumenator = 0;
533     Real pRHS = 0;
534     //  Real Rho_i = sortedRhoPreMu[i_idx].x;
535     Real3 pos_i = mR3(sortedPosRad[i_idx]);
536     // get address in grid
537     int3 gridPos = calcGridPos(pos_i);
538 
539     uint counter = 0;
540     for (int z = -1; z <= 1; z++) {
541         for (int y = -1; y <= 1; y++) {
542             for (int x = -1; x <= 1; x++) {
543                 int3 neighbourPos = gridPos + mI3(x, y, z);
544                 uint gridHash = calcGridHash(neighbourPos);
545                 // get start of bucket for this cell
546                 uint startIndex = cellStart[gridHash];
547                 if (startIndex != 0xffffffff) {  // cell is not empty
548                     uint endIndex = cellEnd[gridHash];
549                     for (uint j = startIndex; j < endIndex; j++) {
550                         Real3 pos_j = mR3(sortedPosRad[j]);
551                         Real3 dist3 = Distance(pos_i, pos_j);
552                         Real d = length(dist3);
553 
554                         if (d > RESOLUTION_LENGTH_MULT * h_i || j == i_idx)
555                             continue;
556 
557                         Real h_j = sortedPosRad[j].w;
558                         // Real m_j = h_j * h_j * h_j * paramsD.rho0;
559                         // Real rhoj = sortedRhoPreMu[j].x;
560                         Real h_ij = 0.5 * (h_j + h_i);
561                         Real Wd = W3h(d, h_ij);
562                         Real3 Vel_j = sortedVelMas[j];
563 
564                         if (paramsD.bceType != BceVersion::ADAMI) {
565                             if (sortedRhoPreMu[j].w == -1.0 || dot(my_normal, mR3(pos_i - pos_j)) > 0) {
566                                 Real3 grad_i_wij = GradWh(dist3, h_ij);
567                                 csrValA[csrStartIdx - 1] += dot(grad_i_wij, my_normal);
568                                 csrValA[counter + csrStartIdx] = -dot(grad_i_wij, my_normal);
569                                 csrColIndA[counter + csrStartIdx] = j;
570                                 GlobalcsrColIndA[counter + csrStartIdx] = j + numAllMarkers * i_idx;
571                                 counter++;
572                                 if (sortedRhoPreMu[j].w != -1)
573                                     continue;
574                                 numeratorv += Vel_j * Wd;
575                                 denumenator += Wd;
576                             }
577                         } else {
578                             if (sortedRhoPreMu[j].w != -1 || sortedRhoPreMu[j].w <= -2)
579                                 continue;
580                             numeratorv += Vel_j * Wd;
581                             denumenator += Wd;
582                             pRHS += dot(source_term - myAcc, dist3) * sortedRhoPreMu[j].x * Wd;
583                             csrValA[counter + csrStartIdx] = -Wd;
584                             csrColIndA[counter + csrStartIdx] = j;
585                             GlobalcsrColIndA[counter + csrStartIdx] = j + numAllMarkers * i_idx;
586                             counter++;
587                         }
588                     }
589                 }
590             }
591         }
592     }
593 
594     if (abs(denumenator) < EPSILON) {
595         V_new[i_idx] = 2 * V_prescribed;
596         B_i[i_idx] = 0;
597         if (paramsD.bceType == BceVersion::ADAMI) {
598             csrValA[csrStartIdx - 1] = a_ii[i_idx];
599             csrColIndA[csrStartIdx - 1] = i_idx;
600             GlobalcsrColIndA[csrStartIdx - 1] = i_idx + numAllMarkers * i_idx;
601         }
602     } else {
603         Real Scaling = a_ii[i_idx] / denumenator;
604         V_new[i_idx] = 2 * V_prescribed - numeratorv / denumenator;
605 
606         if (paramsD.bceType == BceVersion::ADAMI) {
607             B_i[i_idx] = pRHS;
608             csrValA[csrStartIdx - 1] = denumenator;
609             csrColIndA[csrStartIdx - 1] = i_idx;
610             GlobalcsrColIndA[csrStartIdx - 1] = i_idx + numAllMarkers * i_idx;
611 
612             for (int i = csrStartIdx - 1; i < csrEndIdx; i++)
613                 csrValA[i] *= Scaling;
614             B_i[i_idx] *= Scaling;
615         }
616     }
617 
618     if (paramsD.bceType != BceVersion::ADAMI) {
619         Real Scaling = a_ii[i_idx];
620         if (abs(csrValA[csrStartIdx - 1]) > EPSILON) {
621             Scaling = a_ii[i_idx];  // csrValA[csrStartIdx - 1];
622             for (int count = csrStartIdx - 1; count < csrEndIdx; count++)
623                 csrValA[count] *= Scaling;
624         } else {
625             clearRow(i_idx, csrStartIdx - 1, csrEndIdx, csrValA, B_i);
626             for (int count = csrStartIdx - 1; count < csrEndIdx; count++) {
627                 int j = csrColIndA[counter];
628                 Real3 pos_j = mR3(sortedPosRad[j]);
629                 Real3 dist3 = Distance(pos_i, pos_j);
630                 Real d = length(dist3);
631                 if (d > RESOLUTION_LENGTH_MULT * h_i || j == i_idx) {
632                     csrValA[count] = 0.0;
633                     continue;
634                 }
635                 Real h_j = sortedPosRad[j].w;
636                 Real h_ij = 0.5 * (h_j + h_i);
637                 Real Wd = W3h(d, h_ij);
638                 csrValA[count] = sumWij_inv[j] * Wd * Scaling;
639             }
640             csrValA[csrStartIdx - 1] -= 1.0 * Scaling;
641         }
642 
643         B_i[i_idx] = 0.0 * Scaling;
644     }
645 
646     sortedVelMas[i_idx] = V_new[i_idx];
647 }  // namespace fsi
648 //--------------------------------------------------------------------------------------------------------------------------------
649 
650 //--------------------------------------------------------------------------------------------------------------------------------
Calc_fluid_aij_Bi(const uint i_idx,Real * csrValA,uint * csrColIndA,unsigned long int * GlobalcsrColIndA,uint * numContacts,Real * B_i,Real3 * d_ii,Real * a_ii,Real * rho_np,Real3 * summGradW,Real4 * sortedPosRad,Real4 * sortedRhoPreMu,uint * cellStart,uint * cellEnd,Real delta_t,const int numAllMarkers,bool IsSPARSE)651 __device__ void Calc_fluid_aij_Bi(const uint i_idx,
652                                   Real* csrValA,
653                                   uint* csrColIndA,
654                                   unsigned long int* GlobalcsrColIndA,
655                                   uint* numContacts,
656                                   // The above 4 vectors are used for CSR form.
657                                   Real* B_i,
658                                   Real3* d_ii,   // Read
659                                   Real* a_ii,    // Read
660                                   Real* rho_np,  // Read
661                                   Real3* summGradW,
662                                   Real4* sortedPosRad,
663                                   Real4* sortedRhoPreMu,
664                                   uint* cellStart,
665                                   uint* cellEnd,
666                                   Real delta_t,
667                                   const int numAllMarkers,
668                                   bool IsSPARSE) {
669     Real3 pos_i = mR3(sortedPosRad[i_idx]);
670     Real dT = delta_t;
671 
672     int counter = 0;  // There is always one non-zero at each row- The marker itself
673     B_i[i_idx] = paramsD.rho0 - rho_np[i_idx];
674 
675     uint csrStartIdx = numContacts[i_idx] + 1;  // Reserve the starting index for the A_ii
676     uint csrEndIdx = numContacts[i_idx + 1];
677 
678     Real h_i = sortedPosRad[i_idx].w;
679     //    Real m_i = h_i * h_i * h_i * paramsD.rho0;
680 
681     //  for (int c = csrStartIdx; c < csrEndIdx; c++) {
682     //    csrValA[c] = a_ii[i_idx];
683     //    csrColIndA[c] = i_idx;
684     //    GlobalcsrColIndA[c] = i_idx + numAllMarkers * i_idx;
685     //  }
686 
687     int3 gridPos = calcGridPos(pos_i);
688     for (int z = -1; z <= 1; z++) {
689         for (int y = -1; y <= 1; y++) {
690             for (int x = -1; x <= 1; x++) {
691                 int3 neighbourPos = gridPos + mI3(x, y, z);
692                 uint gridHash = calcGridHash(neighbourPos);
693                 // get start of bucket for this cell
694                 uint startIndex = cellStart[gridHash];
695                 if (startIndex != 0xffffffff) {  // cell is not empty
696                     // iterate over particles in this cell
697                     uint endIndex = cellEnd[gridHash];
698                     //          Real Rho_i = sortedRhoPreMu[i_idx].x;
699 
700                     for (uint j = startIndex; j < endIndex; j++) {
701                         Real3 pos_j = mR3(sortedPosRad[j]);
702                         Real3 dist3 = Distance(pos_i, pos_j);
703                         Real d = length(dist3);
704                         if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
705                             continue;
706                         Real h_j = sortedPosRad[j].w;
707                         Real h_ij = 0.5 * (h_j + h_i);
708                         Real3 grad_i_wij = GradWh(dist3, h_ij);
709                         Real Rho_j = sortedRhoPreMu[j].x;
710                         Real m_j = h_j * h_j * h_j * paramsD.rho0;
711                         Real3 d_it = m_j * (-(dT * dT) / (Rho_j * Rho_j)) * grad_i_wij;
712                         Real My_a_ij_1 = m_j * dot(d_it, summGradW[i_idx]);
713                         Real My_a_ij_2 = m_j * dot(d_ii[j], grad_i_wij);
714                         Real My_a_ij_12 = My_a_ij_1 - My_a_ij_2;
715                         bool DONE1 = false;
716 
717                         for (uint findCol = csrStartIdx; findCol < csrEndIdx; findCol++) {
718                             if (csrColIndA[findCol] == j) {
719                                 csrValA[findCol] += My_a_ij_12;
720                                 csrColIndA[findCol] = j;
721                                 GlobalcsrColIndA[findCol] = j + numAllMarkers * i_idx;
722                                 DONE1 = true;
723                                 continue;
724                             }
725                         }
726                         if (!DONE1) {
727                             csrValA[counter + csrStartIdx] += My_a_ij_12;
728                             csrColIndA[counter + csrStartIdx] = j;
729                             GlobalcsrColIndA[counter + csrStartIdx] = j + numAllMarkers * i_idx;
730                             counter++;
731                         }
732                         int3 gridPosJ = calcGridPos(pos_j);
733                         for (int zz = -1; zz <= 1; zz++) {
734                             for (int yy = -1; yy <= 1; yy++) {
735                                 for (int xx = -1; xx <= 1; xx++) {
736                                     int3 neighbourPosJ = gridPosJ + mI3(xx, yy, zz);
737                                     uint gridHashJ = calcGridHash(neighbourPosJ);
738                                     uint startIndexJ = cellStart[gridHashJ];
739                                     if (startIndexJ != 0xffffffff) {  // cell is not empty
740                                         uint endIndexJ = cellEnd[gridHashJ];
741                                         for (uint k = startIndexJ; k < endIndexJ; k++) {
742                                             Real3 pos_k = mR3(sortedPosRad[k]);
743                                             Real3 dist3jk = Distance(pos_j, pos_k);
744                                             Real djk = length(dist3jk);
745                                             if (djk > RESOLUTION_LENGTH_MULT_IISPH * h_j || k == j || k == i_idx ||
746                                                 sortedRhoPreMu[k].w <= -2)
747                                                 continue;
748                                             Real h_k = sortedPosRad[j].w;
749                                             Real h_jk = 0.5 * (h_j + h_k);
750                                             Real3 grad_j_wjk = GradWh(dist3jk, h_jk);
751                                             Real m_k = cube(sortedPosRad[k].w) * paramsD.rho0;
752                                             Real Rho_k = sortedRhoPreMu[k].x;
753                                             Real3 d_jk = m_k * (-(dT * dT) / (Rho_k * Rho_k)) * grad_j_wjk;
754                                             Real My_a_ij_3 = m_j * dot(d_jk, grad_i_wij);
755                                             bool DONE2 = false;
756 
757                                             for (uint findCol = csrStartIdx; findCol < csrEndIdx; findCol++) {
758                                                 if (csrColIndA[findCol] == k) {
759                                                     csrValA[findCol] -= My_a_ij_3;
760                                                     csrColIndA[findCol] = k;
761                                                     GlobalcsrColIndA[findCol] = k + numAllMarkers * i_idx;
762                                                     DONE2 = true;
763                                                     continue;
764                                                 }
765                                             }
766                                             if (!DONE2) {
767                                                 csrValA[counter + csrStartIdx] -= My_a_ij_3;
768                                                 csrColIndA[counter + csrStartIdx] = k;
769                                                 GlobalcsrColIndA[counter + csrStartIdx] = k + numAllMarkers * i_idx;
770                                                 counter++;
771                                             }
772                                         }
773                                     }
774                                 }
775                             }
776                         }
777 
778                     }
779                 }
780             }
781         }
782     }
783 
784     for (int myIdx = csrStartIdx; myIdx < csrEndIdx; myIdx++) {
785         if (csrColIndA[myIdx] == i_idx)
786             csrValA[myIdx] = a_ii[i_idx];
787     }
788 
789     csrValA[csrStartIdx - 1] = a_ii[i_idx];
790     csrColIndA[csrStartIdx - 1] = i_idx;
791     GlobalcsrColIndA[csrStartIdx - 1] = i_idx + numAllMarkers * i_idx;
792 
793     if (sortedRhoPreMu[i_idx].x < 0.999 * paramsD.rho0) {
794         csrValA[csrStartIdx - 1] = a_ii[i_idx];
795         for (int myIdx = csrStartIdx; myIdx < csrEndIdx; myIdx++) {
796             csrValA[myIdx] = 0.0;
797             B_i[i_idx] = 0.0;
798         }
799     }
800 
801     Real RHS = B_i[i_idx];
802     B_i[i_idx] = RHS;  // fminf(0.0, RHS);
803 }
804 
805 //--------------------------------------------------------------------------------------------------------------------------------
FormAXB(Real * csrValA,uint * csrColIndA,unsigned long int * GlobalcsrColIndA,uint * numContacts,Real * a_ij,Real * B_i,Real3 * d_ii,Real * a_ii,Real3 * summGradW,Real4 * sortedPosRad,Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real3 * V_new,Real * p_old,Real3 * Normals,Real * G_i,Real * sumWij_inv,Real * rho_np,Real4 * qD,Real3 * rigidSPH_MeshPos_LRF_D,Real3 * posRigid_fsiBodies_D,Real4 * velMassRigid_fsiBodies_D,Real3 * omegaVelLRF_fsiBodies_D,Real3 * accRigid_fsiBodies_D,Real3 * omegaAccLRF_fsiBodies_D,uint * rigidIdentifierD,Real3 * pos_fsi_fea_D,Real3 * vel_fsi_fea_D,Real3 * acc_fsi_fea_D,uint * FlexIdentifierD,const int numFlex1D,uint2 * CableElementsNodes,uint4 * ShellelementsNodes,int4 updatePortion,uint * gridMarkerIndexD,uint * cellStart,uint * cellEnd,Real delta_t,const size_t numAllMarkers,bool IsSPARSE,volatile bool * isError)806 __global__ void FormAXB(Real* csrValA,
807                         uint* csrColIndA,
808                         unsigned long int* GlobalcsrColIndA,
809 
810                         uint* numContacts,
811                         // The above 4 vectors are used for CSR form.
812                         Real* a_ij,   // write
813                         Real* B_i,    // write
814                         Real3* d_ii,  // Read
815                         Real* a_ii,   // Read
816                         Real3* summGradW,
817                         Real4* sortedPosRad,
818                         Real3* sortedVelMas,
819                         Real4* sortedRhoPreMu,
820                         Real3* V_new,
821                         Real* p_old,
822                         Real3* Normals,
823                         Real* G_i,
824                         Real* sumWij_inv,
825                         Real* rho_np,
826 
827                         Real4* qD,
828                         Real3* rigidSPH_MeshPos_LRF_D,
829                         Real3* posRigid_fsiBodies_D,
830                         Real4* velMassRigid_fsiBodies_D,
831                         Real3* omegaVelLRF_fsiBodies_D,
832                         Real3* accRigid_fsiBodies_D,
833                         Real3* omegaAccLRF_fsiBodies_D,
834                         uint* rigidIdentifierD,
835 
836                         Real3* pos_fsi_fea_D,
837                         Real3* vel_fsi_fea_D,
838                         Real3* acc_fsi_fea_D,
839                         uint* FlexIdentifierD,
840                         const int numFlex1D,
841                         uint2* CableElementsNodes,
842                         uint4* ShellelementsNodes,
843 
844                         int4 updatePortion,
845                         uint* gridMarkerIndexD,
846                         uint* cellStart,
847                         uint* cellEnd,
848                         Real delta_t,
849                         const size_t numAllMarkers,
850                         bool IsSPARSE,
851                         volatile bool* isError) {
852     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
853     if (i_idx >= numAllMarkers) {
854         return;
855     }
856 
857     //    Real m_0 = paramsD.markerMass;
858     //    Real RHO_0 = paramsD.rho0;
859     //    Real dT = paramsD.dT;
860     //    Real3 gravity = paramsD.gravity;
861 
862     int TYPE_OF_NARKER = sortedRhoPreMu[i_idx].w;
863 
864     if (TYPE_OF_NARKER <= -2) {
865         B_i[i_idx] = 0;
866         uint csrStartIdx = numContacts[i_idx];
867         // This needs to be check to see if it messes up the condition number of the matrix
868         csrValA[csrStartIdx] = 1.0;
869         csrColIndA[csrStartIdx] = i_idx;
870         GlobalcsrColIndA[csrStartIdx] = i_idx + numAllMarkers * i_idx;
871     } else if (TYPE_OF_NARKER == -1) {
872         Calc_fluid_aij_Bi(i_idx, csrValA, csrColIndA, GlobalcsrColIndA, numContacts, B_i, d_ii, a_ii, rho_np, summGradW,
873                           sortedPosRad, sortedRhoPreMu, cellStart, cellEnd, delta_t, numAllMarkers, true);
874 
875     } else if (TYPE_OF_NARKER > -1)
876         Calc_BC_aij_Bi(i_idx, csrValA, csrColIndA, GlobalcsrColIndA, numContacts, a_ii, B_i, sortedPosRad, sortedVelMas,
877 
878                        sortedRhoPreMu, V_new, p_old, Normals, G_i, sumWij_inv,
879 
880                        qD, rigidSPH_MeshPos_LRF_D, posRigid_fsiBodies_D, velMassRigid_fsiBodies_D,
881                        omegaVelLRF_fsiBodies_D, accRigid_fsiBodies_D, omegaAccLRF_fsiBodies_D, rigidIdentifierD,
882 
883                        pos_fsi_fea_D, vel_fsi_fea_D, acc_fsi_fea_D, FlexIdentifierD, numFlex1D, CableElementsNodes,
884                        ShellelementsNodes,
885 
886                        updatePortion, gridMarkerIndexD, cellStart, cellEnd, numAllMarkers, true);
887 }
888 
889 //--------------------------------------------------------------------------------------------------------------------------------
890 //--------------------------------------------------------------------------------------------------------------------------------
Calc_Pressure_AXB_USING_CSR(Real * csrValA,Real * a_ii,uint * csrColIndA,uint * numContacts,Real4 * sortedRhoPreMu,Real * sumWij_inv,Real3 * sortedVelMas,Real3 * V_new,Real * p_old,Real * B_i,Real * Residuals,const size_t numAllMarkers,volatile bool * isErrorD)891 __global__ void Calc_Pressure_AXB_USING_CSR(Real* csrValA,
892                                             Real* a_ii,
893                                             uint* csrColIndA,
894                                             uint* numContacts,
895                                             Real4* sortedRhoPreMu,
896                                             Real* sumWij_inv,
897                                             Real3* sortedVelMas,
898                                             Real3* V_new,
899                                             Real* p_old,
900                                             Real* B_i,  // Read
901                                             Real* Residuals,
902                                             const size_t numAllMarkers,
903                                             volatile bool* isErrorD) {
904     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
905     if (i_idx >= numAllMarkers) {
906         return;
907     }
908     if (sortedRhoPreMu[i_idx].w <= -2) {
909         return;
910     }
911 
912     //    Real RHO_0 = paramsD.rho0;
913     //    bool ClampPressure = paramsD.ClampPressure;
914     //    Real Max_Pressure = paramsD.Max_Pressure;
915     uint startIdx = numContacts[i_idx] + 1;  // numContacts[i_idx] is the diagonal itself
916     uint endIdx = numContacts[i_idx + 1];
917 
918     Real aij_pj = 0;
919     //  Real error = aij_pj + sortedRhoPreMu[i_idx].y * csrValA[startIdx - 1] - B_i[i_idx];
920 
921     for (int myIdx = startIdx; myIdx < endIdx; myIdx++) {
922         if (csrColIndA[myIdx] != i_idx)
923             aij_pj += csrValA[myIdx] * p_old[csrColIndA[myIdx]];
924     }
925     Real RHS = B_i[i_idx];
926     Residuals[i_idx] = abs(RHS - aij_pj - p_old[i_idx] * csrValA[startIdx - 1]);
927     sortedRhoPreMu[i_idx].y = (RHS - aij_pj) / csrValA[startIdx - 1];
928 
929     //    if (paramsD.ClampPressure && sortedRhoPreMu[i_idx].y < 0)
930     //        sortedRhoPreMu[i_idx].y = 0;
931     if (!isfinite(aij_pj)) {
932         printf("a_ij *p_j became Nan in Calc_Pressure_AXB_USING_CSR ");
933     }
934 }
935 
936 //--------------------------------------------------------------------------------------------------------------------------------
Calc_Pressure(Real * a_ii,Real3 * d_ii,Real3 * dij_pj,Real * rho_np,Real * rho_p,Real * Residuals,Real3 * F_p,Real4 * sortedPosRad,Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real4 * qD,Real3 * rigidSPH_MeshPos_LRF_D,Real3 * posRigid_fsiBodies_D,Real4 * velMassRigid_fsiBodies_D,Real3 * omegaVelLRF_fsiBodies_D,Real3 * accRigid_fsiBodies_D,Real3 * omegaAccLRF_fsiBodies_D,uint * rigidIdentifierD,Real3 * pos_fsi_fea_D,Real3 * vel_fsi_fea_D,Real3 * acc_fsi_fea_D,uint * FlexIdentifierD,const int numFlex1D,uint2 * CableElementsNodes,uint4 * ShellelementsNodes,int4 updatePortion,uint * gridMarkerIndexD,Real * p_old,Real3 * V_new,uint * cellStart,uint * cellEnd,Real delta_t,const size_t numAllMarkers,volatile bool * isErrorD)937 __global__ void Calc_Pressure(Real* a_ii,     // Read
938                               Real3* d_ii,    // Read
939                               Real3* dij_pj,  // Read
940                               Real* rho_np,   // Read
941                               Real* rho_p,    // Write
942                               Real* Residuals,
943                               Real3* F_p,
944                               Real4* sortedPosRad,
945                               Real3* sortedVelMas,
946                               Real4* sortedRhoPreMu,
947 
948                               Real4* qD,
949                               Real3* rigidSPH_MeshPos_LRF_D,
950                               Real3* posRigid_fsiBodies_D,
951                               Real4* velMassRigid_fsiBodies_D,
952                               Real3* omegaVelLRF_fsiBodies_D,
953                               Real3* accRigid_fsiBodies_D,
954                               Real3* omegaAccLRF_fsiBodies_D,
955                               uint* rigidIdentifierD,
956 
957                               Real3* pos_fsi_fea_D,
958                               Real3* vel_fsi_fea_D,
959                               Real3* acc_fsi_fea_D,
960                               uint* FlexIdentifierD,
961                               const int numFlex1D,
962                               uint2* CableElementsNodes,
963                               uint4* ShellelementsNodes,
964 
965                               int4 updatePortion,
966                               uint* gridMarkerIndexD,
967 
968                               Real* p_old,
969                               Real3* V_new,
970                               uint* cellStart,
971                               uint* cellEnd,
972                               Real delta_t,
973 
974                               const size_t numAllMarkers,
975                               volatile bool* isErrorD) {
976     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
977     if (i_idx >= numAllMarkers) {
978         return;
979     }
980     if (sortedRhoPreMu[i_idx].w <= -2) {
981         return;
982     }
983 
984     Real h_i = sortedPosRad[i_idx].w;
985     Real m_i = h_i * h_i * h_i * paramsD.rho0;
986 
987     Real RHO_0 = paramsD.rho0;
988     Real dT = delta_t;
989     Real3 source_term = paramsD.gravity + paramsD.bodyForce3;
990 
991     if (sortedRhoPreMu[i_idx].x < EPSILON) {
992         printf("(Calc_Pressure)My density is %f in Calc_Pressure\n", sortedRhoPreMu[i_idx].x);
993     }
994     int myType = sortedRhoPreMu[i_idx].w;
995     Real Rho_i = sortedRhoPreMu[i_idx].x;
996     Real p_i = p_old[i_idx];
997     Real3 pos_i = mR3(sortedPosRad[i_idx]);
998     Real p_new = 0;
999     Real my_rho_p = 0;
1000     Real3 F_i_p = F_p[i_idx];
1001 
1002     if (myType == -1) {
1003         if (Rho_i < 0.999 * RHO_0) {
1004             p_new = 0;
1005             Residuals[i_idx] = 0;
1006         } else {
1007             Real3 my_dij_pj = dij_pj[i_idx];
1008             Real sum_dij_pj = 0;  // This is the first summation  term in the expression for the pressure.
1009             Real sum_djj_pj = 0;  // This is the second summation term in the expression for the pressure.
1010             Real sum_djk_pk = 0;  // This is the last summation term in the expression for the pressure.
1011             int3 gridPosI = calcGridPos(pos_i);
1012             for (int z = -1; z <= 1; z++) {
1013                 for (int y = -1; y <= 1; y++) {
1014                     for (int x = -1; x <= 1; x++) {
1015                         int3 neighbourPosI = gridPosI + mI3(x, y, z);
1016                         uint gridHashI = calcGridHash(neighbourPosI);
1017                         // get start of bucket for this cell
1018                         uint startIndexI = cellStart[gridHashI];
1019                         if (startIndexI != 0xffffffff) {
1020                             uint endIndexI = cellEnd[gridHashI];
1021                             for (uint j = startIndexI; j < endIndexI; j++) {
1022                                 Real3 pos_j = mR3(sortedPosRad[j]);
1023                                 Real3 dist3ij = Distance(pos_i, pos_j);
1024                                 Real dij = length(dist3ij);
1025                                 if (dij > RESOLUTION_LENGTH_MULT * paramsD.HSML || i_idx == j ||
1026                                     sortedRhoPreMu[j].w <= -2)
1027                                     continue;
1028                                 //                Real Rho_j = sortedRhoPreMu[j].x;
1029                                 Real p_j_old = p_old[j];
1030                                 Real h_j = sortedPosRad[j].w;
1031                                 Real m_j = h_j * h_j * h_j * paramsD.rho0;
1032 
1033                                 Real3 djj = d_ii[j];
1034                                 Real3 F_j_p = F_p[j];
1035 
1036                                 Real h_ij = 0.5 * (h_j + h_i);
1037                                 Real3 grad_i_wij = GradWh(dist3ij, h_ij);
1038                                 Real3 d_ji = m_i * (-(dT * dT) / (Rho_i * Rho_i)) * (-grad_i_wij);
1039                                 Real3 djk_pk = dij_pj[j] - d_ji * p_i;
1040                                 sum_dij_pj += m_j * dot(my_dij_pj, grad_i_wij);
1041                                 sum_djj_pj += m_j * dot(djj, grad_i_wij) * p_j_old;
1042                                 sum_djk_pk += m_j * dot(djk_pk, grad_i_wij);
1043                                 my_rho_p += (dT * dT) * m_j * dot((F_i_p / m_i - F_j_p / m_j), grad_i_wij);
1044                             }
1045                         }
1046                     }
1047                 }
1048             }
1049 
1050             // Real RHS = fminf(0.0, RHO_0 - rho_np[i_idx]);
1051             Real RHS = RHO_0 - rho_np[i_idx];
1052             Real aij_pj = +sum_dij_pj - sum_djj_pj - sum_djk_pk;
1053             p_new = (RHS - aij_pj) / a_ii[i_idx];
1054             Residuals[i_idx] = abs(RHS - aij_pj - p_old[i_idx] * a_ii[i_idx]);
1055             //      sortedRhoPreMu[i_idx].x = aij_pj + p_new * a_ii[i_idx] + RHO_0 - RHS;
1056         }
1057     } else {  // Do Adami BC
1058 
1059         Real3 myAcc = mR3(0);
1060         Real3 V_prescribed = mR3(0);
1061         BCE_Vel_Acc(i_idx, myAcc, V_prescribed, sortedPosRad, updatePortion, gridMarkerIndexD, qD,
1062                     rigidSPH_MeshPos_LRF_D, posRigid_fsiBodies_D, velMassRigid_fsiBodies_D, omegaVelLRF_fsiBodies_D,
1063                     accRigid_fsiBodies_D, omegaAccLRF_fsiBodies_D, rigidIdentifierD, pos_fsi_fea_D, vel_fsi_fea_D,
1064                     acc_fsi_fea_D, FlexIdentifierD, numFlex1D, CableElementsNodes, ShellelementsNodes);
1065 
1066         Real3 numeratorv = mR3(0);
1067         Real denumenator = 0;
1068         Real numeratorp = 0;
1069         Real3 Vel_i;
1070 
1071         // get address in grid
1072         int3 gridPos = calcGridPos(pos_i);
1073         for (int z = -1; z <= 1; z++) {
1074             for (int y = -1; y <= 1; y++) {
1075                 for (int x = -1; x <= 1; x++) {
1076                     int3 neighbourPos = gridPos + mI3(x, y, z);
1077                     uint gridHash = calcGridHash(neighbourPos);
1078                     // get start of bucket for this cell
1079                     uint startIndex = cellStart[gridHash];
1080                     if (startIndex != 0xffffffff) {  // cell is not empty
1081                         uint endIndex = cellEnd[gridHash];
1082                         for (uint j = startIndex; j < endIndex; j++) {
1083                             Real3 pos_j = mR3(sortedPosRad[j]);
1084                             Real3 dist3 = Distance(pos_i, pos_j);
1085                             Real d = length(dist3);
1086                             if (d > RESOLUTION_LENGTH_MULT * paramsD.HSML || sortedRhoPreMu[j].w != -1)
1087                                 continue;
1088                             // OLD VELOCITY IS SHOULD BE OBDATED NOT THE NEW ONE!!!!!
1089                             Real3 Vel_j = sortedVelMas[j];
1090                             Real p_j = p_old[j];
1091                             Real3 F_j_p = F_p[j];
1092                             Real h_j = sortedPosRad[j].w;
1093                             Real m_j = h_j * h_j * h_j * paramsD.rho0;
1094                             // Real rhoj = sortedRhoPreMu[j].x;
1095 
1096                             Real h_ij = 0.5 * (h_j + h_i);
1097                             Real Wd = W3h(d, h_ij);
1098                             numeratorv += Vel_j * Wd;
1099                             numeratorp += p_j * Wd + dot(source_term - myAcc, dist3) * sortedRhoPreMu[j].x * Wd;
1100                             denumenator += Wd;
1101                             Real3 TobeUsed = (F_i_p / m_i - F_j_p / m_j);
1102                             my_rho_p += (dT * dT) * m_j * dot(TobeUsed, GradWh(dist3, h_ij));
1103 
1104                             if (isnan(numeratorp))
1105                                 printf("Something is wrong here..., %f\n", numeratorp);
1106                         }
1107                     }
1108                 }
1109             }
1110         }
1111         if (abs(denumenator) < EPSILON) {
1112             p_new = 0;
1113             Vel_i = 2 * V_prescribed;
1114         } else {
1115             Vel_i = 2 * V_prescribed - numeratorv / denumenator;
1116             p_new = numeratorp / denumenator;
1117         }
1118 
1119         Residuals[i_idx] = abs(numeratorp - denumenator * p_old[i_idx]) * a_ii[i_idx];
1120         V_new[i_idx] = Vel_i;
1121     }
1122     // if (paramsD.ClampPressure && p_new < 0.0)
1123     //    p_new = 0.0;
1124     rho_p[i_idx] = my_rho_p;
1125     sortedRhoPreMu[i_idx].y = p_new;
1126 }
1127 
1128 //--------------------------------------------------------------------------------------------------------------------------------
Update_AND_Calc_Res(Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real * p_old,Real3 * V_new,Real * rho_p,Real * rho_np,Real * Residuals,const size_t numAllMarkers,const int Iteration,Real params_relaxation,bool IsSPARSE,volatile bool * isErrorD)1129 __global__ void Update_AND_Calc_Res(Real3* sortedVelMas,
1130                                     Real4* sortedRhoPreMu,
1131                                     Real* p_old,
1132                                     Real3* V_new,
1133                                     Real* rho_p,
1134                                     Real* rho_np,
1135                                     Real* Residuals,
1136                                     const size_t numAllMarkers,
1137                                     const int Iteration,
1138                                     Real params_relaxation,
1139                                     bool IsSPARSE,
1140                                     volatile bool* isErrorD) {
1141     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
1142 
1143     if (i_idx >= numAllMarkers) {
1144         return;
1145     }
1146     if (sortedRhoPreMu[i_idx].w <= -2) {
1147         return;
1148     }
1149 
1150     //  p_i = (1 - relax) * p_old_i + relax * p_i;
1151 
1152     sortedRhoPreMu[i_idx].y = (1 - params_relaxation) * p_old[i_idx] + params_relaxation * sortedRhoPreMu[i_idx].y;
1153     // if(!paramsD.USE_LinearSolver)
1154     //    p_old[i_idx] = sortedRhoPreMu[i_idx].y;
1155     // if (paramsD.ClampPressure && sortedRhoPreMu[i_idx].y < 0)
1156     //    sortedRhoPreMu[i_idx].y = 0;
1157     //  Real AbsRes = abs(sortedRhoPreMu[i_idx].y - p_old[i_idx]);
1158 
1159     //  Real Updated_rho = rho_np[i_idx] + rho_p[i_idx];
1160     //  Real rho_res = abs(1000 - sortedRhoPreMu[i_idx].x);  // Hard-coded for now
1161     Real p_res = 0;
1162     //  p_res = abs(sortedRhoPreMu[i_idx].y - p_old[i_idx]) / (abs(p_old[i_idx]) + 0.00001);
1163     p_res = abs(sortedRhoPreMu[i_idx].y - p_old[i_idx]);
1164     p_old[i_idx] = sortedRhoPreMu[i_idx].y;
1165 
1166     Residuals[i_idx] = p_res;
1167 }
1168 
1169 //--------------------------------------------------------------------------------------------------------------------------------
CalcForces(Real3 * new_vel,Real4 * derivVelRhoD,Real4 * sortedPosRad,Real3 * sortedVelMas,Real4 * sortedRhoPreMu,Real * sumWij_inv,Real * p_old,Real3 * r_shift,uint * cellStart,uint * cellEnd,Real delta_t,size_t numAllMarkers,volatile bool * isErrorD)1170 __global__ void CalcForces(Real3* new_vel,  // Write
1171                            Real4* derivVelRhoD,
1172                            Real4* sortedPosRad,  // Read
1173                            Real3* sortedVelMas,  // Read
1174                            Real4* sortedRhoPreMu,
1175                            Real* sumWij_inv,
1176                            Real* p_old,
1177                            Real3* r_shift,
1178                            uint* cellStart,
1179                            uint* cellEnd,
1180                            Real delta_t,
1181 
1182                            size_t numAllMarkers,
1183                            volatile bool* isErrorD) {
1184     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
1185     if (i_idx >= numAllMarkers) {
1186         return;
1187     }
1188     if (sortedRhoPreMu[i_idx].w <= -2) {
1189         sortedRhoPreMu[i_idx].x = 0;
1190         sortedRhoPreMu[i_idx].y = 0;
1191         sortedRhoPreMu[i_idx].z = 0;
1192         return;
1193     }
1194 
1195     //    if (sortedRhoPreMu[i_idx].w > -1) {
1196     //        return;
1197     //    }
1198 
1199     Real mu_0 = paramsD.mu0;
1200     Real h_i = sortedPosRad[i_idx].w;
1201     Real m_i = h_i * h_i * h_i * paramsD.rho0;
1202 
1203     Real dT = delta_t;
1204     Real3 source_term = paramsD.gravity + paramsD.bodyForce3;
1205     Real epsilon = paramsD.epsMinMarkersDis;
1206     Real3 posi = mR3(sortedPosRad[i_idx]);
1207     Real3 Veli = sortedVelMas[i_idx];
1208 
1209     Real p_i;
1210     //    if (sortedRhoPreMu[i_idx].w == -1)
1211     p_i = sortedRhoPreMu[i_idx].y;
1212     //    else
1213     //        p_i = p_old[i_idx];
1214 
1215     Real rho_i = sortedRhoPreMu[i_idx].x;
1216     Real3 F_i_mu = mR3(0);
1217     Real3 F_i_surface_tension = mR3(0);
1218     Real3 F_i_p = mR3(0);
1219     if ((sortedRhoPreMu[i_idx].x > 3 * paramsD.rho0 || sortedRhoPreMu[i_idx].x < 0) && sortedRhoPreMu[i_idx].w < 0)
1220         printf("too large/small density marker %d, type=%f\n", i_idx, sortedRhoPreMu[i_idx].w);
1221 
1222     Real r0 = 0;
1223     int Ni = 0;
1224     Real mi_bar = 0;
1225     Real3 inner_sum = mR3(0);
1226 
1227     int3 gridPos = calcGridPos(posi);
1228 
1229     // get address in grid
1230     for (int z = -1; z <= 1; z++) {
1231         for (int y = -1; y <= 1; y++) {
1232             for (int x = -1; x <= 1; x++) {
1233                 int3 neighbourPos = gridPos + mI3(x, y, z);
1234                 uint gridHash = calcGridHash(neighbourPos);
1235                 // get start of bucket for this cell
1236                 uint startIndex = cellStart[gridHash];
1237                 uint endIndex = cellEnd[gridHash];
1238                 for (uint j = startIndex; j < endIndex; j++) {
1239                     Real3 posj = mR3(sortedPosRad[j]);
1240                     Real3 rij = Distance(posi, posj);
1241                     Real d = length(rij);
1242                     if (d > RESOLUTION_LENGTH_MULT * h_i || sortedRhoPreMu[j].w <= -2 || i_idx == j)
1243                         continue;
1244 
1245                     Real3 eij = rij / d;
1246                     Real h_j = sortedPosRad[j].w;
1247                     Real m_j = h_j * h_j * h_j * paramsD.rho0;
1248 
1249                     mi_bar += m_j;
1250                     Ni++;
1251                     r0 += d;
1252                     inner_sum += m_j * rij / (d * d * d);
1253 
1254                     Real h_ij = 0.5 * (h_j + h_i);
1255                     Real Wd = m_j * W3h(d, h_ij);
1256                     Real3 grad_ij = GradWh(rij, h_ij);
1257 
1258                     Real3 Velj = sortedVelMas[j];
1259                     Real p_j = sortedRhoPreMu[j].y;
1260                     Real rho_j = sortedRhoPreMu[j].x;
1261 
1262                     Real3 V_ij = (Veli - Velj);
1263                     // Only Consider (fluid-fluid + fluid-solid) or Solid-Fluid Interaction
1264                     if (sortedRhoPreMu[i_idx].w < 0 || (sortedRhoPreMu[i_idx].w >= 0 && sortedRhoPreMu[j].w < 0))
1265                         F_i_p += -m_j * ((p_i / (rho_i * rho_i)) + (p_j / (rho_j * rho_j))) * grad_ij;
1266 
1267                     Real Rho_bar = (rho_j + rho_i) * 0.5;
1268                     //                    Real nu = mu_0 * paramsD.HSML * 320 / Rho_bar;
1269                     //                    Real3 muNumerator = nu * fminf(0.0, dot(rij, V_ij)) * grad_ij;
1270                     Real3 muNumerator = 2 * mu_0 * dot(rij, grad_ij) * V_ij;
1271                     Real muDenominator = (Rho_bar * Rho_bar) * (d * d + paramsD.HSML * paramsD.HSML * epsilon);
1272                     // Only Consider (fluid-fluid + fluid-solid) or Solid-Fluid Interaction
1273                     if (sortedRhoPreMu[i_idx].w < 0 || (sortedRhoPreMu[i_idx].w >= 0 && sortedRhoPreMu[j].w < 0))
1274                         //                    if ((sortedRhoPreMu[i_idx].w < 0 && sortedRhoPreMu[j].w < 0))
1275                         F_i_mu += m_j * muNumerator / muDenominator;
1276                     if (!isfinite(length(F_i_mu))) {
1277                         printf("F_i_np in CalcForces returns Nan or Inf");
1278                     }
1279                 }
1280             }
1281         }
1282 
1283         if (Ni != 0) {
1284             r0 /= Ni;
1285             mi_bar /= Ni;
1286         }
1287         if (mi_bar > EPSILON)
1288             r_shift[i_idx] = paramsD.beta_shifting * r0 * r0 * paramsD.v_Max * dT / mi_bar * inner_sum;
1289 
1290         // Forces are per unit mass at this point.
1291         derivVelRhoD[i_idx] = mR4((F_i_p + F_i_mu) * m_i);
1292 
1293         // Add the source_term  only to the fluid markers
1294         if (sortedRhoPreMu[i_idx].w == -1) {
1295             derivVelRhoD[i_idx] = derivVelRhoD[i_idx] + mR4(source_term) * m_i;
1296         }
1297 
1298         new_vel[i_idx] = Veli + dT * mR3(derivVelRhoD[i_idx]) / m_i + r_shift[i_idx] / dT;
1299 
1300         if (!isfinite(length(new_vel[i_idx])) || !isfinite(length(derivVelRhoD[i_idx])) ||
1301             !isfinite(length(r_shift[i_idx])))
1302             printf("%d= new_vel=%.2f,derivVelRhoD=%.2f,r_shift=%.2f, F_i_p=%f, F_i_mu=%f\n", i_idx,
1303                    length(new_vel[i_idx]), length(derivVelRhoD[i_idx]), length(r_shift[i_idx]), length(F_i_p),
1304                    length(F_i_mu));
1305     }
1306 }
1307 
1308 //--------------------------------------------------------------------------------------------------------------------------------
FinalizePressure(Real4 * sortedPosRad,Real4 * sortedRhoPreMu,Real * p_old,Real3 * F_p,uint * cellStart,uint * cellEnd,size_t numAllMarkers,Real p_shift,volatile bool * isErrorD)1309 __global__ void FinalizePressure(Real4* sortedPosRad,  // Read
1310                                  Real4* sortedRhoPreMu,
1311                                  Real* p_old,
1312                                  Real3* F_p,  // Write
1313                                  uint* cellStart,
1314                                  uint* cellEnd,
1315                                  size_t numAllMarkers,
1316                                  Real p_shift,
1317                                  volatile bool* isErrorD) {
1318     uint i_idx = blockIdx.x * blockDim.x + threadIdx.x;
1319     if (i_idx >= numAllMarkers) {
1320         return;
1321     }
1322     if (sortedRhoPreMu[i_idx].w <= -2) {
1323         return;
1324     }
1325     if (!(isfinite(sortedRhoPreMu[i_idx].x) && isfinite(sortedRhoPreMu[i_idx].y) && isfinite(sortedRhoPreMu[i_idx].z) &&
1326           isfinite(sortedRhoPreMu[i_idx].w))) {
1327         printf("rhoPreMu is NAN: thrown from FinalizePressure ! %f,%f,%f\\n", sortedRhoPreMu[i_idx].x,
1328                sortedRhoPreMu[i_idx].y, sortedRhoPreMu[i_idx].z);
1329         sortedRhoPreMu[i_idx].y = 0.0;
1330     }
1331     //    if (p_shift < 0)
1332     sortedRhoPreMu[i_idx].y = p_old[i_idx] + ((paramsD.ClampPressure) ? paramsD.BASEPRES : 0.0);  //- p_shift;
1333 
1334     if (paramsD.ClampPressure && sortedRhoPreMu[i_idx].y < 0)
1335         sortedRhoPreMu[i_idx].y = 0;
1336 
1337     // if (sortedRhoPreMu[i_idx].y < 0)
1338     //     sortedRhoPreMu[i_idx].y = (p_old[i_idx] > 0) ? p_old[i_idx] : 0.0;
1339 
1340     if (sortedRhoPreMu[i_idx].y > paramsD.Max_Pressure)
1341         sortedRhoPreMu[i_idx].y = paramsD.Max_Pressure;
1342 }
1343 
1344 //--------------------------------------------------------------------------------------------------------------------------------
calcPressureIISPH(std::shared_ptr<FsiBodiesDataD> otherFsiBodiesD,thrust::device_vector<Real3> pos_fsi_fea_D,thrust::device_vector<Real3> vel_fsi_fea_D,thrust::device_vector<Real3> acc_fsi_fea_D,thrust::device_vector<Real> sumWij_inv,thrust::device_vector<Real> & p_old,thrust::device_vector<Real3> Normals,thrust::device_vector<Real> G_i,thrust::device_vector<Real> & Color)1345 void ChFsiForceIISPH::calcPressureIISPH(std::shared_ptr<FsiBodiesDataD> otherFsiBodiesD,
1346                                         thrust::device_vector<Real3> pos_fsi_fea_D,
1347                                         thrust::device_vector<Real3> vel_fsi_fea_D,
1348                                         thrust::device_vector<Real3> acc_fsi_fea_D,
1349                                         thrust::device_vector<Real> sumWij_inv,
1350                                         thrust::device_vector<Real>& p_old,
1351                                         thrust::device_vector<Real3> Normals,
1352                                         thrust::device_vector<Real> G_i,
1353                                         thrust::device_vector<Real>& Color) {
1354     //    Real RES = paramsH->PPE_res;
1355 
1356     PPE_SolutionType mySolutionType = paramsH->PPE_Solution_type;
1357     std::cout << "time step in calcPressureIISPH " << paramsH->dT << std::endl;
1358 
1359     double total_step_timeClock = clock();
1360     bool *isErrorH, *isErrorD;
1361     isErrorH = (bool*)malloc(sizeof(bool));
1362     cudaMalloc((void**)&isErrorD, sizeof(bool));
1363     *isErrorH = false;
1364     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1365     //------------------------------------------------------------------------
1366     // thread per particle
1367     uint numThreads, numBlocks;
1368     size_t numAllMarkers = (int)numObjectsH->numAllMarkers;
1369     computeGridSize((uint)numAllMarkers, 256, numBlocks, numThreads);
1370 
1371     *isErrorH = false;
1372     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1373 
1374     thrust::device_vector<Real3> d_ii(numAllMarkers);
1375     thrust::device_vector<Real3> V_np(numAllMarkers);
1376     thrust::fill(d_ii.begin(), d_ii.end(), mR3(0.0));
1377     thrust::fill(V_np.begin(), V_np.end(), mR3(0.0));
1378     *isErrorH = false;
1379     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1380     V_i_np__AND__d_ii_kernel<<<numBlocks, numThreads>>>(
1381         mR4CAST(sortedSphMarkersD->posRadD), mR3CAST(sortedSphMarkersD->velMasD),
1382         mR4CAST(sortedSphMarkersD->rhoPresMuD), mR3CAST(d_ii), mR3CAST(V_np), R1CAST(sumWij_inv), R1CAST(G_i),
1383         U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), paramsH->dT, numAllMarkers,
1384         isErrorD);
1385 
1386     cudaDeviceSynchronize();
1387     cudaCheckError();
1388     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1389     if (*isErrorH == true) {
1390         throw std::runtime_error("Error! program crashed after F_i_np__AND__d_ii_kernel!\n");
1391     }
1392 
1393     thrust::device_vector<Real> a_ii(numAllMarkers);
1394     thrust::device_vector<Real> rho_np(numAllMarkers);
1395     thrust::fill(a_ii.begin(), a_ii.end(), 0.0);
1396     thrust::fill(rho_np.begin(), rho_np.end(), 0.0);
1397     thrust::fill(p_old.begin(), p_old.end(), 0.0);
1398     thrust::device_vector<Real3> summGradW(numAllMarkers);
1399 
1400     *isErrorH = false;
1401     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1402     Rho_np_AND_a_ii_AND_sum_m_GradW<<<numBlocks, numThreads>>>(
1403         mR4CAST(sortedSphMarkersD->posRadD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(rho_np), R1CAST(a_ii),
1404         R1CAST(p_old), mR3CAST(V_np), mR3CAST(d_ii), mR3CAST(summGradW), U1CAST(markersProximityD->cellStartD),
1405         U1CAST(markersProximityD->cellEndD), paramsH->dT, numAllMarkers, isErrorD);
1406 
1407     cudaDeviceSynchronize();
1408     cudaCheckError();
1409     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1410     if (*isErrorH == true) {
1411         throw std::runtime_error("Error! program crashed after F_i_np__AND__d_ii_kernel!\n");
1412     }
1413 
1414     thrust::device_vector<Real3> V_new(numAllMarkers);
1415     thrust::fill(V_new.begin(), V_new.end(), mR3(0.0));
1416     thrust::device_vector<Real> a_ij;
1417     thrust::device_vector<Real> B_i(numAllMarkers);
1418     thrust::device_vector<uint> csrColIndA;
1419     thrust::device_vector<uint> numContacts(numAllMarkers);
1420     thrust::device_vector<unsigned long int> GlobalcsrColIndA;
1421     thrust::device_vector<Real> csrValA;
1422 
1423     double durationFormAXB;
1424 
1425     int numFlexbodies = (int)numObjectsH->numFlexBodies1D + (int)numObjectsH->numFlexBodies2D;
1426 
1427     int haveGhost = (numObjectsH->numGhostMarkers > 0) ? 1 : 0;
1428     int haveHelper = (numObjectsH->numHelperMarkers > 0) ? 1 : 0;
1429 
1430     int4 updatePortion =
1431         mI4(fsiGeneralData->referenceArray[haveGhost + haveHelper + 0].y,  // end of fluid
1432             fsiGeneralData->referenceArray[haveGhost + haveHelper + 1].y,  // end of boundary
1433             fsiGeneralData->referenceArray[haveGhost + haveHelper + 1 + numObjectsH->numRigidBodies].y,
1434             fsiGeneralData->referenceArray[haveGhost + haveHelper + 1 + numObjectsH->numRigidBodies + numFlexbodies].y);
1435 
1436     uint NNZ;
1437     if (mySolutionType == PPE_SolutionType::FORM_SPARSE_MATRIX) {
1438         thrust::fill(a_ij.begin(), a_ij.end(), 0.0);
1439         thrust::fill(B_i.begin(), B_i.end(), 0.0);
1440         //        thrust::fill(summGradW.begin(), summGradW.end(), mR3(0.0));
1441         thrust::fill(numContacts.begin(), numContacts.end(), 0.0);
1442         //------------------------------------------------------------------------
1443         //------------- MatrixJacobi
1444         //------------------------------------------------------------------------
1445 
1446         bool SPARSE_FLAG = true;
1447         double FormAXBClock = clock();
1448         thrust::device_vector<Real> Residuals(numAllMarkers);
1449         thrust::fill(Residuals.begin(), Residuals.end(), 1.0);
1450         thrust::device_vector<Real> rho_p(numAllMarkers);
1451         thrust::fill(rho_p.begin(), rho_p.end(), 0.0);
1452 
1453         *isErrorH = false;
1454         cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1455         CalcNumber_Contacts<<<numBlocks, numThreads>>>(
1456             U1CAST(numContacts), mR4CAST(sortedSphMarkersD->posRadD), mR4CAST(sortedSphMarkersD->rhoPresMuD),
1457             U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), numAllMarkers, isErrorD);
1458 
1459         cudaDeviceSynchronize();
1460         cudaCheckError();
1461         cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1462         if (*isErrorH == true) {
1463             throw std::runtime_error("Error! program crashed after CalcNumber_Contacts!\n");
1464         }
1465         uint MAX_CONTACT = thrust::reduce(numContacts.begin(), numContacts.end(), 0, thrust::maximum<Real>());
1466         std::cout << "Max contact between SPH particles: " << MAX_CONTACT << std::endl;
1467 
1468         uint LastVal = numContacts[numAllMarkers - 1];
1469         thrust::exclusive_scan(numContacts.begin(), numContacts.end(), numContacts.begin());
1470         numContacts.push_back(LastVal + numContacts[numAllMarkers - 1]);
1471         NNZ = numContacts[numAllMarkers];
1472 
1473         csrValA.resize(NNZ);
1474         csrColIndA.resize(NNZ);
1475         GlobalcsrColIndA.resize(NNZ);
1476 
1477         thrust::fill(csrValA.begin(), csrValA.end(), 0.0);
1478         thrust::fill(GlobalcsrColIndA.begin(), GlobalcsrColIndA.end(), 0.0);
1479         thrust::fill(csrColIndA.begin(), csrColIndA.end(), 0.0);
1480 
1481         cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1482 
1483         std::cout << "updatePortion of  BC: " << updatePortion.x << " " << updatePortion.y << " " << updatePortion.z
1484                   << " " << updatePortion.w << "\n ";
1485 
1486         FormAXB<<<numBlocks, numThreads>>>(
1487             R1CAST(csrValA), U1CAST(csrColIndA), LU1CAST(GlobalcsrColIndA), U1CAST(numContacts), R1CAST(a_ij),
1488             R1CAST(B_i), mR3CAST(d_ii), R1CAST(a_ii), mR3CAST(summGradW), mR4CAST(sortedSphMarkersD->posRadD),
1489             mR3CAST(sortedSphMarkersD->velMasD), mR4CAST(sortedSphMarkersD->rhoPresMuD), mR3CAST(V_new), R1CAST(p_old),
1490             mR3CAST(Normals), R1CAST(G_i), R1CAST(sumWij_inv), R1CAST(rho_np),
1491 
1492             mR4CAST(otherFsiBodiesD->q_fsiBodies_D), mR3CAST(fsiGeneralData->rigidSPH_MeshPos_LRF_D),
1493             mR3CAST(otherFsiBodiesD->posRigid_fsiBodies_D), mR4CAST(otherFsiBodiesD->velMassRigid_fsiBodies_D),
1494             mR3CAST(otherFsiBodiesD->omegaVelLRF_fsiBodies_D), mR3CAST(otherFsiBodiesD->accRigid_fsiBodies_D),
1495             mR3CAST(otherFsiBodiesD->omegaAccLRF_fsiBodies_D), U1CAST(fsiGeneralData->rigidIdentifierD),
1496 
1497             mR3CAST(pos_fsi_fea_D), mR3CAST(vel_fsi_fea_D), mR3CAST(acc_fsi_fea_D),
1498             U1CAST(fsiGeneralData->FlexIdentifierD), (int)numObjectsH->numFlexBodies1D,
1499             U2CAST(fsiGeneralData->CableElementsNodes), U4CAST(fsiGeneralData->ShellElementsNodes),
1500 
1501             updatePortion, U1CAST(markersProximityD->gridMarkerIndexD), U1CAST(markersProximityD->cellStartD),
1502             U1CAST(markersProximityD->cellEndD), paramsH->dT, numAllMarkers, SPARSE_FLAG, isErrorD);
1503 
1504         cudaDeviceSynchronize();
1505         cudaCheckError();
1506         cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1507         if (*isErrorH == true) {
1508             throw std::runtime_error("Error! program crashed after F_i_np__AND__d_ii_kernel!\n");
1509         }
1510 
1511         durationFormAXB = (clock() - FormAXBClock) / (double)CLOCKS_PER_SEC;
1512     }
1513     //------------------------------------------------------------------------
1514     //------------- Iterative loop
1515     //------------------------------------------------------------------------
1516     int Iteration = 0;
1517     Real MaxRes = 100;
1518     thrust::device_vector<Real> Residuals(numAllMarkers);
1519     thrust::fill(Residuals.begin(), Residuals.end(), 1.0);
1520     thrust::device_vector<Real3> dij_pj(numAllMarkers);
1521     thrust::fill(dij_pj.begin(), dij_pj.end(), mR3(0.0));
1522     thrust::device_vector<Real3> F_p(numAllMarkers);
1523     thrust::fill(F_p.begin(), F_p.end(), mR3(0.0));
1524     thrust::device_vector<Real> rho_p(numAllMarkers);
1525     thrust::fill(rho_p.begin(), rho_p.end(), 0.0);
1526 
1527     double LinearSystemClock = clock();
1528 
1529     myLinearSolver->SetVerbose(paramsH->Verbose_monitoring);
1530     myLinearSolver->SetAbsRes(paramsH->LinearSolver_Abs_Tol);
1531     myLinearSolver->SetRelRes(paramsH->LinearSolver_Rel_Tol);
1532     myLinearSolver->SetIterationLimit(paramsH->LinearSolver_Max_Iter);
1533 
1534     if (paramsH->USE_LinearSolver) {
1535         if (paramsH->PPE_Solution_type != PPE_SolutionType::FORM_SPARSE_MATRIX) {
1536             printf(
1537                 "You should paramsH->PPE_Solution_type == FORM_SPARSE_MATRIX in order to use the "
1538                 "chrono_fsi linear "
1539                 "solvers\n");
1540             exit(0);
1541         }
1542 
1543         myLinearSolver->Solve((int)numAllMarkers, NNZ, R1CAST(csrValA), U1CAST(numContacts), U1CAST(csrColIndA),
1544                               R1CAST(p_old), R1CAST(B_i));
1545         cudaCheckError();
1546     } else {
1547         while ((MaxRes > paramsH->LinearSolver_Abs_Tol || Iteration < 3) &&
1548                Iteration < paramsH->LinearSolver_Max_Iter) {
1549             *isErrorH = false;
1550             cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1551             Initialize_Variables<<<numBlocks, numThreads>>>(mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(p_old),
1552                                                             mR3CAST(sortedSphMarkersD->velMasD), mR3CAST(V_new),
1553                                                             numAllMarkers, isErrorD);
1554             cudaDeviceSynchronize();
1555             cudaCheckError();
1556             cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1557             if (*isErrorH == true) {
1558                 throw std::runtime_error("Error! program crashed after Initialize_Variables!\n");
1559             }
1560 
1561             if (mySolutionType == PPE_SolutionType::MATRIX_FREE) {
1562                 *isErrorH = false;
1563                 cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1564                 Calc_dij_pj<<<numBlocks, numThreads>>>(
1565                     mR3CAST(dij_pj), mR3CAST(F_p), mR3CAST(d_ii), mR4CAST(sortedSphMarkersD->posRadD),
1566                     mR3CAST(sortedSphMarkersD->velMasD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(p_old),
1567                     U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), paramsH->dT,
1568                     numAllMarkers, isErrorD);
1569                 cudaDeviceSynchronize();
1570                 cudaCheckError();
1571                 cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1572                 if (*isErrorH == true) {
1573                     throw std::runtime_error("Error! program crashed after Calc_dij_pj!\n");
1574                 }
1575 
1576                 *isErrorH = false;
1577                 cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1578                 Calc_Pressure<<<numBlocks, numThreads>>>(
1579                     R1CAST(a_ii), mR3CAST(d_ii), mR3CAST(dij_pj), R1CAST(rho_np), R1CAST(rho_p), R1CAST(Residuals),
1580                     mR3CAST(F_p), mR4CAST(sortedSphMarkersD->posRadD), mR3CAST(sortedSphMarkersD->velMasD),
1581                     mR4CAST(sortedSphMarkersD->rhoPresMuD),
1582 
1583                     mR4CAST(otherFsiBodiesD->q_fsiBodies_D), mR3CAST(fsiGeneralData->rigidSPH_MeshPos_LRF_D),
1584                     mR3CAST(otherFsiBodiesD->posRigid_fsiBodies_D), mR4CAST(otherFsiBodiesD->velMassRigid_fsiBodies_D),
1585                     mR3CAST(otherFsiBodiesD->omegaVelLRF_fsiBodies_D), mR3CAST(otherFsiBodiesD->accRigid_fsiBodies_D),
1586                     mR3CAST(otherFsiBodiesD->omegaAccLRF_fsiBodies_D), U1CAST(fsiGeneralData->rigidIdentifierD),
1587 
1588                     mR3CAST(pos_fsi_fea_D), mR3CAST(vel_fsi_fea_D), mR3CAST(acc_fsi_fea_D),
1589                     U1CAST(fsiGeneralData->FlexIdentifierD), (int)numObjectsH->numFlexBodies1D,
1590                     U2CAST(fsiGeneralData->CableElementsNodes), U4CAST(fsiGeneralData->ShellElementsNodes),
1591                     updatePortion, U1CAST(markersProximityD->gridMarkerIndexD), R1CAST(p_old), mR3CAST(V_new),
1592                     U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), paramsH->dT,
1593                     numAllMarkers, isErrorD);
1594 
1595                 cudaDeviceSynchronize();
1596                 cudaCheckError();
1597                 cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1598                 if (*isErrorH == true) {
1599                     throw std::runtime_error("Error! program crashed after Calc_Pressure!\n");
1600                 }
1601             }
1602 
1603             if (mySolutionType == PPE_SolutionType::FORM_SPARSE_MATRIX) {
1604                 *isErrorH = false;
1605                 cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1606                 Calc_Pressure_AXB_USING_CSR<<<numBlocks, numThreads>>>(
1607                     R1CAST(csrValA), R1CAST(a_ii), U1CAST(csrColIndA), U1CAST(numContacts),
1608                     mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(sumWij_inv), mR3CAST(sortedSphMarkersD->velMasD),
1609                     mR3CAST(V_new), R1CAST(p_old), R1CAST(B_i), R1CAST(Residuals), numAllMarkers, isErrorD);
1610                 cudaDeviceSynchronize();
1611                 cudaCheckError();
1612                 cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1613                 if (*isErrorH == true) {
1614                     throw std::runtime_error("Error! program crashed after Iterative_pressure_update!\n");
1615                 }
1616             }
1617             *isErrorH = false;
1618             cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1619 
1620             Update_AND_Calc_Res<<<numBlocks, numThreads>>>(
1621                 mR3CAST(sortedSphMarkersD->velMasD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(p_old),
1622                 mR3CAST(V_new), R1CAST(rho_p), R1CAST(rho_np), R1CAST(Residuals), numAllMarkers, Iteration,
1623                 paramsH->PPE_relaxation, false, isErrorD);
1624             cudaDeviceSynchronize();
1625             cudaCheckError();
1626             cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1627             if (*isErrorH == true) {
1628                 throw std::runtime_error("Error! program crashed after Iterative_pressure_update!\n");
1629             }
1630 
1631             Iteration++;
1632 
1633             thrust::device_vector<Real>::iterator iter = thrust::max_element(Residuals.begin(), Residuals.end());
1634             auto position = iter - Residuals.begin();
1635             MaxRes = *iter;
1636 
1637             //        MaxRes =
1638             //            thrust::reduce(Residuals.begin(), Residuals.end(), 0.0, thrust::plus<Real>()) /
1639             //            numObjectsH->numAllMarkers;
1640 
1641             //    Real PMAX = thrust::reduce(p_old.begin(), p_old.end(), 0.0, thrust::maximum<Real>());
1642             //    MaxRes = thrust::reduce(Residuals.begin(), Residuals.end(), 0.0, thrust::plus<Real>()) /
1643             //    numObjectsH->numAllMarkers;
1644             //    MaxRes = thrust::reduce(Residuals.begin(), Residuals.end(), 0.0, thrust::maximum<Real>());
1645             //      Real R_np = thrust::reduce(rho_np.begin(), rho_np.end(), 0.0, thrust::plus<Real>()) /
1646             //      rho_np.size();
1647             //      Real R_p = thrust::reduce(rho_p.begin(), rho_p.end(), 0.0, thrust::plus<Real>()) /
1648             //      rho_p.size();
1649             //
1650             if (paramsH->Verbose_monitoring)
1651                 printf("Iter= %d, Res= %f\n", Iteration, MaxRes);
1652         }
1653     }
1654 
1655     thrust::device_vector<Real>::iterator iter = thrust::min_element(p_old.begin(), p_old.end());
1656     auto position = iter - p_old.begin();
1657     Real shift_p = *iter;
1658     //    Real shift_p = 0;
1659 
1660     // This must be run if linear solver is used
1661     if (paramsH->USE_LinearSolver || paramsH->ClampPressure) {
1662         printf("Shifting pressure values by %f\n", -shift_p);
1663         *isErrorH = false;
1664         cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1665         FinalizePressure<<<numBlocks, numThreads>>>(
1666             mR4CAST(sortedSphMarkersD->posRadD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(p_old), mR3CAST(F_p),
1667             U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), numAllMarkers, shift_p,
1668             isErrorD);
1669         cudaDeviceSynchronize();
1670         cudaCheckError();
1671         cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1672         if (*isErrorH == true) {
1673             throw std::runtime_error("Error! program crashed after FinalizePressure!\n");
1674         }
1675     }
1676     double durationLinearSystem = (clock() - LinearSystemClock) / (double)CLOCKS_PER_SEC;
1677     double durationtotal_step_time = (clock() - total_step_timeClock) / (double)CLOCKS_PER_SEC;
1678 
1679     printf("---------------IISPH CLOCK-------------------\n");
1680     printf(" Total: %f \n FormAXB: %f\n Linear System: %f \n", durationtotal_step_time, durationFormAXB,
1681            durationLinearSystem);
1682     if (!paramsH->USE_LinearSolver)
1683         printf(" Iter (Jacobi+SOR)# = %d, to Res= %.3e \n", Iteration, MaxRes);
1684     if (paramsH->USE_LinearSolver)
1685         if (myLinearSolver->GetSolverStatus()) {
1686             std::cout << " Solver converged to " << myLinearSolver->GetResidual() << " tolerance";
1687             std::cout << " after " << myLinearSolver->GetNumIterations() << " iterations" << std::endl;
1688         } else {
1689             std::cout << "Failed to converge after " << myLinearSolver->GetIterationLimit() << " iterations";
1690             std::cout << " (" << myLinearSolver->GetResidual() << " final residual)" << std::endl;
1691         }
1692 
1693     //------------------------------------------------------------------------
1694     //------------------------------------------------------------------------
1695     cudaFree(isErrorD);
1696     free(isErrorH);
1697 }
1698 
ForceSPH(std::shared_ptr<SphMarkerDataD> otherSphMarkersD,std::shared_ptr<FsiBodiesDataD> otherFsiBodiesD,std::shared_ptr<FsiMeshDataD> otherFsiMeshD)1699 void ChFsiForceIISPH::ForceSPH(std::shared_ptr<SphMarkerDataD> otherSphMarkersD,
1700                                std::shared_ptr<FsiBodiesDataD> otherFsiBodiesD,
1701                                std::shared_ptr<FsiMeshDataD> otherFsiMeshD) {
1702     sphMarkersD = otherSphMarkersD;
1703     int numAllMarkers = (int)numObjectsH->numAllMarkers;
1704     int numHelperMarkers = (int)numObjectsH->numHelperMarkers;
1705     fsiCollisionSystem->ArrangeData(sphMarkersD);
1706 
1707     thrust::device_vector<Real3>::iterator iter =
1708         thrust::max_element(sortedSphMarkersD->velMasD.begin(), sortedSphMarkersD->velMasD.end(), compare_Real3_mag());
1709     Real MaxVel = length(*iter);
1710 
1711     if (paramsH->Adaptive_time_stepping) {
1712         Real dt_CFL = paramsH->Co_number * paramsH->HSML / MaxVel;
1713         Real dt_nu = 0.25 * paramsH->HSML * paramsH->HSML / (paramsH->mu0 / paramsH->rho0);
1714         Real dt_body = 0.25 * std::sqrt(paramsH->HSML / length(paramsH->bodyForce3 + paramsH->gravity));
1715         Real dt = std::fmin(dt_body, std::fmin(dt_CFL, dt_nu));
1716         if (dt / paramsH->dT_Max > 0.7 && dt / paramsH->dT_Max < 1)
1717             paramsH->dT = paramsH->dT_Max * 0.5;
1718         else
1719             paramsH->dT = std::fmin(dt, paramsH->dT_Max);
1720 
1721         CopyParams_NumberOfObjects(paramsH, numObjectsH);
1722 
1723         printf(" time step=%.3e, dt_Max=%.3e, dt_CFL=%.3e (CFL=%.2g), dt_nu=%.3e, dt_body=%.3e\n", paramsH->dT,
1724                paramsH->dT_Max, dt_CFL, paramsH->Co_number, dt_nu, dt_body);
1725     }
1726 
1727     bool *isErrorH, *isErrorD, *isErrorD2;
1728 
1729     isErrorH = (bool*)malloc(sizeof(bool));
1730     cudaMalloc((void**)&isErrorD, sizeof(bool));
1731     cudaMalloc((void**)&isErrorD2, sizeof(bool));
1732     *isErrorH = false;
1733     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1734     cudaMemcpy(isErrorD2, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1735 
1736     uint numThreads, numBlocks;
1737     computeGridSize(numAllMarkers, 256, numBlocks, numThreads);
1738     printf("numBlocks: %d, numThreads: %d, numAllMarker:%d \n", numBlocks, numThreads, numAllMarkers);
1739 
1740     thrust::device_vector<Real> Color(numAllMarkers);
1741     thrust::fill(Color.begin(), Color.end(), 1.0e10);
1742     thrust::device_vector<Real> _sumWij_inv(numAllMarkers);
1743     thrust::fill(_sumWij_inv.begin(), _sumWij_inv.end(), 0.0);
1744     thrust::device_vector<Real> G_i(numAllMarkers * 9);
1745     thrust::fill(G_i.begin(), G_i.end(), 0);
1746     *isErrorH = false;
1747 
1748     cudaMemcpy(isErrorD, isErrorH, sizeof(bool), cudaMemcpyHostToDevice);
1749 
1750     thrust::device_vector<uint> Contact_i(numAllMarkers);
1751     thrust::fill(Contact_i.begin(), Contact_i.end(), 0);
1752     calcRho_kernel<<<numBlocks, numThreads>>>(
1753         mR4CAST(sortedSphMarkersD->posRadD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(_sumWij_inv),
1754         U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), U1CAST(Contact_i), numAllMarkers,
1755         isErrorD);
1756     cudaDeviceSynchronize();
1757     cudaCheckError();
1758     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1759     if (*isErrorH == true) {
1760         throw std::runtime_error("Error! program crashed after calcRho_kernel!\n");
1761     }
1762 
1763     thrust::device_vector<Real3> Normals(numAllMarkers);
1764 
1765     calcNormalizedRho_kernel<<<numBlocks, numThreads>>>(
1766         mR4CAST(sortedSphMarkersD->posRadD), mR3CAST(sortedSphMarkersD->velMasD),
1767         mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(_sumWij_inv), R1CAST(G_i), mR3CAST(Normals), R1CAST(Color),
1768         U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), numAllMarkers, isErrorD);
1769     cudaDeviceSynchronize();
1770     cudaCheckError();
1771     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1772     if (*isErrorH == true) {
1773         throw std::runtime_error("Error! program crashed after calcNormalizedRho_kernel!\n");
1774     }
1775 
1776     thrust::device_vector<Real> p_old(numAllMarkers, 0.0);
1777     calcPressureIISPH(otherFsiBodiesD, otherFsiMeshD->pos_fsi_fea_D, otherFsiMeshD->vel_fsi_fea_D,
1778                       otherFsiMeshD->acc_fsi_fea_D, _sumWij_inv, p_old, Normals, G_i, Color);
1779 
1780     //------------------------------------------------------------------------
1781     // thread per particle
1782     //  std::cout << "dT in ForceSPH after calcPressure: " << paramsH->dT << "\n";
1783     double CalcForcesClock = clock();
1784 
1785     thrust::fill(vel_vis_Sorted_D.begin(), vel_vis_Sorted_D.end(), mR3(0.0));
1786     thrust::fill(derivVelRhoD_Sorted_D.begin(), derivVelRhoD_Sorted_D.end(), mR4(0.0));
1787     thrust::fill(vel_XSPH_Sorted_D.begin(), vel_XSPH_Sorted_D.end(), mR3(0.0));
1788     thrust::device_vector<Real3> dr_shift(numAllMarkers);
1789     thrust::fill(dr_shift.begin(), dr_shift.end(), mR3(0.0));
1790 
1791     thrust::device_vector<Real3> NEW_Vel(numAllMarkers, mR3(0.0));
1792 
1793     CalcForces<<<numBlocks, numThreads>>>(mR3CAST(NEW_Vel), mR4CAST(derivVelRhoD_Sorted_D),
1794                                           mR4CAST(sortedSphMarkersD->posRadD), mR3CAST(sortedSphMarkersD->velMasD),
1795                                           mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(_sumWij_inv), R1CAST(p_old),
1796                                           mR3CAST(dr_shift), U1CAST(markersProximityD->cellStartD),
1797                                           U1CAST(markersProximityD->cellEndD), paramsH->dT, numAllMarkers, isErrorD);
1798     cudaDeviceSynchronize();
1799     cudaCheckError();
1800 
1801     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1802     if (*isErrorH == true) {
1803         throw std::runtime_error("Error! program crashed in CalcForces!\n");
1804     }
1805     double calcforce = (clock() - CalcForcesClock) / (double)CLOCKS_PER_SEC;
1806     printf(" Force Computation: %f \n", calcforce);
1807     double UpdateClock = clock();
1808 
1809     sortedSphMarkersD->velMasD = NEW_Vel;
1810     UpdateDensity<<<numBlocks, numThreads>>>(
1811         mR3CAST(vel_vis_Sorted_D), mR3CAST(vel_XSPH_Sorted_D), mR3CAST(sortedSphMarkersD->velMasD),
1812         mR4CAST(sortedSphMarkersD->posRadD), mR4CAST(sortedSphMarkersD->rhoPresMuD), R1CAST(_sumWij_inv),
1813         U1CAST(markersProximityD->cellStartD), U1CAST(markersProximityD->cellEndD), numAllMarkers, isErrorD);
1814 
1815     cudaDeviceSynchronize();
1816     cudaCheckError();
1817 
1818     cudaMemcpy(isErrorH, isErrorD, sizeof(bool), cudaMemcpyDeviceToHost);
1819     if (*isErrorH == true) {
1820         throw std::runtime_error("Error! program crashed in CalcForces!\n");
1821     }
1822     CopySortedToOriginal_NonInvasive_R3(fsiGeneralData->vel_XSPH_D, vel_XSPH_Sorted_D,
1823                                         markersProximityD->gridMarkerIndexD);
1824     CopySortedToOriginal_NonInvasive_R3(fsiGeneralData->vis_vel_SPH_D, vel_vis_Sorted_D,
1825                                         markersProximityD->gridMarkerIndexD);
1826     CopySortedToOriginal_NonInvasive_R3(sphMarkersD->velMasD, sortedSphMarkersD->velMasD,
1827                                         markersProximityD->gridMarkerIndexD);
1828     CopySortedToOriginal_NonInvasive_R4(sphMarkersD->posRadD, sortedSphMarkersD->posRadD,
1829                                         markersProximityD->gridMarkerIndexD);
1830 
1831     CopySortedToOriginal_NonInvasive_R4(sphMarkersD->rhoPresMuD, sortedSphMarkersD->rhoPresMuD,
1832                                         markersProximityD->gridMarkerIndexD);
1833     CopySortedToOriginal_NonInvasive_R4(fsiGeneralData->derivVelRhoD, derivVelRhoD_Sorted_D,
1834                                         markersProximityD->gridMarkerIndexD);
1835     printf(" Update information: %f \n", (clock() - UpdateClock) / (double)CLOCKS_PER_SEC);
1836     printf("----------------------------------------------\n");
1837 }
1838 
1839 }  // namespace fsi
1840 }  // namespace chrono
1841