1 /*
2   This is a Version 2.0 MPI + OpenMP implementation of LULESH
3 
4                  Copyright (c) 2010-2013.
5       Lawrence Livermore National Security, LLC.
6 Produced at the Lawrence Livermore National Laboratory.
7                   LLNL-CODE-461231
8                 All rights reserved.
9 
10 This file is part of LULESH, Version 2.0.
11 Please also read this link -- http://www.opensource.org/licenses/index.php
12 
13 //////////////
14 DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
15 * Addition of regions to make work more representative of multi-material codes
16 * Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
17   more representative of our actual working set sizes
18 * Single source distribution supports pure serial, pure OpenMP, MPI-only,
19   and MPI+OpenMP
20 * Addition of ability to visualize the mesh using VisIt
21   https://wci.llnl.gov/codes/visit/download.html
22 * Various command line options (see ./lulesh2.0 -h)
23  -q              : quiet mode - suppress stdout
24  -i <iterations> : number of cycles to run
25  -s <size>       : length of cube mesh along side
26  -r <numregions> : Number of distinct regions (def: 11)
27  -b <balance>    : Load balance between regions of a domain (def: 1)
28  -c <cost>       : Extra cost of more expensive regions (def: 1)
29  -f <filepieces> : Number of file parts for viz output (def: np/9)
30  -p              : Print out progress
31  -v              : Output viz file (requires compiling with -DVIZ_MESH
32  -h              : This message
33 
34  printf("Usage: %s [opts]\n", execname);
35       printf(" where [opts] is one or more of:\n");
36       printf(" -q              : quiet mode - suppress all stdout\n");
37       printf(" -i <iterations> : number of cycles to run\n");
38       printf(" -s <size>       : length of cube mesh along side\n");
39       printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
40       printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
41       printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
42       printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
43       printf(" -p              : Print out progress\n");
44       printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
45       printf(" -h              : This message\n");
46       printf("\n\n");
47 
48 *Notable changes in LULESH 2.0
49 
50 * Split functionality into different files
51 lulesh.cc - where most (all?) of the timed functionality lies
52 lulesh-comm.cc - MPI functionality
53 lulesh-init.cc - Setup code
54 lulesh-viz.cc  - Support for visualization option
55 lulesh-util.cc - Non-timed functions
56 *
57 * The concept of "regions" was added, although every region is the same ideal
58 *    gas material, and the same sedov blast wave problem is still the only
59 *    problem its hardcoded to solve.
60 * Regions allow two things important to making this proxy app more representative:
61 *   Four of the LULESH routines are now performed on a region-by-region basis,
62 *     making the memory access patterns non-unit stride
63 *   Artificial load imbalances can be easily introduced that could impact
64 *     parallelization strategies.
65 * The load balance flag changes region assignment.  Region number is raised to
66 *   the power entered for assignment probability.  Most likely regions changes
67 *   with MPI process id.
68 * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
69 *   entered multiple. The cost of 5% is 10x the entered multiple.
70 * MPI and OpenMP were added, and coalesced into a single version of the source
71 *   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
72 * Added support to write plot files using "poor mans parallel I/O" when linked
73 *   with the silo library, which in turn can be read by VisIt.
74 * Enabled variable timestep calculation by default (courant condition), which
75 *   results in an additional reduction.
76 * Default domain (mesh) size reduced from 45^3 to 30^3
77 * Command line options to allow numerous test cases without needing to recompile
78 * Performance optimizations and code cleanup beyond LULESH 1.0
79 * Added a "Figure of Merit" calculation (elements solved per microsecond) and
80 *   output in support of using LULESH 2.0 for the 2017 CORAL procurement
81 *
82 * Possible Differences in Final Release (other changes possible)
83 *
84 * High Level mesh structure to allow data structure transformations
85 * Different default parameters
86 * Minor code performance changes and cleanup
87 
88 TODO in future versions
89 * Add reader for (truly) unstructured meshes, probably serial only
90 * CMake based build system
91 
92 //////////////
93 
94 Redistribution and use in source and binary forms, with or without
95 modification, are permitted provided that the following conditions
96 are met:
97 
98    * Redistributions of source code must retain the above copyright
99      notice, this list of conditions and the disclaimer below.
100 
101    * Redistributions in binary form must reproduce the above copyright
102      notice, this list of conditions and the disclaimer (as noted below)
103      in the documentation and/or other materials provided with the
104      distribution.
105 
106    * Neither the name of the LLNS/LLNL nor the names of its contributors
107      may be used to endorse or promote products derived from this software
108      without specific prior written permission.
109 
110 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
111 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
112 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
113 ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
114 THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
115 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
116 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
117 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
118 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
119 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
120 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
121 
122 
123 Additional BSD Notice
124 
125 1. This notice is required to be provided under our contract with the U.S.
126    Department of Energy (DOE). This work was produced at Lawrence Livermore
127    National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
128 
129 2. Neither the United States Government nor Lawrence Livermore National
130    Security, LLC nor any of their employees, makes any warranty, express
131    or implied, or assumes any liability or responsibility for the accuracy,
132    completeness, or usefulness of any information, apparatus, product, or
133    process disclosed, or represents that its use would not infringe
134    privately-owned rights.
135 
136 3. Also, reference herein to any specific commercial products, process, or
137    services by trade name, trademark, manufacturer or otherwise does not
138    necessarily constitute or imply its endorsement, recommendation, or
139    favoring by the United States Government or Lawrence Livermore National
140    Security, LLC. The views and opinions of authors expressed herein do not
141    necessarily state or reflect those of the United States Government or
142    Lawrence Livermore National Security, LLC, and shall not be used for
143    advertising or product endorsement purposes.
144 
145 */
146 
147 #include <climits>
148 #include <vector>
149 #include <math.h>
150 #include <stdio.h>
151 #include <stdlib.h>
152 #include <string.h>
153 #include <ctype.h>
154 #include <time.h>
155 #include <sys/time.h>
156 #include <iostream>
157 #include <unistd.h>
158 
159 #if LULESH_USE_OPENMP
160 # include <omp.h>
161 #endif
162 
163 #include "lulesh.h"
164 
165 #include "conduit.hpp"
166 #include "ascent.hpp"
167 
168 using namespace conduit;
169 using namespace ascent;
170 
171 /*********************************/
172 /* Data structure implementation */
173 /*********************************/
174 
175 /* might want to add access methods so that memory can be */
176 /* better managed, as in luleshFT */
177 
178 template <typename T>
Allocate(size_t size)179 T *Allocate(size_t size)
180 {
181    return static_cast<T *>(malloc(sizeof(T)*size)) ;
182 }
183 
184 template <typename T>
Release(T ** ptr)185 void Release(T **ptr)
186 {
187    if (*ptr != NULL) {
188       free(*ptr) ;
189       *ptr = NULL ;
190    }
191 }
192 
193 
194 
195 /******************************************/
196 
197 /* Work Routines */
198 
199 static inline
TimeIncrement(Domain & domain)200 void TimeIncrement(Domain& domain)
201 {
202    Real_t targetdt = domain.stoptime() - domain.time() ;
203 
204    if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
205       Real_t ratio ;
206       Real_t olddt = domain.deltatime() ;
207 
208       /* This will require a reduction in parallel */
209       Real_t gnewdt = Real_t(1.0e+20) ;
210       Real_t newdt ;
211       if (domain.dtcourant() < gnewdt) {
212          gnewdt = domain.dtcourant() / Real_t(2.0) ;
213       }
214       if (domain.dthydro() < gnewdt) {
215          gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
216       }
217 
218 #if USE_MPI
219       MPI_Allreduce(&gnewdt, &newdt, 1,
220                     ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
221                     MPI_MIN, MPI_COMM_WORLD) ;
222 #else
223       newdt = gnewdt;
224 #endif
225 
226       ratio = newdt / olddt ;
227       if (ratio >= Real_t(1.0)) {
228          if (ratio < domain.deltatimemultlb()) {
229             newdt = olddt ;
230          }
231          else if (ratio > domain.deltatimemultub()) {
232             newdt = olddt*domain.deltatimemultub() ;
233          }
234       }
235 
236       if (newdt > domain.dtmax()) {
237          newdt = domain.dtmax() ;
238       }
239       domain.deltatime() = newdt ;
240    }
241 
242    /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
243    if ((targetdt > domain.deltatime()) &&
244        (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
245       targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
246    }
247 
248    if (targetdt < domain.deltatime()) {
249       domain.deltatime() = targetdt ;
250    }
251 
252    domain.time() += domain.deltatime() ;
253 
254    ++domain.cycle() ;
255 }
256 
257 /******************************************/
258 
259 static inline
CollectDomainNodesToElemNodes(Domain & domain,const Index_t * elemToNode,Real_t elemX[8],Real_t elemY[8],Real_t elemZ[8])260 void CollectDomainNodesToElemNodes(Domain &domain,
261                                    const Index_t* elemToNode,
262                                    Real_t elemX[8],
263                                    Real_t elemY[8],
264                                    Real_t elemZ[8])
265 {
266    Index_t nd0i = elemToNode[0] ;
267    Index_t nd1i = elemToNode[1] ;
268    Index_t nd2i = elemToNode[2] ;
269    Index_t nd3i = elemToNode[3] ;
270    Index_t nd4i = elemToNode[4] ;
271    Index_t nd5i = elemToNode[5] ;
272    Index_t nd6i = elemToNode[6] ;
273    Index_t nd7i = elemToNode[7] ;
274 
275    elemX[0] = domain.x(nd0i);
276    elemX[1] = domain.x(nd1i);
277    elemX[2] = domain.x(nd2i);
278    elemX[3] = domain.x(nd3i);
279    elemX[4] = domain.x(nd4i);
280    elemX[5] = domain.x(nd5i);
281    elemX[6] = domain.x(nd6i);
282    elemX[7] = domain.x(nd7i);
283 
284    elemY[0] = domain.y(nd0i);
285    elemY[1] = domain.y(nd1i);
286    elemY[2] = domain.y(nd2i);
287    elemY[3] = domain.y(nd3i);
288    elemY[4] = domain.y(nd4i);
289    elemY[5] = domain.y(nd5i);
290    elemY[6] = domain.y(nd6i);
291    elemY[7] = domain.y(nd7i);
292 
293    elemZ[0] = domain.z(nd0i);
294    elemZ[1] = domain.z(nd1i);
295    elemZ[2] = domain.z(nd2i);
296    elemZ[3] = domain.z(nd3i);
297    elemZ[4] = domain.z(nd4i);
298    elemZ[5] = domain.z(nd5i);
299    elemZ[6] = domain.z(nd6i);
300    elemZ[7] = domain.z(nd7i);
301 
302 }
303 
304 /******************************************/
305 
306 static inline
InitStressTermsForElems(Domain & domain,Real_t * sigxx,Real_t * sigyy,Real_t * sigzz,Index_t numElem)307 void InitStressTermsForElems(Domain &domain,
308                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
309                              Index_t numElem)
310 {
311    //
312    // pull in the stresses appropriate to the hydro integration
313    //
314 #ifdef LULESH_USE_OPENMP
315 #pragma omp parallel for firstprivate(numElem)
316 #endif
317    for (Index_t i = 0 ; i < numElem ; ++i){
318       sigxx[i] = sigyy[i] = sigzz[i] =  - domain.p(i) - domain.q(i) ;
319    }
320 }
321 
322 /******************************************/
323 
324 static inline
CalcElemShapeFunctionDerivatives(Real_t const x[],Real_t const y[],Real_t const z[],Real_t b[][8],Real_t * const volume)325 void CalcElemShapeFunctionDerivatives( Real_t const x[],
326                                        Real_t const y[],
327                                        Real_t const z[],
328                                        Real_t b[][8],
329                                        Real_t* const volume )
330 {
331   const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
332   const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
333   const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
334   const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
335 
336   const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
337   const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
338   const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
339   const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
340 
341   const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
342   const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
343   const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
344   const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
345 
346   Real_t fjxxi, fjxet, fjxze;
347   Real_t fjyxi, fjyet, fjyze;
348   Real_t fjzxi, fjzet, fjzze;
349   Real_t cjxxi, cjxet, cjxze;
350   Real_t cjyxi, cjyet, cjyze;
351   Real_t cjzxi, cjzet, cjzze;
352 
353   fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
354   fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
355   fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
356 
357   fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
358   fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
359   fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
360 
361   fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
362   fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
363   fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
364 
365   /* compute cofactors */
366   cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
367   cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
368   cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
369 
370   cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
371   cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
372   cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
373 
374   cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
375   cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
376   cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
377 
378   /* calculate partials :
379      this need only be done for l = 0,1,2,3   since , by symmetry ,
380      (6,7,4,5) = - (0,1,2,3) .
381   */
382   b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
383   b[0][1] =      cjxxi  -  cjxet  -  cjxze;
384   b[0][2] =      cjxxi  +  cjxet  -  cjxze;
385   b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
386   b[0][4] = -b[0][2];
387   b[0][5] = -b[0][3];
388   b[0][6] = -b[0][0];
389   b[0][7] = -b[0][1];
390 
391   b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
392   b[1][1] =      cjyxi  -  cjyet  -  cjyze;
393   b[1][2] =      cjyxi  +  cjyet  -  cjyze;
394   b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
395   b[1][4] = -b[1][2];
396   b[1][5] = -b[1][3];
397   b[1][6] = -b[1][0];
398   b[1][7] = -b[1][1];
399 
400   b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
401   b[2][1] =      cjzxi  -  cjzet  -  cjzze;
402   b[2][2] =      cjzxi  +  cjzet  -  cjzze;
403   b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
404   b[2][4] = -b[2][2];
405   b[2][5] = -b[2][3];
406   b[2][6] = -b[2][0];
407   b[2][7] = -b[2][1];
408 
409   /* calculate jacobian determinant (volume) */
410   *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
411 }
412 
413 /******************************************/
414 
415 static inline
SumElemFaceNormal(Real_t * normalX0,Real_t * normalY0,Real_t * normalZ0,Real_t * normalX1,Real_t * normalY1,Real_t * normalZ1,Real_t * normalX2,Real_t * normalY2,Real_t * normalZ2,Real_t * normalX3,Real_t * normalY3,Real_t * normalZ3,const Real_t x0,const Real_t y0,const Real_t z0,const Real_t x1,const Real_t y1,const Real_t z1,const Real_t x2,const Real_t y2,const Real_t z2,const Real_t x3,const Real_t y3,const Real_t z3)416 void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
417                        Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
418                        Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
419                        Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
420                        const Real_t x0, const Real_t y0, const Real_t z0,
421                        const Real_t x1, const Real_t y1, const Real_t z1,
422                        const Real_t x2, const Real_t y2, const Real_t z2,
423                        const Real_t x3, const Real_t y3, const Real_t z3)
424 {
425    Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
426    Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
427    Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
428    Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
429    Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
430    Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
431    Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
432    Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
433    Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
434 
435    *normalX0 += areaX;
436    *normalX1 += areaX;
437    *normalX2 += areaX;
438    *normalX3 += areaX;
439 
440    *normalY0 += areaY;
441    *normalY1 += areaY;
442    *normalY2 += areaY;
443    *normalY3 += areaY;
444 
445    *normalZ0 += areaZ;
446    *normalZ1 += areaZ;
447    *normalZ2 += areaZ;
448    *normalZ3 += areaZ;
449 }
450 
451 /******************************************/
452 
453 static inline
CalcElemNodeNormals(Real_t pfx[8],Real_t pfy[8],Real_t pfz[8],const Real_t x[8],const Real_t y[8],const Real_t z[8])454 void CalcElemNodeNormals(Real_t pfx[8],
455                          Real_t pfy[8],
456                          Real_t pfz[8],
457                          const Real_t x[8],
458                          const Real_t y[8],
459                          const Real_t z[8])
460 {
461    for (Index_t i = 0 ; i < 8 ; ++i) {
462       pfx[i] = Real_t(0.0);
463       pfy[i] = Real_t(0.0);
464       pfz[i] = Real_t(0.0);
465    }
466    /* evaluate face one: nodes 0, 1, 2, 3 */
467    SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
468                   &pfx[1], &pfy[1], &pfz[1],
469                   &pfx[2], &pfy[2], &pfz[2],
470                   &pfx[3], &pfy[3], &pfz[3],
471                   x[0], y[0], z[0], x[1], y[1], z[1],
472                   x[2], y[2], z[2], x[3], y[3], z[3]);
473    /* evaluate face two: nodes 0, 4, 5, 1 */
474    SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
475                   &pfx[4], &pfy[4], &pfz[4],
476                   &pfx[5], &pfy[5], &pfz[5],
477                   &pfx[1], &pfy[1], &pfz[1],
478                   x[0], y[0], z[0], x[4], y[4], z[4],
479                   x[5], y[5], z[5], x[1], y[1], z[1]);
480    /* evaluate face three: nodes 1, 5, 6, 2 */
481    SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
482                   &pfx[5], &pfy[5], &pfz[5],
483                   &pfx[6], &pfy[6], &pfz[6],
484                   &pfx[2], &pfy[2], &pfz[2],
485                   x[1], y[1], z[1], x[5], y[5], z[5],
486                   x[6], y[6], z[6], x[2], y[2], z[2]);
487    /* evaluate face four: nodes 2, 6, 7, 3 */
488    SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
489                   &pfx[6], &pfy[6], &pfz[6],
490                   &pfx[7], &pfy[7], &pfz[7],
491                   &pfx[3], &pfy[3], &pfz[3],
492                   x[2], y[2], z[2], x[6], y[6], z[6],
493                   x[7], y[7], z[7], x[3], y[3], z[3]);
494    /* evaluate face five: nodes 3, 7, 4, 0 */
495    SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
496                   &pfx[7], &pfy[7], &pfz[7],
497                   &pfx[4], &pfy[4], &pfz[4],
498                   &pfx[0], &pfy[0], &pfz[0],
499                   x[3], y[3], z[3], x[7], y[7], z[7],
500                   x[4], y[4], z[4], x[0], y[0], z[0]);
501    /* evaluate face six: nodes 4, 7, 6, 5 */
502    SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
503                   &pfx[7], &pfy[7], &pfz[7],
504                   &pfx[6], &pfy[6], &pfz[6],
505                   &pfx[5], &pfy[5], &pfz[5],
506                   x[4], y[4], z[4], x[7], y[7], z[7],
507                   x[6], y[6], z[6], x[5], y[5], z[5]);
508 }
509 
510 /******************************************/
511 
512 static inline
SumElemStressesToNodeForces(const Real_t B[][8],const Real_t stress_xx,const Real_t stress_yy,const Real_t stress_zz,Real_t fx[],Real_t fy[],Real_t fz[])513 void SumElemStressesToNodeForces( const Real_t B[][8],
514                                   const Real_t stress_xx,
515                                   const Real_t stress_yy,
516                                   const Real_t stress_zz,
517                                   Real_t fx[], Real_t fy[], Real_t fz[] )
518 {
519    for(Index_t i = 0; i < 8; i++) {
520       fx[i] = -( stress_xx * B[0][i] );
521       fy[i] = -( stress_yy * B[1][i]  );
522       fz[i] = -( stress_zz * B[2][i] );
523    }
524 }
525 
526 /******************************************/
527 
528 static inline
IntegrateStressForElems(Domain & domain,Real_t * sigxx,Real_t * sigyy,Real_t * sigzz,Real_t * determ,Index_t numElem,Index_t numNode)529 void IntegrateStressForElems( Domain &domain,
530                               Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
531                               Real_t *determ, Index_t numElem, Index_t numNode)
532 {
533 #if LULESH_USE_OPENMP
534    Index_t numthreads = omp_get_max_threads();
535 #else
536    Index_t numthreads = 1;
537 #endif
538 
539    Index_t numElem8 = numElem * 8 ;
540    Real_t *fx_elem;
541    Real_t *fy_elem;
542    Real_t *fz_elem;
543    Real_t fx_local[8] ;
544    Real_t fy_local[8] ;
545    Real_t fz_local[8] ;
546 
547 
548   if (numthreads > 1) {
549      fx_elem = Allocate<Real_t>(numElem8) ;
550      fy_elem = Allocate<Real_t>(numElem8) ;
551      fz_elem = Allocate<Real_t>(numElem8) ;
552   }
553   // loop over all elements
554 
555 #ifdef LULESH_USE_OPENMP
556 #pragma omp parallel for firstprivate(numElem)
557 #endif
558   for( Index_t k=0 ; k<numElem ; ++k )
559   {
560     const Index_t* const elemToNode = domain.nodelist(k);
561     Real_t B[3][8] ;// shape function derivatives
562     Real_t x_local[8] ;
563     Real_t y_local[8] ;
564     Real_t z_local[8] ;
565 
566     // get nodal coordinates from global arrays and copy into local arrays.
567     CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
568 
569     // Volume calculation involves extra work for numerical consistency
570     CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
571                                          B, &determ[k]);
572 
573     CalcElemNodeNormals( B[0] , B[1], B[2],
574                           x_local, y_local, z_local );
575 
576     if (numthreads > 1) {
577        // Eliminate thread writing conflicts at the nodes by giving
578        // each element its own copy to write to
579        SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
580                                     &fx_elem[k*8],
581                                     &fy_elem[k*8],
582                                     &fz_elem[k*8] ) ;
583     }
584     else {
585        SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
586                                     fx_local, fy_local, fz_local ) ;
587 
588        // copy nodal force contributions to global force arrray.
589        for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
590           Index_t gnode = elemToNode[lnode];
591           domain.fx(gnode) += fx_local[lnode];
592           domain.fy(gnode) += fy_local[lnode];
593           domain.fz(gnode) += fz_local[lnode];
594        }
595     }
596   }
597 
598   if (numthreads > 1) {
599      // If threaded, then we need to copy the data out of the temporary
600      // arrays used above into the final forces field
601 #ifdef LULESH_USE_OPENMP
602 #pragma omp parallel for firstprivate(numNode)
603 #endif
604      for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
605      {
606         Index_t count = domain.nodeElemCount(gnode) ;
607         Index_t *cornerList = domain.nodeElemCornerList(gnode) ;
608         Real_t fx_tmp = Real_t(0.0) ;
609         Real_t fy_tmp = Real_t(0.0) ;
610         Real_t fz_tmp = Real_t(0.0) ;
611         for (Index_t i=0 ; i < count ; ++i) {
612            Index_t elem = cornerList[i] ;
613            fx_tmp += fx_elem[elem] ;
614            fy_tmp += fy_elem[elem] ;
615            fz_tmp += fz_elem[elem] ;
616         }
617         domain.fx(gnode) = fx_tmp ;
618         domain.fy(gnode) = fy_tmp ;
619         domain.fz(gnode) = fz_tmp ;
620      }
621      Release(&fz_elem) ;
622      Release(&fy_elem) ;
623      Release(&fx_elem) ;
624   }
625 }
626 
627 /******************************************/
628 
629 static inline
VoluDer(const Real_t x0,const Real_t x1,const Real_t x2,const Real_t x3,const Real_t x4,const Real_t x5,const Real_t y0,const Real_t y1,const Real_t y2,const Real_t y3,const Real_t y4,const Real_t y5,const Real_t z0,const Real_t z1,const Real_t z2,const Real_t z3,const Real_t z4,const Real_t z5,Real_t * dvdx,Real_t * dvdy,Real_t * dvdz)630 void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
631              const Real_t x3, const Real_t x4, const Real_t x5,
632              const Real_t y0, const Real_t y1, const Real_t y2,
633              const Real_t y3, const Real_t y4, const Real_t y5,
634              const Real_t z0, const Real_t z1, const Real_t z2,
635              const Real_t z3, const Real_t z4, const Real_t z5,
636              Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
637 {
638    const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
639 
640    *dvdx =
641       (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
642       (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
643       (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
644    *dvdy =
645       - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
646       (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
647       (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
648 
649    *dvdz =
650       - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
651       (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
652       (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
653 
654    *dvdx *= twelfth;
655    *dvdy *= twelfth;
656    *dvdz *= twelfth;
657 }
658 
659 /******************************************/
660 
661 static inline
CalcElemVolumeDerivative(Real_t dvdx[8],Real_t dvdy[8],Real_t dvdz[8],const Real_t x[8],const Real_t y[8],const Real_t z[8])662 void CalcElemVolumeDerivative(Real_t dvdx[8],
663                               Real_t dvdy[8],
664                               Real_t dvdz[8],
665                               const Real_t x[8],
666                               const Real_t y[8],
667                               const Real_t z[8])
668 {
669    VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
670            y[1], y[2], y[3], y[4], y[5], y[7],
671            z[1], z[2], z[3], z[4], z[5], z[7],
672            &dvdx[0], &dvdy[0], &dvdz[0]);
673    VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
674            y[0], y[1], y[2], y[7], y[4], y[6],
675            z[0], z[1], z[2], z[7], z[4], z[6],
676            &dvdx[3], &dvdy[3], &dvdz[3]);
677    VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
678            y[3], y[0], y[1], y[6], y[7], y[5],
679            z[3], z[0], z[1], z[6], z[7], z[5],
680            &dvdx[2], &dvdy[2], &dvdz[2]);
681    VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
682            y[2], y[3], y[0], y[5], y[6], y[4],
683            z[2], z[3], z[0], z[5], z[6], z[4],
684            &dvdx[1], &dvdy[1], &dvdz[1]);
685    VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
686            y[7], y[6], y[5], y[0], y[3], y[1],
687            z[7], z[6], z[5], z[0], z[3], z[1],
688            &dvdx[4], &dvdy[4], &dvdz[4]);
689    VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
690            y[4], y[7], y[6], y[1], y[0], y[2],
691            z[4], z[7], z[6], z[1], z[0], z[2],
692            &dvdx[5], &dvdy[5], &dvdz[5]);
693    VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
694            y[5], y[4], y[7], y[2], y[1], y[3],
695            z[5], z[4], z[7], z[2], z[1], z[3],
696            &dvdx[6], &dvdy[6], &dvdz[6]);
697    VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
698            y[6], y[5], y[4], y[3], y[2], y[0],
699            z[6], z[5], z[4], z[3], z[2], z[0],
700            &dvdx[7], &dvdy[7], &dvdz[7]);
701 }
702 
703 /******************************************/
704 
705 static inline
CalcElemFBHourglassForce(Real_t * xd,Real_t * yd,Real_t * zd,Real_t hourgam[][4],Real_t coefficient,Real_t * hgfx,Real_t * hgfy,Real_t * hgfz)706 void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
707                               Real_t coefficient,
708                               Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
709 {
710    Real_t hxx[4];
711    for(Index_t i = 0; i < 4; i++) {
712       hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
713                hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
714                hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
715                hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
716    }
717    for(Index_t i = 0; i < 8; i++) {
718       hgfx[i] = coefficient *
719                 (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
720                  hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
721    }
722    for(Index_t i = 0; i < 4; i++) {
723       hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
724                hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
725                hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
726                hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
727    }
728    for(Index_t i = 0; i < 8; i++) {
729       hgfy[i] = coefficient *
730                 (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
731                  hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
732    }
733    for(Index_t i = 0; i < 4; i++) {
734       hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
735                hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
736                hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
737                hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
738    }
739    for(Index_t i = 0; i < 8; i++) {
740       hgfz[i] = coefficient *
741                 (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
742                  hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
743    }
744 }
745 
746 /******************************************/
747 
748 static inline
CalcFBHourglassForceForElems(Domain & domain,Real_t * determ,Real_t * x8n,Real_t * y8n,Real_t * z8n,Real_t * dvdx,Real_t * dvdy,Real_t * dvdz,Real_t hourg,Index_t numElem,Index_t numNode)749 void CalcFBHourglassForceForElems( Domain &domain,
750                                    Real_t *determ,
751                                    Real_t *x8n, Real_t *y8n, Real_t *z8n,
752                                    Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
753                                    Real_t hourg, Index_t numElem,
754                                    Index_t numNode)
755 {
756 
757 #if LULESH_USE_OPENMP
758    Index_t numthreads = omp_get_max_threads();
759 #else
760    Index_t numthreads = 1;
761 #endif
762    /*************************************************
763     *
764     *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
765     *               force.
766     *
767     *************************************************/
768 
769    Index_t numElem8 = numElem * 8 ;
770 
771    Real_t *fx_elem;
772    Real_t *fy_elem;
773    Real_t *fz_elem;
774 
775    if(numthreads > 1) {
776       fx_elem = Allocate<Real_t>(numElem8) ;
777       fy_elem = Allocate<Real_t>(numElem8) ;
778       fz_elem = Allocate<Real_t>(numElem8) ;
779    }
780 
781    Real_t  gamma[4][8];
782 
783    gamma[0][0] = Real_t( 1.);
784    gamma[0][1] = Real_t( 1.);
785    gamma[0][2] = Real_t(-1.);
786    gamma[0][3] = Real_t(-1.);
787    gamma[0][4] = Real_t(-1.);
788    gamma[0][5] = Real_t(-1.);
789    gamma[0][6] = Real_t( 1.);
790    gamma[0][7] = Real_t( 1.);
791    gamma[1][0] = Real_t( 1.);
792    gamma[1][1] = Real_t(-1.);
793    gamma[1][2] = Real_t(-1.);
794    gamma[1][3] = Real_t( 1.);
795    gamma[1][4] = Real_t(-1.);
796    gamma[1][5] = Real_t( 1.);
797    gamma[1][6] = Real_t( 1.);
798    gamma[1][7] = Real_t(-1.);
799    gamma[2][0] = Real_t( 1.);
800    gamma[2][1] = Real_t(-1.);
801    gamma[2][2] = Real_t( 1.);
802    gamma[2][3] = Real_t(-1.);
803    gamma[2][4] = Real_t( 1.);
804    gamma[2][5] = Real_t(-1.);
805    gamma[2][6] = Real_t( 1.);
806    gamma[2][7] = Real_t(-1.);
807    gamma[3][0] = Real_t(-1.);
808    gamma[3][1] = Real_t( 1.);
809    gamma[3][2] = Real_t(-1.);
810    gamma[3][3] = Real_t( 1.);
811    gamma[3][4] = Real_t( 1.);
812    gamma[3][5] = Real_t(-1.);
813    gamma[3][6] = Real_t( 1.);
814    gamma[3][7] = Real_t(-1.);
815 
816 /*************************************************/
817 /*    compute the hourglass modes */
818 
819 
820 #ifdef LULESH_USE_OPENMP
821 #pragma omp parallel for firstprivate(numElem, hourg)
822 #endif
823    for(Index_t i2=0;i2<numElem;++i2){
824       Real_t *fx_local, *fy_local, *fz_local ;
825       Real_t hgfx[8], hgfy[8], hgfz[8] ;
826 
827       Real_t coefficient;
828 
829       Real_t hourgam[8][4];
830       Real_t xd1[8], yd1[8], zd1[8] ;
831 
832       const Index_t *elemToNode = domain.nodelist(i2);
833       Index_t i3=8*i2;
834       Real_t volinv=Real_t(1.0)/determ[i2];
835       Real_t ss1, mass1, volume13 ;
836       for(Index_t i1=0;i1<4;++i1){
837 
838          Real_t hourmodx =
839             x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
840             x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
841             x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
842             x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
843 
844          Real_t hourmody =
845             y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
846             y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
847             y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
848             y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
849 
850          Real_t hourmodz =
851             z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
852             z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
853             z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
854             z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
855 
856          hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
857                                                   dvdy[i3  ] * hourmody +
858                                                   dvdz[i3  ] * hourmodz );
859 
860          hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
861                                                   dvdy[i3+1] * hourmody +
862                                                   dvdz[i3+1] * hourmodz );
863 
864          hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
865                                                   dvdy[i3+2] * hourmody +
866                                                   dvdz[i3+2] * hourmodz );
867 
868          hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
869                                                   dvdy[i3+3] * hourmody +
870                                                   dvdz[i3+3] * hourmodz );
871 
872          hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
873                                                   dvdy[i3+4] * hourmody +
874                                                   dvdz[i3+4] * hourmodz );
875 
876          hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
877                                                   dvdy[i3+5] * hourmody +
878                                                   dvdz[i3+5] * hourmodz );
879 
880          hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
881                                                   dvdy[i3+6] * hourmody +
882                                                   dvdz[i3+6] * hourmodz );
883 
884          hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
885                                                   dvdy[i3+7] * hourmody +
886                                                   dvdz[i3+7] * hourmodz );
887 
888       }
889 
890       /* compute forces */
891       /* store forces into h arrays (force arrays) */
892 
893       ss1=domain.ss(i2);
894       mass1=domain.elemMass(i2);
895       volume13=CBRT(determ[i2]);
896 
897       Index_t n0si2 = elemToNode[0];
898       Index_t n1si2 = elemToNode[1];
899       Index_t n2si2 = elemToNode[2];
900       Index_t n3si2 = elemToNode[3];
901       Index_t n4si2 = elemToNode[4];
902       Index_t n5si2 = elemToNode[5];
903       Index_t n6si2 = elemToNode[6];
904       Index_t n7si2 = elemToNode[7];
905 
906       xd1[0] = domain.xd(n0si2);
907       xd1[1] = domain.xd(n1si2);
908       xd1[2] = domain.xd(n2si2);
909       xd1[3] = domain.xd(n3si2);
910       xd1[4] = domain.xd(n4si2);
911       xd1[5] = domain.xd(n5si2);
912       xd1[6] = domain.xd(n6si2);
913       xd1[7] = domain.xd(n7si2);
914 
915       yd1[0] = domain.yd(n0si2);
916       yd1[1] = domain.yd(n1si2);
917       yd1[2] = domain.yd(n2si2);
918       yd1[3] = domain.yd(n3si2);
919       yd1[4] = domain.yd(n4si2);
920       yd1[5] = domain.yd(n5si2);
921       yd1[6] = domain.yd(n6si2);
922       yd1[7] = domain.yd(n7si2);
923 
924       zd1[0] = domain.zd(n0si2);
925       zd1[1] = domain.zd(n1si2);
926       zd1[2] = domain.zd(n2si2);
927       zd1[3] = domain.zd(n3si2);
928       zd1[4] = domain.zd(n4si2);
929       zd1[5] = domain.zd(n5si2);
930       zd1[6] = domain.zd(n6si2);
931       zd1[7] = domain.zd(n7si2);
932 
933       coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
934 
935       CalcElemFBHourglassForce(xd1,yd1,zd1,
936                       hourgam,
937                       coefficient, hgfx, hgfy, hgfz);
938 
939       // With the threaded version, we write into local arrays per elem
940       // so we don't have to worry about race conditions
941       if (numthreads > 1) {
942          fx_local = &fx_elem[i3] ;
943          fx_local[0] = hgfx[0];
944          fx_local[1] = hgfx[1];
945          fx_local[2] = hgfx[2];
946          fx_local[3] = hgfx[3];
947          fx_local[4] = hgfx[4];
948          fx_local[5] = hgfx[5];
949          fx_local[6] = hgfx[6];
950          fx_local[7] = hgfx[7];
951 
952          fy_local = &fy_elem[i3] ;
953          fy_local[0] = hgfy[0];
954          fy_local[1] = hgfy[1];
955          fy_local[2] = hgfy[2];
956          fy_local[3] = hgfy[3];
957          fy_local[4] = hgfy[4];
958          fy_local[5] = hgfy[5];
959          fy_local[6] = hgfy[6];
960          fy_local[7] = hgfy[7];
961 
962          fz_local = &fz_elem[i3] ;
963          fz_local[0] = hgfz[0];
964          fz_local[1] = hgfz[1];
965          fz_local[2] = hgfz[2];
966          fz_local[3] = hgfz[3];
967          fz_local[4] = hgfz[4];
968          fz_local[5] = hgfz[5];
969          fz_local[6] = hgfz[6];
970          fz_local[7] = hgfz[7];
971       }
972       else {
973          domain.fx(n0si2) += hgfx[0];
974          domain.fy(n0si2) += hgfy[0];
975          domain.fz(n0si2) += hgfz[0];
976 
977          domain.fx(n1si2) += hgfx[1];
978          domain.fy(n1si2) += hgfy[1];
979          domain.fz(n1si2) += hgfz[1];
980 
981          domain.fx(n2si2) += hgfx[2];
982          domain.fy(n2si2) += hgfy[2];
983          domain.fz(n2si2) += hgfz[2];
984 
985          domain.fx(n3si2) += hgfx[3];
986          domain.fy(n3si2) += hgfy[3];
987          domain.fz(n3si2) += hgfz[3];
988 
989          domain.fx(n4si2) += hgfx[4];
990          domain.fy(n4si2) += hgfy[4];
991          domain.fz(n4si2) += hgfz[4];
992 
993          domain.fx(n5si2) += hgfx[5];
994          domain.fy(n5si2) += hgfy[5];
995          domain.fz(n5si2) += hgfz[5];
996 
997          domain.fx(n6si2) += hgfx[6];
998          domain.fy(n6si2) += hgfy[6];
999          domain.fz(n6si2) += hgfz[6];
1000 
1001          domain.fx(n7si2) += hgfx[7];
1002          domain.fy(n7si2) += hgfy[7];
1003          domain.fz(n7si2) += hgfz[7];
1004       }
1005    }
1006 
1007    if (numthreads > 1) {
1008      // Collect the data from the local arrays into the final force arrays
1009 #ifdef LULESH_USE_OPENMP
1010 #pragma omp parallel for firstprivate(numNode)
1011 #endif
1012       for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
1013       {
1014          Index_t count = domain.nodeElemCount(gnode) ;
1015          Index_t *cornerList = domain.nodeElemCornerList(gnode) ;
1016          Real_t fx_tmp = Real_t(0.0) ;
1017          Real_t fy_tmp = Real_t(0.0) ;
1018          Real_t fz_tmp = Real_t(0.0) ;
1019          for (Index_t i=0 ; i < count ; ++i) {
1020             Index_t elem = cornerList[i] ;
1021             fx_tmp += fx_elem[elem] ;
1022             fy_tmp += fy_elem[elem] ;
1023             fz_tmp += fz_elem[elem] ;
1024          }
1025          domain.fx(gnode) += fx_tmp ;
1026          domain.fy(gnode) += fy_tmp ;
1027          domain.fz(gnode) += fz_tmp ;
1028       }
1029       Release(&fz_elem) ;
1030       Release(&fy_elem) ;
1031       Release(&fx_elem) ;
1032    }
1033 }
1034 
1035 /******************************************/
1036 
1037 static inline
CalcHourglassControlForElems(Domain & domain,Real_t determ[],Real_t hgcoef)1038 void CalcHourglassControlForElems(Domain& domain,
1039                                   Real_t determ[], Real_t hgcoef)
1040 {
1041    Index_t numElem = domain.numElem() ;
1042    Index_t numElem8 = numElem * 8 ;
1043    Real_t *dvdx = Allocate<Real_t>(numElem8) ;
1044    Real_t *dvdy = Allocate<Real_t>(numElem8) ;
1045    Real_t *dvdz = Allocate<Real_t>(numElem8) ;
1046    Real_t *x8n  = Allocate<Real_t>(numElem8) ;
1047    Real_t *y8n  = Allocate<Real_t>(numElem8) ;
1048    Real_t *z8n  = Allocate<Real_t>(numElem8) ;
1049 
1050    /* start loop over elements */
1051 #ifdef LULESH_USE_OPENMP
1052 #pragma omp parallel for firstprivate(numElem)
1053 #endif
1054    for (Index_t i=0 ; i<numElem ; ++i){
1055       Real_t  x1[8],  y1[8],  z1[8] ;
1056       Real_t pfx[8], pfy[8], pfz[8] ;
1057 
1058       Index_t* elemToNode = domain.nodelist(i);
1059       CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
1060 
1061       CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
1062 
1063       /* load into temporary storage for FB Hour Glass control */
1064       for(Index_t ii=0;ii<8;++ii){
1065          Index_t jj=8*i+ii;
1066 
1067          dvdx[jj] = pfx[ii];
1068          dvdy[jj] = pfy[ii];
1069          dvdz[jj] = pfz[ii];
1070          x8n[jj]  = x1[ii];
1071          y8n[jj]  = y1[ii];
1072          z8n[jj]  = z1[ii];
1073       }
1074 
1075       determ[i] = domain.volo(i) * domain.v(i);
1076 
1077       /* Do a check for negative volumes */
1078       if ( domain.v(i) <= Real_t(0.0) ) {
1079 #if USE_MPI
1080          MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
1081 #else
1082          exit(VolumeError);
1083 #endif
1084       }
1085    }
1086 
1087    if ( hgcoef > Real_t(0.) ) {
1088       CalcFBHourglassForceForElems( domain,
1089                                     determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
1090                                     hgcoef, numElem, domain.numNode()) ;
1091    }
1092 
1093    Release(&z8n) ;
1094    Release(&y8n) ;
1095    Release(&x8n) ;
1096    Release(&dvdz) ;
1097    Release(&dvdy) ;
1098    Release(&dvdx) ;
1099 
1100    return ;
1101 }
1102 
1103 /******************************************/
1104 
1105 static inline
CalcVolumeForceForElems(Domain & domain)1106 void CalcVolumeForceForElems(Domain& domain)
1107 {
1108    Index_t numElem = domain.numElem() ;
1109    if (numElem != 0) {
1110       Real_t  hgcoef = domain.hgcoef() ;
1111       Real_t *sigxx  = Allocate<Real_t>(numElem) ;
1112       Real_t *sigyy  = Allocate<Real_t>(numElem) ;
1113       Real_t *sigzz  = Allocate<Real_t>(numElem) ;
1114       Real_t *determ = Allocate<Real_t>(numElem) ;
1115 
1116       /* Sum contributions to total stress tensor */
1117       InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem);
1118 
1119       // call elemlib stress integration loop to produce nodal forces from
1120       // material stresses.
1121       IntegrateStressForElems( domain,
1122                                sigxx, sigyy, sigzz, determ, numElem,
1123                                domain.numNode()) ;
1124 
1125       // check for negative element volume
1126 #ifdef LULESH_USE_OPENMP
1127 #pragma omp parallel for firstprivate(numElem)
1128 #endif
1129       for ( Index_t k=0 ; k<numElem ; ++k ) {
1130          if (determ[k] <= Real_t(0.0)) {
1131 #if USE_MPI
1132             MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
1133 #else
1134             exit(VolumeError);
1135 #endif
1136          }
1137       }
1138 
1139       CalcHourglassControlForElems(domain, determ, hgcoef) ;
1140 
1141       Release(&determ) ;
1142       Release(&sigzz) ;
1143       Release(&sigyy) ;
1144       Release(&sigxx) ;
1145    }
1146 }
1147 
1148 /******************************************/
1149 
CalcForceForNodes(Domain & domain)1150 static inline void CalcForceForNodes(Domain& domain)
1151 {
1152   Index_t numNode = domain.numNode() ;
1153 
1154 #if USE_MPI
1155   CommRecv(domain, MSG_COMM_SBN, 3,
1156            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
1157            true, false) ;
1158 #endif
1159 
1160 #ifdef LULESH_USE_OPENMP
1161 #pragma omp parallel for firstprivate(numNode)
1162 #endif
1163   for (Index_t i=0; i<numNode; ++i) {
1164      domain.fx(i) = Real_t(0.0) ;
1165      domain.fy(i) = Real_t(0.0) ;
1166      domain.fz(i) = Real_t(0.0) ;
1167   }
1168 
1169   /* Calcforce calls partial, force, hourq */
1170   CalcVolumeForceForElems(domain) ;
1171 
1172 #if USE_MPI
1173   Domain_member fieldData[3] ;
1174   fieldData[0] = &Domain::fx ;
1175   fieldData[1] = &Domain::fy ;
1176   fieldData[2] = &Domain::fz ;
1177 
1178   CommSend(domain, MSG_COMM_SBN, 3, fieldData,
1179            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() +  1,
1180            true, false) ;
1181   CommSBN(domain, 3, fieldData) ;
1182 #endif
1183 }
1184 
1185 /******************************************/
1186 
1187 static inline
CalcAccelerationForNodes(Domain & domain,Index_t numNode)1188 void CalcAccelerationForNodes(Domain &domain, Index_t numNode)
1189 {
1190 #ifdef LULESH_USE_OPENMP
1191 #pragma omp parallel for firstprivate(numNode)
1192 #endif
1193    for (Index_t i = 0; i < numNode; ++i) {
1194       domain.xdd(i) = domain.fx(i) / domain.nodalMass(i);
1195       domain.ydd(i) = domain.fy(i) / domain.nodalMass(i);
1196       domain.zdd(i) = domain.fz(i) / domain.nodalMass(i);
1197    }
1198 }
1199 
1200 /******************************************/
1201 
1202 static inline
ApplyAccelerationBoundaryConditionsForNodes(Domain & domain)1203 void ApplyAccelerationBoundaryConditionsForNodes(Domain& domain)
1204 {
1205    Index_t size = domain.sizeX();
1206    Index_t numNodeBC = (size+1)*(size+1) ;
1207 
1208 #ifdef LULESH_USE_OPENMP
1209 #pragma omp parallel
1210 #endif
1211    {
1212       if (!domain.symmXempty() != 0) {
1213 #ifdef LULESH_USE_OPENMP
1214 #pragma omp for nowait firstprivate(numNodeBC)
1215 #endif
1216          for(Index_t i=0 ; i<numNodeBC ; ++i)
1217             domain.xdd(domain.symmX(i)) = Real_t(0.0) ;
1218       }
1219 
1220       if (!domain.symmYempty() != 0) {
1221 #ifdef LULESH_USE_OPENMP
1222 #pragma omp for nowait firstprivate(numNodeBC)
1223 #endif
1224          for(Index_t i=0 ; i<numNodeBC ; ++i)
1225             domain.ydd(domain.symmY(i)) = Real_t(0.0) ;
1226       }
1227 
1228       if (!domain.symmZempty() != 0) {
1229 #ifdef LULESH_USE_OPENMP
1230 #pragma omp for nowait firstprivate(numNodeBC)
1231 #endif
1232          for(Index_t i=0 ; i<numNodeBC ; ++i)
1233             domain.zdd(domain.symmZ(i)) = Real_t(0.0) ;
1234       }
1235    }
1236 }
1237 
1238 /******************************************/
1239 
1240 static inline
CalcVelocityForNodes(Domain & domain,const Real_t dt,const Real_t u_cut,Index_t numNode)1241 void CalcVelocityForNodes(Domain &domain, const Real_t dt, const Real_t u_cut,
1242                           Index_t numNode)
1243 {
1244 
1245 #ifdef LULESH_USE_OPENMP
1246 #pragma omp parallel for firstprivate(numNode)
1247 #endif
1248    for ( Index_t i = 0 ; i < numNode ; ++i )
1249    {
1250      Real_t xdtmp, ydtmp, zdtmp ;
1251 
1252      xdtmp = domain.xd(i) + domain.xdd(i) * dt ;
1253      if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
1254      domain.xd(i) = xdtmp ;
1255 
1256      ydtmp = domain.yd(i) + domain.ydd(i) * dt ;
1257      if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
1258      domain.yd(i) = ydtmp ;
1259 
1260      zdtmp = domain.zd(i) + domain.zdd(i) * dt ;
1261      if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
1262      domain.zd(i) = zdtmp ;
1263    }
1264 }
1265 
1266 /******************************************/
1267 
1268 static inline
CalcPositionForNodes(Domain & domain,const Real_t dt,Index_t numNode)1269 void CalcPositionForNodes(Domain &domain, const Real_t dt, Index_t numNode)
1270 {
1271 #ifdef LULESH_USE_OPENMP
1272 #pragma omp parallel for firstprivate(numNode)
1273 #endif
1274    for ( Index_t i = 0 ; i < numNode ; ++i )
1275    {
1276      domain.x(i) += domain.xd(i) * dt ;
1277      domain.y(i) += domain.yd(i) * dt ;
1278      domain.z(i) += domain.zd(i) * dt ;
1279    }
1280 }
1281 
1282 /******************************************/
1283 
1284 static inline
LagrangeNodal(Domain & domain)1285 void LagrangeNodal(Domain& domain)
1286 {
1287 #ifdef SEDOV_SYNC_POS_VEL_EARLY
1288    Domain_member fieldData[6] ;
1289 #endif
1290 
1291    const Real_t delt = domain.deltatime() ;
1292    Real_t u_cut = domain.u_cut() ;
1293 
1294   /* time of boundary condition evaluation is beginning of step for force and
1295    * acceleration boundary conditions. */
1296   CalcForceForNodes(domain);
1297 
1298 #if USE_MPI
1299 #ifdef SEDOV_SYNC_POS_VEL_EARLY
1300    CommRecv(domain, MSG_SYNC_POS_VEL, 6,
1301             domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
1302             false, false) ;
1303 #endif
1304 #endif
1305 
1306    CalcAccelerationForNodes(domain, domain.numNode());
1307 
1308    ApplyAccelerationBoundaryConditionsForNodes(domain);
1309 
1310    CalcVelocityForNodes( domain, delt, u_cut, domain.numNode()) ;
1311 
1312    CalcPositionForNodes( domain, delt, domain.numNode() );
1313 #if USE_MPI
1314 #ifdef SEDOV_SYNC_POS_VEL_EARLY
1315   fieldData[0] = &Domain::x ;
1316   fieldData[1] = &Domain::y ;
1317   fieldData[2] = &Domain::z ;
1318   fieldData[3] = &Domain::xd ;
1319   fieldData[4] = &Domain::yd ;
1320   fieldData[5] = &Domain::zd ;
1321 
1322    CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData,
1323             domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
1324             false, false) ;
1325    CommSyncPosVel(domain) ;
1326 #endif
1327 #endif
1328 
1329   return;
1330 }
1331 
1332 /******************************************/
1333 
1334 static inline
CalcElemVolume(const Real_t x0,const Real_t x1,const Real_t x2,const Real_t x3,const Real_t x4,const Real_t x5,const Real_t x6,const Real_t x7,const Real_t y0,const Real_t y1,const Real_t y2,const Real_t y3,const Real_t y4,const Real_t y5,const Real_t y6,const Real_t y7,const Real_t z0,const Real_t z1,const Real_t z2,const Real_t z3,const Real_t z4,const Real_t z5,const Real_t z6,const Real_t z7)1335 Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
1336                const Real_t x2, const Real_t x3,
1337                const Real_t x4, const Real_t x5,
1338                const Real_t x6, const Real_t x7,
1339                const Real_t y0, const Real_t y1,
1340                const Real_t y2, const Real_t y3,
1341                const Real_t y4, const Real_t y5,
1342                const Real_t y6, const Real_t y7,
1343                const Real_t z0, const Real_t z1,
1344                const Real_t z2, const Real_t z3,
1345                const Real_t z4, const Real_t z5,
1346                const Real_t z6, const Real_t z7 )
1347 {
1348   Real_t twelveth = Real_t(1.0)/Real_t(12.0);
1349 
1350   Real_t dx61 = x6 - x1;
1351   Real_t dy61 = y6 - y1;
1352   Real_t dz61 = z6 - z1;
1353 
1354   Real_t dx70 = x7 - x0;
1355   Real_t dy70 = y7 - y0;
1356   Real_t dz70 = z7 - z0;
1357 
1358   Real_t dx63 = x6 - x3;
1359   Real_t dy63 = y6 - y3;
1360   Real_t dz63 = z6 - z3;
1361 
1362   Real_t dx20 = x2 - x0;
1363   Real_t dy20 = y2 - y0;
1364   Real_t dz20 = z2 - z0;
1365 
1366   Real_t dx50 = x5 - x0;
1367   Real_t dy50 = y5 - y0;
1368   Real_t dz50 = z5 - z0;
1369 
1370   Real_t dx64 = x6 - x4;
1371   Real_t dy64 = y6 - y4;
1372   Real_t dz64 = z6 - z4;
1373 
1374   Real_t dx31 = x3 - x1;
1375   Real_t dy31 = y3 - y1;
1376   Real_t dz31 = z3 - z1;
1377 
1378   Real_t dx72 = x7 - x2;
1379   Real_t dy72 = y7 - y2;
1380   Real_t dz72 = z7 - z2;
1381 
1382   Real_t dx43 = x4 - x3;
1383   Real_t dy43 = y4 - y3;
1384   Real_t dz43 = z4 - z3;
1385 
1386   Real_t dx57 = x5 - x7;
1387   Real_t dy57 = y5 - y7;
1388   Real_t dz57 = z5 - z7;
1389 
1390   Real_t dx14 = x1 - x4;
1391   Real_t dy14 = y1 - y4;
1392   Real_t dz14 = z1 - z4;
1393 
1394   Real_t dx25 = x2 - x5;
1395   Real_t dy25 = y2 - y5;
1396   Real_t dz25 = z2 - z5;
1397 
1398 #define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
1399    ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
1400 
1401   Real_t volume =
1402     TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
1403        dy31 + dy72, dy63, dy20,
1404        dz31 + dz72, dz63, dz20) +
1405     TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
1406        dy43 + dy57, dy64, dy70,
1407        dz43 + dz57, dz64, dz70) +
1408     TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
1409        dy14 + dy25, dy61, dy50,
1410        dz14 + dz25, dz61, dz50);
1411 
1412 #undef TRIPLE_PRODUCT
1413 
1414   volume *= twelveth;
1415 
1416   return volume ;
1417 }
1418 
1419 /******************************************/
1420 
1421 //inline
CalcElemVolume(const Real_t x[8],const Real_t y[8],const Real_t z[8])1422 Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
1423 {
1424 return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
1425                        y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
1426                        z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
1427 }
1428 
1429 /******************************************/
1430 
1431 static inline
AreaFace(const Real_t x0,const Real_t x1,const Real_t x2,const Real_t x3,const Real_t y0,const Real_t y1,const Real_t y2,const Real_t y3,const Real_t z0,const Real_t z1,const Real_t z2,const Real_t z3)1432 Real_t AreaFace( const Real_t x0, const Real_t x1,
1433                  const Real_t x2, const Real_t x3,
1434                  const Real_t y0, const Real_t y1,
1435                  const Real_t y2, const Real_t y3,
1436                  const Real_t z0, const Real_t z1,
1437                  const Real_t z2, const Real_t z3)
1438 {
1439    Real_t fx = (x2 - x0) - (x3 - x1);
1440    Real_t fy = (y2 - y0) - (y3 - y1);
1441    Real_t fz = (z2 - z0) - (z3 - z1);
1442    Real_t gx = (x2 - x0) + (x3 - x1);
1443    Real_t gy = (y2 - y0) + (y3 - y1);
1444    Real_t gz = (z2 - z0) + (z3 - z1);
1445    Real_t area =
1446       (fx * fx + fy * fy + fz * fz) *
1447       (gx * gx + gy * gy + gz * gz) -
1448       (fx * gx + fy * gy + fz * gz) *
1449       (fx * gx + fy * gy + fz * gz);
1450    return area ;
1451 }
1452 
1453 /******************************************/
1454 
1455 static inline
CalcElemCharacteristicLength(const Real_t x[8],const Real_t y[8],const Real_t z[8],const Real_t volume)1456 Real_t CalcElemCharacteristicLength( const Real_t x[8],
1457                                      const Real_t y[8],
1458                                      const Real_t z[8],
1459                                      const Real_t volume)
1460 {
1461    Real_t a, charLength = Real_t(0.0);
1462 
1463    a = AreaFace(x[0],x[1],x[2],x[3],
1464                 y[0],y[1],y[2],y[3],
1465                 z[0],z[1],z[2],z[3]) ;
1466    charLength = std::max(a,charLength) ;
1467 
1468    a = AreaFace(x[4],x[5],x[6],x[7],
1469                 y[4],y[5],y[6],y[7],
1470                 z[4],z[5],z[6],z[7]) ;
1471    charLength = std::max(a,charLength) ;
1472 
1473    a = AreaFace(x[0],x[1],x[5],x[4],
1474                 y[0],y[1],y[5],y[4],
1475                 z[0],z[1],z[5],z[4]) ;
1476    charLength = std::max(a,charLength) ;
1477 
1478    a = AreaFace(x[1],x[2],x[6],x[5],
1479                 y[1],y[2],y[6],y[5],
1480                 z[1],z[2],z[6],z[5]) ;
1481    charLength = std::max(a,charLength) ;
1482 
1483    a = AreaFace(x[2],x[3],x[7],x[6],
1484                 y[2],y[3],y[7],y[6],
1485                 z[2],z[3],z[7],z[6]) ;
1486    charLength = std::max(a,charLength) ;
1487 
1488    a = AreaFace(x[3],x[0],x[4],x[7],
1489                 y[3],y[0],y[4],y[7],
1490                 z[3],z[0],z[4],z[7]) ;
1491    charLength = std::max(a,charLength) ;
1492 
1493    charLength = Real_t(4.0) * volume / SQRT(charLength);
1494 
1495    return charLength;
1496 }
1497 
1498 /******************************************/
1499 
1500 static inline
CalcElemVelocityGradient(const Real_t * const xvel,const Real_t * const yvel,const Real_t * const zvel,const Real_t b[][8],const Real_t detJ,Real_t * const d)1501 void CalcElemVelocityGradient( const Real_t* const xvel,
1502                                 const Real_t* const yvel,
1503                                 const Real_t* const zvel,
1504                                 const Real_t b[][8],
1505                                 const Real_t detJ,
1506                                 Real_t* const d )
1507 {
1508   const Real_t inv_detJ = Real_t(1.0) / detJ ;
1509   Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
1510   const Real_t* const pfx = b[0];
1511   const Real_t* const pfy = b[1];
1512   const Real_t* const pfz = b[2];
1513 
1514   d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
1515                      + pfx[1] * (xvel[1]-xvel[7])
1516                      + pfx[2] * (xvel[2]-xvel[4])
1517                      + pfx[3] * (xvel[3]-xvel[5]) );
1518 
1519   d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
1520                      + pfy[1] * (yvel[1]-yvel[7])
1521                      + pfy[2] * (yvel[2]-yvel[4])
1522                      + pfy[3] * (yvel[3]-yvel[5]) );
1523 
1524   d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
1525                      + pfz[1] * (zvel[1]-zvel[7])
1526                      + pfz[2] * (zvel[2]-zvel[4])
1527                      + pfz[3] * (zvel[3]-zvel[5]) );
1528 
1529   dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
1530                       + pfx[1] * (yvel[1]-yvel[7])
1531                       + pfx[2] * (yvel[2]-yvel[4])
1532                       + pfx[3] * (yvel[3]-yvel[5]) );
1533 
1534   dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
1535                       + pfy[1] * (xvel[1]-xvel[7])
1536                       + pfy[2] * (xvel[2]-xvel[4])
1537                       + pfy[3] * (xvel[3]-xvel[5]) );
1538 
1539   dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
1540                       + pfx[1] * (zvel[1]-zvel[7])
1541                       + pfx[2] * (zvel[2]-zvel[4])
1542                       + pfx[3] * (zvel[3]-zvel[5]) );
1543 
1544   dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
1545                       + pfz[1] * (xvel[1]-xvel[7])
1546                       + pfz[2] * (xvel[2]-xvel[4])
1547                       + pfz[3] * (xvel[3]-xvel[5]) );
1548 
1549   dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
1550                       + pfy[1] * (zvel[1]-zvel[7])
1551                       + pfy[2] * (zvel[2]-zvel[4])
1552                       + pfy[3] * (zvel[3]-zvel[5]) );
1553 
1554   dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
1555                       + pfz[1] * (yvel[1]-yvel[7])
1556                       + pfz[2] * (yvel[2]-yvel[4])
1557                       + pfz[3] * (yvel[3]-yvel[5]) );
1558   d[5]  = Real_t( .5) * ( dxddy + dyddx );
1559   d[4]  = Real_t( .5) * ( dxddz + dzddx );
1560   d[3]  = Real_t( .5) * ( dzddy + dyddz );
1561 }
1562 
1563 /******************************************/
1564 
1565 //static inline
CalcKinematicsForElems(Domain & domain,Real_t * vnew,Real_t deltaTime,Index_t numElem)1566 void CalcKinematicsForElems( Domain &domain, Real_t *vnew,
1567                              Real_t deltaTime, Index_t numElem )
1568 {
1569 
1570   // loop over all elements
1571 #ifdef LULESH_USE_OPENMP
1572 #pragma omp parallel for firstprivate(numElem, deltaTime)
1573 #endif
1574   for( Index_t k=0 ; k<numElem ; ++k )
1575   {
1576     Real_t B[3][8] ; /** shape function derivatives */
1577     Real_t D[6] ;
1578     Real_t x_local[8] ;
1579     Real_t y_local[8] ;
1580     Real_t z_local[8] ;
1581     Real_t xd_local[8] ;
1582     Real_t yd_local[8] ;
1583     Real_t zd_local[8] ;
1584     Real_t detJ = Real_t(0.0) ;
1585 
1586     Real_t volume ;
1587     Real_t relativeVolume ;
1588     const Index_t* const elemToNode = domain.nodelist(k) ;
1589 
1590     // get nodal coordinates from global arrays and copy into local arrays.
1591     CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
1592 
1593     // volume calculations
1594     volume = CalcElemVolume(x_local, y_local, z_local );
1595     relativeVolume = volume / domain.volo(k) ;
1596     vnew[k] = relativeVolume ;
1597     domain.delv(k) = relativeVolume - domain.v(k) ;
1598 
1599     // set characteristic length
1600     domain.arealg(k) = CalcElemCharacteristicLength(x_local, y_local, z_local,
1601                                              volume);
1602 
1603     // get nodal velocities from global array and copy into local arrays.
1604     for( Index_t lnode=0 ; lnode<8 ; ++lnode )
1605     {
1606       Index_t gnode = elemToNode[lnode];
1607       xd_local[lnode] = domain.xd(gnode);
1608       yd_local[lnode] = domain.yd(gnode);
1609       zd_local[lnode] = domain.zd(gnode);
1610     }
1611 
1612     Real_t dt2 = Real_t(0.5) * deltaTime;
1613     for ( Index_t j=0 ; j<8 ; ++j )
1614     {
1615        x_local[j] -= dt2 * xd_local[j];
1616        y_local[j] -= dt2 * yd_local[j];
1617        z_local[j] -= dt2 * zd_local[j];
1618     }
1619 
1620     CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
1621                                       B, &detJ );
1622 
1623     CalcElemVelocityGradient( xd_local, yd_local, zd_local,
1624                                B, detJ, D );
1625 
1626     // put velocity gradient quantities into their global arrays.
1627     domain.dxx(k) = D[0];
1628     domain.dyy(k) = D[1];
1629     domain.dzz(k) = D[2];
1630   }
1631 }
1632 
1633 /******************************************/
1634 
1635 static inline
CalcLagrangeElements(Domain & domain,Real_t * vnew)1636 void CalcLagrangeElements(Domain& domain, Real_t* vnew)
1637 {
1638    Index_t numElem = domain.numElem() ;
1639    if (numElem > 0) {
1640       const Real_t deltatime = domain.deltatime() ;
1641 
1642       domain.AllocateStrains(numElem);
1643 
1644       CalcKinematicsForElems(domain, vnew, deltatime, numElem) ;
1645 
1646       // element loop to do some stuff not included in the elemlib function.
1647 #ifdef LULESH_USE_OPENMP
1648 #pragma omp parallel for firstprivate(numElem)
1649 #endif
1650       for ( Index_t k=0 ; k<numElem ; ++k )
1651       {
1652          // calc strain rate and apply as constraint (only done in FB element)
1653          Real_t vdov = domain.dxx(k) + domain.dyy(k) + domain.dzz(k) ;
1654          Real_t vdovthird = vdov/Real_t(3.0) ;
1655 
1656          // make the rate of deformation tensor deviatoric
1657          domain.vdov(k) = vdov ;
1658          domain.dxx(k) -= vdovthird ;
1659          domain.dyy(k) -= vdovthird ;
1660          domain.dzz(k) -= vdovthird ;
1661 
1662         // See if any volumes are negative, and take appropriate action.
1663          if (vnew[k] <= Real_t(0.0))
1664         {
1665 #if USE_MPI
1666            MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
1667 #else
1668            exit(VolumeError);
1669 #endif
1670         }
1671       }
1672       domain.DeallocateStrains();
1673    }
1674 }
1675 
1676 /******************************************/
1677 
1678 static inline
CalcMonotonicQGradientsForElems(Domain & domain,Real_t vnew[])1679 void CalcMonotonicQGradientsForElems(Domain& domain, Real_t vnew[])
1680 {
1681    Index_t numElem = domain.numElem();
1682 
1683 #ifdef LULESH_USE_OPENMP
1684 #pragma omp parallel for firstprivate(numElem)
1685 #endif
1686    for (Index_t i = 0 ; i < numElem ; ++i ) {
1687       const Real_t ptiny = Real_t(1.e-36) ;
1688       Real_t ax,ay,az ;
1689       Real_t dxv,dyv,dzv ;
1690 
1691       const Index_t *elemToNode = domain.nodelist(i);
1692       Index_t n0 = elemToNode[0] ;
1693       Index_t n1 = elemToNode[1] ;
1694       Index_t n2 = elemToNode[2] ;
1695       Index_t n3 = elemToNode[3] ;
1696       Index_t n4 = elemToNode[4] ;
1697       Index_t n5 = elemToNode[5] ;
1698       Index_t n6 = elemToNode[6] ;
1699       Index_t n7 = elemToNode[7] ;
1700 
1701       Real_t x0 = domain.x(n0) ;
1702       Real_t x1 = domain.x(n1) ;
1703       Real_t x2 = domain.x(n2) ;
1704       Real_t x3 = domain.x(n3) ;
1705       Real_t x4 = domain.x(n4) ;
1706       Real_t x5 = domain.x(n5) ;
1707       Real_t x6 = domain.x(n6) ;
1708       Real_t x7 = domain.x(n7) ;
1709 
1710       Real_t y0 = domain.y(n0) ;
1711       Real_t y1 = domain.y(n1) ;
1712       Real_t y2 = domain.y(n2) ;
1713       Real_t y3 = domain.y(n3) ;
1714       Real_t y4 = domain.y(n4) ;
1715       Real_t y5 = domain.y(n5) ;
1716       Real_t y6 = domain.y(n6) ;
1717       Real_t y7 = domain.y(n7) ;
1718 
1719       Real_t z0 = domain.z(n0) ;
1720       Real_t z1 = domain.z(n1) ;
1721       Real_t z2 = domain.z(n2) ;
1722       Real_t z3 = domain.z(n3) ;
1723       Real_t z4 = domain.z(n4) ;
1724       Real_t z5 = domain.z(n5) ;
1725       Real_t z6 = domain.z(n6) ;
1726       Real_t z7 = domain.z(n7) ;
1727 
1728       Real_t xv0 = domain.xd(n0) ;
1729       Real_t xv1 = domain.xd(n1) ;
1730       Real_t xv2 = domain.xd(n2) ;
1731       Real_t xv3 = domain.xd(n3) ;
1732       Real_t xv4 = domain.xd(n4) ;
1733       Real_t xv5 = domain.xd(n5) ;
1734       Real_t xv6 = domain.xd(n6) ;
1735       Real_t xv7 = domain.xd(n7) ;
1736 
1737       Real_t yv0 = domain.yd(n0) ;
1738       Real_t yv1 = domain.yd(n1) ;
1739       Real_t yv2 = domain.yd(n2) ;
1740       Real_t yv3 = domain.yd(n3) ;
1741       Real_t yv4 = domain.yd(n4) ;
1742       Real_t yv5 = domain.yd(n5) ;
1743       Real_t yv6 = domain.yd(n6) ;
1744       Real_t yv7 = domain.yd(n7) ;
1745 
1746       Real_t zv0 = domain.zd(n0) ;
1747       Real_t zv1 = domain.zd(n1) ;
1748       Real_t zv2 = domain.zd(n2) ;
1749       Real_t zv3 = domain.zd(n3) ;
1750       Real_t zv4 = domain.zd(n4) ;
1751       Real_t zv5 = domain.zd(n5) ;
1752       Real_t zv6 = domain.zd(n6) ;
1753       Real_t zv7 = domain.zd(n7) ;
1754 
1755       Real_t vol = domain.volo(i)*vnew[i] ;
1756       Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
1757 
1758       Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
1759       Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
1760       Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
1761 
1762       Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
1763       Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
1764       Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
1765 
1766       Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
1767       Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
1768       Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
1769 
1770       /* find delvk and delxk ( i cross j ) */
1771 
1772       ax = dyi*dzj - dzi*dyj ;
1773       ay = dzi*dxj - dxi*dzj ;
1774       az = dxi*dyj - dyi*dxj ;
1775 
1776       domain.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
1777 
1778       ax *= norm ;
1779       ay *= norm ;
1780       az *= norm ;
1781 
1782       dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
1783       dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
1784       dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
1785 
1786       domain.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
1787 
1788       /* find delxi and delvi ( j cross k ) */
1789 
1790       ax = dyj*dzk - dzj*dyk ;
1791       ay = dzj*dxk - dxj*dzk ;
1792       az = dxj*dyk - dyj*dxk ;
1793 
1794       domain.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
1795 
1796       ax *= norm ;
1797       ay *= norm ;
1798       az *= norm ;
1799 
1800       dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
1801       dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
1802       dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
1803 
1804       domain.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
1805 
1806       /* find delxj and delvj ( k cross i ) */
1807 
1808       ax = dyk*dzi - dzk*dyi ;
1809       ay = dzk*dxi - dxk*dzi ;
1810       az = dxk*dyi - dyk*dxi ;
1811 
1812       domain.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
1813 
1814       ax *= norm ;
1815       ay *= norm ;
1816       az *= norm ;
1817 
1818       dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
1819       dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
1820       dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
1821 
1822       domain.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
1823    }
1824 }
1825 
1826 /******************************************/
1827 
1828 static inline
CalcMonotonicQRegionForElems(Domain & domain,Int_t r,Real_t vnew[],Real_t ptiny)1829 void CalcMonotonicQRegionForElems(Domain &domain, Int_t r,
1830                                   Real_t vnew[], Real_t ptiny)
1831 {
1832    Real_t monoq_limiter_mult = domain.monoq_limiter_mult();
1833    Real_t monoq_max_slope = domain.monoq_max_slope();
1834    Real_t qlc_monoq = domain.qlc_monoq();
1835    Real_t qqc_monoq = domain.qqc_monoq();
1836 
1837 #ifdef LULESH_USE_OPENMP
1838 #pragma omp parallel for firstprivate(qlc_monoq, qqc_monoq, monoq_limiter_mult, monoq_max_slope, ptiny)
1839 #endif
1840    for ( Index_t ielem = 0 ; ielem < domain.regElemSize(r); ++ielem ) {
1841       Index_t i = domain.regElemlist(r,ielem);
1842       Real_t qlin, qquad ;
1843       Real_t phixi, phieta, phizeta ;
1844       Int_t bcMask = domain.elemBC(i) ;
1845       Real_t delvm = 0.0, delvp =0.0;
1846 
1847       /*  phixi     */
1848       Real_t norm = Real_t(1.) / (domain.delv_xi(i)+ ptiny ) ;
1849 
1850       switch (bcMask & XI_M) {
1851          case XI_M_COMM: /* needs comm data */
1852          case 0:         delvm = domain.delv_xi(domain.lxim(i)); break ;
1853          case XI_M_SYMM: delvm = domain.delv_xi(i) ;       break ;
1854          case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
1855          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1856                                    __FILE__, __LINE__);
1857             delvm = 0; /* ERROR - but quiets the compiler */
1858             break;
1859       }
1860       switch (bcMask & XI_P) {
1861          case XI_P_COMM: /* needs comm data */
1862          case 0:         delvp = domain.delv_xi(domain.lxip(i)) ; break ;
1863          case XI_P_SYMM: delvp = domain.delv_xi(i) ;       break ;
1864          case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
1865          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1866                                    __FILE__, __LINE__);
1867             delvp = 0; /* ERROR - but quiets the compiler */
1868             break;
1869       }
1870 
1871       delvm = delvm * norm ;
1872       delvp = delvp * norm ;
1873 
1874       phixi = Real_t(.5) * ( delvm + delvp ) ;
1875 
1876       delvm *= monoq_limiter_mult ;
1877       delvp *= monoq_limiter_mult ;
1878 
1879       if ( delvm < phixi ) phixi = delvm ;
1880       if ( delvp < phixi ) phixi = delvp ;
1881       if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
1882       if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
1883 
1884 
1885       /*  phieta     */
1886       norm = Real_t(1.) / ( domain.delv_eta(i) + ptiny ) ;
1887 
1888       switch (bcMask & ETA_M) {
1889          case ETA_M_COMM: /* needs comm data */
1890          case 0:          delvm = domain.delv_eta(domain.letam(i)) ; break ;
1891          case ETA_M_SYMM: delvm = domain.delv_eta(i) ;        break ;
1892          case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
1893          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1894                                    __FILE__, __LINE__);
1895             delvm = 0; /* ERROR - but quiets the compiler */
1896             break;
1897       }
1898       switch (bcMask & ETA_P) {
1899          case ETA_P_COMM: /* needs comm data */
1900          case 0:          delvp = domain.delv_eta(domain.letap(i)) ; break ;
1901          case ETA_P_SYMM: delvp = domain.delv_eta(i) ;        break ;
1902          case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
1903          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1904                                    __FILE__, __LINE__);
1905             delvp = 0; /* ERROR - but quiets the compiler */
1906             break;
1907       }
1908 
1909       delvm = delvm * norm ;
1910       delvp = delvp * norm ;
1911 
1912       phieta = Real_t(.5) * ( delvm + delvp ) ;
1913 
1914       delvm *= monoq_limiter_mult ;
1915       delvp *= monoq_limiter_mult ;
1916 
1917       if ( delvm  < phieta ) phieta = delvm ;
1918       if ( delvp  < phieta ) phieta = delvp ;
1919       if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
1920       if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
1921 
1922       /*  phizeta     */
1923       norm = Real_t(1.) / ( domain.delv_zeta(i) + ptiny ) ;
1924 
1925       switch (bcMask & ZETA_M) {
1926          case ZETA_M_COMM: /* needs comm data */
1927          case 0:           delvm = domain.delv_zeta(domain.lzetam(i)) ; break ;
1928          case ZETA_M_SYMM: delvm = domain.delv_zeta(i) ;         break ;
1929          case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
1930          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1931                                    __FILE__, __LINE__);
1932             delvm = 0; /* ERROR - but quiets the compiler */
1933             break;
1934       }
1935       switch (bcMask & ZETA_P) {
1936          case ZETA_P_COMM: /* needs comm data */
1937          case 0:           delvp = domain.delv_zeta(domain.lzetap(i)) ; break ;
1938          case ZETA_P_SYMM: delvp = domain.delv_zeta(i) ;         break ;
1939          case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
1940          default:          fprintf(stderr, "Error in switch at %s line %d\n",
1941                                    __FILE__, __LINE__);
1942             delvp = 0; /* ERROR - but quiets the compiler */
1943             break;
1944       }
1945 
1946       delvm = delvm * norm ;
1947       delvp = delvp * norm ;
1948 
1949       phizeta = Real_t(.5) * ( delvm + delvp ) ;
1950 
1951       delvm *= monoq_limiter_mult ;
1952       delvp *= monoq_limiter_mult ;
1953 
1954       if ( delvm   < phizeta ) phizeta = delvm ;
1955       if ( delvp   < phizeta ) phizeta = delvp ;
1956       if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
1957       if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
1958 
1959       /* Remove length scale */
1960 
1961       if ( domain.vdov(i) > Real_t(0.) )  {
1962          qlin  = Real_t(0.) ;
1963          qquad = Real_t(0.) ;
1964       }
1965       else {
1966          Real_t delvxxi   = domain.delv_xi(i)   * domain.delx_xi(i)   ;
1967          Real_t delvxeta  = domain.delv_eta(i)  * domain.delx_eta(i)  ;
1968          Real_t delvxzeta = domain.delv_zeta(i) * domain.delx_zeta(i) ;
1969 
1970          if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
1971          if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
1972          if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
1973 
1974          Real_t rho = domain.elemMass(i) / (domain.volo(i) * vnew[i]) ;
1975 
1976          qlin = -qlc_monoq * rho *
1977             (  delvxxi   * (Real_t(1.) - phixi) +
1978                delvxeta  * (Real_t(1.) - phieta) +
1979                delvxzeta * (Real_t(1.) - phizeta)  ) ;
1980 
1981          qquad = qqc_monoq * rho *
1982             (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
1983                delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
1984                delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
1985       }
1986 
1987       domain.qq(i) = qquad ;
1988       domain.ql(i) = qlin  ;
1989    }
1990 }
1991 
1992 /******************************************/
1993 
1994 static inline
CalcMonotonicQForElems(Domain & domain,Real_t vnew[])1995 void CalcMonotonicQForElems(Domain& domain, Real_t vnew[])
1996 {
1997    //
1998    // initialize parameters
1999    //
2000    const Real_t ptiny = Real_t(1.e-36) ;
2001 
2002    //
2003    // calculate the monotonic q for all regions
2004    //
2005    for (Index_t r=0 ; r<domain.numReg() ; ++r) {
2006 
2007       if (domain.regElemSize(r) > 0) {
2008          CalcMonotonicQRegionForElems(domain, r, vnew, ptiny) ;
2009       }
2010    }
2011 }
2012 
2013 /******************************************/
2014 
2015 static inline
CalcQForElems(Domain & domain,Real_t vnew[])2016 void CalcQForElems(Domain& domain, Real_t vnew[])
2017 {
2018    //
2019    // MONOTONIC Q option
2020    //
2021 
2022    Index_t numElem = domain.numElem() ;
2023 
2024    if (numElem != 0) {
2025       Int_t allElem = numElem +  /* local elem */
2026             2*domain.sizeX()*domain.sizeY() + /* plane ghosts */
2027             2*domain.sizeX()*domain.sizeZ() + /* row ghosts */
2028             2*domain.sizeY()*domain.sizeZ() ; /* col ghosts */
2029 
2030       domain.AllocateGradients(numElem, allElem);
2031 
2032 #if USE_MPI
2033       CommRecv(domain, MSG_MONOQ, 3,
2034                domain.sizeX(), domain.sizeY(), domain.sizeZ(),
2035                true, true) ;
2036 #endif
2037 
2038       /* Calculate velocity gradients */
2039       CalcMonotonicQGradientsForElems(domain, vnew);
2040 
2041 #if USE_MPI
2042       Domain_member fieldData[3] ;
2043 
2044       /* Transfer veloctiy gradients in the first order elements */
2045       /* problem->commElements->Transfer(CommElements::monoQ) ; */
2046 
2047       fieldData[0] = &Domain::delv_xi ;
2048       fieldData[1] = &Domain::delv_eta ;
2049       fieldData[2] = &Domain::delv_zeta ;
2050 
2051       CommSend(domain, MSG_MONOQ, 3, fieldData,
2052                domain.sizeX(), domain.sizeY(), domain.sizeZ(),
2053                true, true) ;
2054 
2055       CommMonoQ(domain) ;
2056 #endif
2057 
2058       CalcMonotonicQForElems(domain, vnew) ;
2059 
2060       // Free up memory
2061       domain.DeallocateGradients();
2062 
2063       /* Don't allow excessive artificial viscosity */
2064       Index_t idx = -1;
2065       for (Index_t i=0; i<numElem; ++i) {
2066          if ( domain.q(i) > domain.qstop() ) {
2067             idx = i ;
2068             break ;
2069          }
2070       }
2071 
2072       if(idx >= 0) {
2073 #if USE_MPI
2074          MPI_Abort(MPI_COMM_WORLD, QStopError) ;
2075 #else
2076          exit(QStopError);
2077 #endif
2078       }
2079    }
2080 }
2081 
2082 /******************************************/
2083 
2084 static inline
CalcPressureForElems(Real_t * p_new,Real_t * bvc,Real_t * pbvc,Real_t * e_old,Real_t * compression,Real_t * vnewc,Real_t pmin,Real_t p_cut,Real_t eosvmax,Index_t length,Index_t * regElemList)2085 void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
2086                           Real_t* pbvc, Real_t* e_old,
2087                           Real_t* compression, Real_t *vnewc,
2088                           Real_t pmin,
2089                           Real_t p_cut, Real_t eosvmax,
2090                           Index_t length, Index_t *regElemList)
2091 {
2092 #ifdef LULESH_USE_OPENMP
2093 #pragma omp parallel for firstprivate(length)
2094 #endif
2095    for (Index_t i = 0; i < length ; ++i) {
2096       Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
2097       bvc[i] = c1s * (compression[i] + Real_t(1.));
2098       pbvc[i] = c1s;
2099    }
2100 #ifdef LULESH_USE_OPENMP
2101 #pragma omp parallel for firstprivate(length, pmin, p_cut, eosvmax)
2102 #endif
2103    for (Index_t i = 0 ; i < length ; ++i){
2104       Index_t elem = regElemList[i];
2105 
2106       p_new[i] = bvc[i] * e_old[i] ;
2107 
2108       if    (FABS(p_new[i]) <  p_cut   )
2109          p_new[i] = Real_t(0.0) ;
2110 
2111       if    ( vnewc[elem] >= eosvmax ) /* impossible condition here? */
2112          p_new[i] = Real_t(0.0) ;
2113 
2114       if    (p_new[i]       <  pmin)
2115          p_new[i]   = pmin ;
2116    }
2117 }
2118 
2119 /******************************************/
2120 
2121 static inline
CalcEnergyForElems(Real_t * p_new,Real_t * e_new,Real_t * q_new,Real_t * bvc,Real_t * pbvc,Real_t * p_old,Real_t * e_old,Real_t * q_old,Real_t * compression,Real_t * compHalfStep,Real_t * vnewc,Real_t * work,Real_t * delvc,Real_t pmin,Real_t p_cut,Real_t e_cut,Real_t q_cut,Real_t emin,Real_t * qq_old,Real_t * ql_old,Real_t rho0,Real_t eosvmax,Index_t length,Index_t * regElemList)2122 void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
2123                         Real_t* bvc, Real_t* pbvc,
2124                         Real_t* p_old, Real_t* e_old, Real_t* q_old,
2125                         Real_t* compression, Real_t* compHalfStep,
2126                         Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
2127                         Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
2128                         Real_t* qq_old, Real_t* ql_old,
2129                         Real_t rho0,
2130                         Real_t eosvmax,
2131                         Index_t length, Index_t *regElemList)
2132 {
2133    Real_t *pHalfStep = Allocate<Real_t>(length) ;
2134 #ifdef LULESH_USE_OPENMP
2135 #pragma omp parallel for firstprivate(length, emin)
2136 #endif
2137    for (Index_t i = 0 ; i < length ; ++i) {
2138       e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
2139          + Real_t(0.5) * work[i];
2140 
2141       if (e_new[i]  < emin ) {
2142          e_new[i] = emin ;
2143       }
2144    }
2145 
2146    CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
2147                         pmin, p_cut, eosvmax, length, regElemList);
2148 
2149 #ifdef LULESH_USE_OPENMP
2150 #pragma omp parallel for firstprivate(length, rho0)
2151 #endif
2152    for (Index_t i = 0 ; i < length ; ++i) {
2153       Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
2154 
2155       if ( delvc[i] > Real_t(0.) ) {
2156          q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
2157       }
2158       else {
2159          Real_t ssc = ( pbvc[i] * e_new[i]
2160                  + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
2161 
2162          if ( ssc <= Real_t(.1111111e-36) ) {
2163             ssc = Real_t(.3333333e-18) ;
2164          } else {
2165             ssc = SQRT(ssc) ;
2166          }
2167 
2168          q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
2169       }
2170 
2171       e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
2172          * (  Real_t(3.0)*(p_old[i]     + q_old[i])
2173               - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
2174    }
2175 
2176 #ifdef LULESH_USE_OPENMP
2177 #pragma omp parallel for firstprivate(length, emin, e_cut)
2178 #endif
2179    for (Index_t i = 0 ; i < length ; ++i) {
2180 
2181       e_new[i] += Real_t(0.5) * work[i];
2182 
2183       if (FABS(e_new[i]) < e_cut) {
2184          e_new[i] = Real_t(0.)  ;
2185       }
2186       if (     e_new[i]  < emin ) {
2187          e_new[i] = emin ;
2188       }
2189    }
2190 
2191    CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
2192                         pmin, p_cut, eosvmax, length, regElemList);
2193 
2194 #ifdef LULESH_USE_OPENMP
2195 #pragma omp parallel for firstprivate(length, rho0, emin, e_cut)
2196 #endif
2197    for (Index_t i = 0 ; i < length ; ++i){
2198       const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
2199       Index_t elem = regElemList[i];
2200       Real_t q_tilde ;
2201 
2202       if (delvc[i] > Real_t(0.)) {
2203          q_tilde = Real_t(0.) ;
2204       }
2205       else {
2206          Real_t ssc = ( pbvc[i] * e_new[i]
2207                  + vnewc[elem] * vnewc[elem] * bvc[i] * p_new[i] ) / rho0 ;
2208 
2209          if ( ssc <= Real_t(.1111111e-36) ) {
2210             ssc = Real_t(.3333333e-18) ;
2211          } else {
2212             ssc = SQRT(ssc) ;
2213          }
2214 
2215          q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
2216       }
2217 
2218       e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
2219                                - Real_t(8.0)*(pHalfStep[i] + q_new[i])
2220                                + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
2221 
2222       if (FABS(e_new[i]) < e_cut) {
2223          e_new[i] = Real_t(0.)  ;
2224       }
2225       if (     e_new[i]  < emin ) {
2226          e_new[i] = emin ;
2227       }
2228    }
2229 
2230    CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
2231                         pmin, p_cut, eosvmax, length, regElemList);
2232 
2233 #ifdef LULESH_USE_OPENMP
2234 #pragma omp parallel for firstprivate(length, rho0, q_cut)
2235 #endif
2236    for (Index_t i = 0 ; i < length ; ++i){
2237       Index_t elem = regElemList[i];
2238 
2239       if ( delvc[i] <= Real_t(0.) ) {
2240          Real_t ssc = ( pbvc[i] * e_new[i]
2241                  + vnewc[elem] * vnewc[elem] * bvc[i] * p_new[i] ) / rho0 ;
2242 
2243          if ( ssc <= Real_t(.1111111e-36) ) {
2244             ssc = Real_t(.3333333e-18) ;
2245          } else {
2246             ssc = SQRT(ssc) ;
2247          }
2248 
2249          q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
2250 
2251          if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
2252       }
2253    }
2254 
2255    Release(&pHalfStep) ;
2256 
2257    return ;
2258 }
2259 
2260 /******************************************/
2261 
2262 static inline
CalcSoundSpeedForElems(Domain & domain,Real_t * vnewc,Real_t rho0,Real_t * enewc,Real_t * pnewc,Real_t * pbvc,Real_t * bvc,Real_t ss4o3,Index_t len,Index_t * regElemList)2263 void CalcSoundSpeedForElems(Domain &domain,
2264                             Real_t *vnewc, Real_t rho0, Real_t *enewc,
2265                             Real_t *pnewc, Real_t *pbvc,
2266                             Real_t *bvc, Real_t ss4o3,
2267                             Index_t len, Index_t *regElemList)
2268 {
2269 #ifdef LULESH_USE_OPENMP
2270 #pragma omp parallel for firstprivate(rho0, ss4o3)
2271 #endif
2272    for (Index_t i = 0; i < len ; ++i) {
2273       Index_t elem = regElemList[i];
2274       Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[elem] * vnewc[elem] *
2275                  bvc[i] * pnewc[i]) / rho0;
2276       if (ssTmp <= Real_t(.1111111e-36)) {
2277          ssTmp = Real_t(.3333333e-18);
2278       }
2279       else {
2280          ssTmp = SQRT(ssTmp);
2281       }
2282       domain.ss(elem) = ssTmp ;
2283    }
2284 }
2285 
2286 /******************************************/
2287 
2288 static inline
EvalEOSForElems(Domain & domain,Real_t * vnewc,Int_t numElemReg,Index_t * regElemList,Int_t rep)2289 void EvalEOSForElems(Domain& domain, Real_t *vnewc,
2290                      Int_t numElemReg, Index_t *regElemList, Int_t rep)
2291 {
2292    Real_t  e_cut = domain.e_cut() ;
2293    Real_t  p_cut = domain.p_cut() ;
2294    Real_t  ss4o3 = domain.ss4o3() ;
2295    Real_t  q_cut = domain.q_cut() ;
2296 
2297    Real_t eosvmax = domain.eosvmax() ;
2298    Real_t eosvmin = domain.eosvmin() ;
2299    Real_t pmin    = domain.pmin() ;
2300    Real_t emin    = domain.emin() ;
2301    Real_t rho0    = domain.refdens() ;
2302 
2303    // These temporaries will be of different size for
2304    // each call (due to different sized region element
2305    // lists)
2306    Real_t *e_old = Allocate<Real_t>(numElemReg) ;
2307    Real_t *delvc = Allocate<Real_t>(numElemReg) ;
2308    Real_t *p_old = Allocate<Real_t>(numElemReg) ;
2309    Real_t *q_old = Allocate<Real_t>(numElemReg) ;
2310    Real_t *compression = Allocate<Real_t>(numElemReg) ;
2311    Real_t *compHalfStep = Allocate<Real_t>(numElemReg) ;
2312    Real_t *qq_old = Allocate<Real_t>(numElemReg) ;
2313    Real_t *ql_old = Allocate<Real_t>(numElemReg) ;
2314    Real_t *work = Allocate<Real_t>(numElemReg) ;
2315    Real_t *p_new = Allocate<Real_t>(numElemReg) ;
2316    Real_t *e_new = Allocate<Real_t>(numElemReg) ;
2317    Real_t *q_new = Allocate<Real_t>(numElemReg) ;
2318    Real_t *bvc = Allocate<Real_t>(numElemReg) ;
2319    Real_t *pbvc = Allocate<Real_t>(numElemReg) ;
2320 
2321    //loop to add load imbalance based on region number
2322    for(Int_t j = 0; j < rep; j++) {
2323       /* compress data, minimal set */
2324 #ifdef LULESH_USE_OPENMP
2325 #pragma omp parallel
2326 #endif
2327       {
2328 #ifdef LULESH_USE_OPENMP
2329 #pragma omp for nowait firstprivate(numElemReg)
2330 #endif
2331          for (Index_t i=0; i<numElemReg; ++i) {
2332             Index_t elem = regElemList[i];
2333             e_old[i] = domain.e(elem) ;
2334             delvc[i] = domain.delv(elem) ;
2335             p_old[i] = domain.p(elem) ;
2336             q_old[i] = domain.q(elem) ;
2337             qq_old[i] = domain.qq(elem) ;
2338             ql_old[i] = domain.ql(elem) ;
2339          }
2340 
2341 #ifdef LULESH_USE_OPENMP
2342 #pragma omp for firstprivate(numElemReg)
2343 #endif
2344          for (Index_t i = 0; i < numElemReg ; ++i) {
2345             Index_t elem = regElemList[i];
2346             Real_t vchalf ;
2347             compression[i] = Real_t(1.) / vnewc[elem] - Real_t(1.);
2348             vchalf = vnewc[elem] - delvc[i] * Real_t(.5);
2349             compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
2350          }
2351 
2352       /* Check for v > eosvmax or v < eosvmin */
2353          if ( eosvmin != Real_t(0.) ) {
2354 #ifdef LULESH_USE_OPENMP
2355 #pragma omp for nowait firstprivate(numElemReg, eosvmin)
2356 #endif
2357             for(Index_t i=0 ; i<numElemReg ; ++i) {
2358                Index_t elem = regElemList[i];
2359                if (vnewc[elem] <= eosvmin) { /* impossible due to calling func? */
2360                   compHalfStep[i] = compression[i] ;
2361                }
2362             }
2363          }
2364          if ( eosvmax != Real_t(0.) ) {
2365 #ifdef LULESH_USE_OPENMP
2366 #pragma omp for nowait firstprivate(numElemReg, eosvmax)
2367 #endif
2368             for(Index_t i=0 ; i<numElemReg ; ++i) {
2369                Index_t elem = regElemList[i];
2370                if (vnewc[elem] >= eosvmax) { /* impossible due to calling func? */
2371                   p_old[i]        = Real_t(0.) ;
2372                   compression[i]  = Real_t(0.) ;
2373                   compHalfStep[i] = Real_t(0.) ;
2374                }
2375             }
2376          }
2377 
2378 #ifdef LULESH_USE_OPENMP
2379 #pragma omp for nowait firstprivate(numElemReg)
2380 #endif
2381          for (Index_t i = 0 ; i < numElemReg ; ++i) {
2382             work[i] = Real_t(0.) ;
2383          }
2384       }
2385       CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
2386                          p_old, e_old,  q_old, compression, compHalfStep,
2387                          vnewc, work,  delvc, pmin,
2388                          p_cut, e_cut, q_cut, emin,
2389                          qq_old, ql_old, rho0, eosvmax,
2390                          numElemReg, regElemList);
2391    }
2392 
2393 #ifdef LULESH_USE_OPENMP
2394 #pragma omp parallel for firstprivate(numElemReg)
2395 #endif
2396    for (Index_t i=0; i<numElemReg; ++i) {
2397       Index_t elem = regElemList[i];
2398       domain.p(elem) = p_new[i] ;
2399       domain.e(elem) = e_new[i] ;
2400       domain.q(elem) = q_new[i] ;
2401    }
2402 
2403    CalcSoundSpeedForElems(domain,
2404                           vnewc, rho0, e_new, p_new,
2405                           pbvc, bvc, ss4o3,
2406                           numElemReg, regElemList) ;
2407 
2408    Release(&pbvc) ;
2409    Release(&bvc) ;
2410    Release(&q_new) ;
2411    Release(&e_new) ;
2412    Release(&p_new) ;
2413    Release(&work) ;
2414    Release(&ql_old) ;
2415    Release(&qq_old) ;
2416    Release(&compHalfStep) ;
2417    Release(&compression) ;
2418    Release(&q_old) ;
2419    Release(&p_old) ;
2420    Release(&delvc) ;
2421    Release(&e_old) ;
2422 }
2423 
2424 /******************************************/
2425 
2426 static inline
ApplyMaterialPropertiesForElems(Domain & domain,Real_t vnew[])2427 void ApplyMaterialPropertiesForElems(Domain& domain, Real_t vnew[])
2428 {
2429    Index_t numElem = domain.numElem() ;
2430 
2431   if (numElem != 0) {
2432     /* Expose all of the variables needed for material evaluation */
2433     Real_t eosvmin = domain.eosvmin() ;
2434     Real_t eosvmax = domain.eosvmax() ;
2435 
2436 #ifdef LULESH_USE_OPENMP
2437 #pragma omp parallel
2438 #endif
2439     {
2440        // Bound the updated relative volumes with eosvmin/max
2441        if (eosvmin != Real_t(0.)) {
2442 #ifdef LULESH_USE_OPENMP
2443 #pragma omp for firstprivate(numElem)
2444 #endif
2445           for(Index_t i=0 ; i<numElem ; ++i) {
2446              if (vnew[i] < eosvmin)
2447                 vnew[i] = eosvmin ;
2448           }
2449        }
2450 
2451        if (eosvmax != Real_t(0.)) {
2452 #ifdef LULESH_USE_OPENMP
2453 #pragma omp for nowait firstprivate(numElem)
2454 #endif
2455           for(Index_t i=0 ; i<numElem ; ++i) {
2456              if (vnew[i] > eosvmax)
2457                 vnew[i] = eosvmax ;
2458           }
2459        }
2460 
2461        // This check may not make perfect sense in LULESH, but
2462        // it's representative of something in the full code -
2463        // just leave it in, please
2464 #ifdef LULESH_USE_OPENMP
2465 #pragma omp for nowait firstprivate(numElem)
2466 #endif
2467        for (Index_t i=0; i<numElem; ++i) {
2468           Real_t vc = domain.v(i) ;
2469           if (eosvmin != Real_t(0.)) {
2470              if (vc < eosvmin)
2471                 vc = eosvmin ;
2472           }
2473           if (eosvmax != Real_t(0.)) {
2474              if (vc > eosvmax)
2475                 vc = eosvmax ;
2476           }
2477           if (vc <= 0.) {
2478 #if USE_MPI
2479              MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
2480 #else
2481              exit(VolumeError);
2482 #endif
2483           }
2484        }
2485     }
2486 
2487     for (Int_t r=0 ; r<domain.numReg() ; r++) {
2488        Index_t numElemReg = domain.regElemSize(r);
2489        Index_t *regElemList = domain.regElemlist(r);
2490        Int_t rep;
2491        //Determine load imbalance for this region
2492        //round down the number with lowest cost
2493        if(r < domain.numReg()/2)
2494 	 rep = 1;
2495        //you don't get an expensive region unless you at least have 5 regions
2496        else if(r < (domain.numReg() - (domain.numReg()+15)/20))
2497          rep = 1 + domain.cost();
2498        //very expensive regions
2499        else
2500 	 rep = 10 * (1+ domain.cost());
2501        EvalEOSForElems(domain, vnew, numElemReg, regElemList, rep);
2502     }
2503 
2504   }
2505 }
2506 
2507 /******************************************/
2508 
2509 static inline
UpdateVolumesForElems(Domain & domain,Real_t * vnew,Real_t v_cut,Index_t length)2510 void UpdateVolumesForElems(Domain &domain, Real_t *vnew,
2511                            Real_t v_cut, Index_t length)
2512 {
2513    if (length != 0) {
2514 #ifdef LULESH_USE_OPENMP
2515 #pragma omp parallel for firstprivate(length, v_cut)
2516 #endif
2517       for(Index_t i=0 ; i<length ; ++i) {
2518          Real_t tmpV = vnew[i] ;
2519 
2520          if ( FABS(tmpV - Real_t(1.0)) < v_cut )
2521             tmpV = Real_t(1.0) ;
2522 
2523          domain.v(i) = tmpV ;
2524       }
2525    }
2526 
2527    return ;
2528 }
2529 
2530 /******************************************/
2531 
2532 static inline
LagrangeElements(Domain & domain,Index_t numElem)2533 void LagrangeElements(Domain& domain, Index_t numElem)
2534 {
2535   Real_t *vnew = Allocate<Real_t>(numElem) ;  /* new relative vol -- temp */
2536 
2537   CalcLagrangeElements(domain, vnew) ;
2538 
2539   /* Calculate Q.  (Monotonic q option requires communication) */
2540   CalcQForElems(domain, vnew) ;
2541 
2542   ApplyMaterialPropertiesForElems(domain, vnew) ;
2543 
2544   UpdateVolumesForElems(domain, vnew,
2545                         domain.v_cut(), numElem) ;
2546 
2547   Release(&vnew);
2548 }
2549 
2550 /******************************************/
2551 
2552 static inline
CalcCourantConstraintForElems(Domain & domain,Index_t length,Index_t * regElemlist,Real_t qqc,Real_t & dtcourant)2553 void CalcCourantConstraintForElems(Domain &domain, Index_t length,
2554                                    Index_t *regElemlist,
2555                                    Real_t qqc, Real_t& dtcourant)
2556 {
2557 #if LULESH_USE_OPENMP
2558    Index_t threads = omp_get_max_threads();
2559    static Index_t *courant_elem_per_thread;
2560    static Real_t *dtcourant_per_thread;
2561    static bool first = true;
2562    if (first) {
2563      courant_elem_per_thread = new Index_t[threads];
2564      dtcourant_per_thread = new Real_t[threads];
2565      first = false;
2566    }
2567 #else
2568    Index_t threads = 1;
2569    Index_t courant_elem_per_thread[1];
2570    Real_t  dtcourant_per_thread[1];
2571 #endif
2572 
2573 
2574 #ifdef LULESH_USE_OPENMP
2575 #pragma omp parallel firstprivate(length, qqc)
2576 #endif
2577    {
2578       Real_t   qqc2 = Real_t(64.0) * qqc * qqc ;
2579       Real_t   dtcourant_tmp = dtcourant;
2580       Index_t  courant_elem  = -1 ;
2581 
2582 #if LULESH_USE_OPENMP
2583       Index_t thread_num = omp_get_thread_num();
2584 #else
2585       Index_t thread_num = 0;
2586 #endif
2587 
2588 #ifdef LULESH_USE_OPENMP
2589 #pragma omp for
2590 #endif
2591       for (Index_t i = 0 ; i < length ; ++i) {
2592          Index_t indx = regElemlist[i] ;
2593          Real_t dtf = domain.ss(indx) * domain.ss(indx) ;
2594 
2595          if ( domain.vdov(indx) < Real_t(0.) ) {
2596             dtf = dtf
2597                 + qqc2 * domain.arealg(indx) * domain.arealg(indx)
2598                 * domain.vdov(indx) * domain.vdov(indx) ;
2599          }
2600 
2601          dtf = SQRT(dtf) ;
2602          dtf = domain.arealg(indx) / dtf ;
2603 
2604          if (domain.vdov(indx) != Real_t(0.)) {
2605             if ( dtf < dtcourant_tmp ) {
2606                dtcourant_tmp = dtf ;
2607                courant_elem  = indx ;
2608             }
2609          }
2610       }
2611 
2612       dtcourant_per_thread[thread_num]    = dtcourant_tmp ;
2613       courant_elem_per_thread[thread_num] = courant_elem ;
2614    }
2615 
2616    for (Index_t i = 1; i < threads; ++i) {
2617       if (dtcourant_per_thread[i] < dtcourant_per_thread[0] ) {
2618          dtcourant_per_thread[0]    = dtcourant_per_thread[i];
2619          courant_elem_per_thread[0] = courant_elem_per_thread[i];
2620       }
2621    }
2622 
2623    if (courant_elem_per_thread[0] != -1) {
2624       dtcourant = dtcourant_per_thread[0] ;
2625    }
2626 
2627    return ;
2628 
2629 }
2630 
2631 /******************************************/
2632 
2633 static inline
CalcHydroConstraintForElems(Domain & domain,Index_t length,Index_t * regElemlist,Real_t dvovmax,Real_t & dthydro)2634 void CalcHydroConstraintForElems(Domain &domain, Index_t length,
2635                                  Index_t *regElemlist, Real_t dvovmax, Real_t& dthydro)
2636 {
2637 #if LULESH_USE_OPENMP
2638    Index_t threads = omp_get_max_threads();
2639    static Index_t *hydro_elem_per_thread;
2640    static Real_t *dthydro_per_thread;
2641    static bool first = true;
2642    if (first) {
2643      hydro_elem_per_thread = new Index_t[threads];
2644      dthydro_per_thread = new Real_t[threads];
2645      first = false;
2646    }
2647 #else
2648    Index_t threads = 1;
2649    Index_t hydro_elem_per_thread[1];
2650    Real_t  dthydro_per_thread[1];
2651 #endif
2652 
2653 #ifdef LULESH_USE_OPENMP
2654 #pragma omp parallel firstprivate(length, dvovmax)
2655 #endif
2656    {
2657       Real_t dthydro_tmp = dthydro ;
2658       Index_t hydro_elem = -1 ;
2659 
2660 #if LULESH_USE_OPENMP
2661       Index_t thread_num = omp_get_thread_num();
2662 #else
2663       Index_t thread_num = 0;
2664 #endif
2665 
2666 
2667 #ifdef LULESH_USE_OPENMP
2668 #pragma omp for
2669 #endif
2670       for (Index_t i = 0 ; i < length ; ++i) {
2671          Index_t indx = regElemlist[i] ;
2672 
2673          if (domain.vdov(indx) != Real_t(0.)) {
2674             Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx))+Real_t(1.e-20)) ;
2675 
2676             if ( dthydro_tmp > dtdvov ) {
2677                   dthydro_tmp = dtdvov ;
2678                   hydro_elem = indx ;
2679             }
2680          }
2681       }
2682 
2683       dthydro_per_thread[thread_num]    = dthydro_tmp ;
2684       hydro_elem_per_thread[thread_num] = hydro_elem ;
2685    }
2686 
2687    for (Index_t i = 1; i < threads; ++i) {
2688       if(dthydro_per_thread[i] < dthydro_per_thread[0]) {
2689          dthydro_per_thread[0]    = dthydro_per_thread[i];
2690          hydro_elem_per_thread[0] =  hydro_elem_per_thread[i];
2691       }
2692    }
2693 
2694    if (hydro_elem_per_thread[0] != -1) {
2695       dthydro =  dthydro_per_thread[0] ;
2696    }
2697 
2698    return ;
2699 }
2700 
2701 /******************************************/
2702 
2703 static inline
CalcTimeConstraintsForElems(Domain & domain)2704 void CalcTimeConstraintsForElems(Domain& domain) {
2705 
2706    // Initialize conditions to a very large value
2707    domain.dtcourant() = 1.0e+20;
2708    domain.dthydro() = 1.0e+20;
2709 
2710    for (Index_t r=0 ; r < domain.numReg() ; ++r) {
2711       /* evaluate time constraint */
2712       CalcCourantConstraintForElems(domain, domain.regElemSize(r),
2713                                     domain.regElemlist(r),
2714                                     domain.qqc(),
2715                                     domain.dtcourant()) ;
2716 
2717       /* check hydro constraint */
2718       CalcHydroConstraintForElems(domain, domain.regElemSize(r),
2719                                   domain.regElemlist(r),
2720                                   domain.dvovmax(),
2721                                   domain.dthydro()) ;
2722    }
2723 }
2724 
2725 /******************************************/
2726 
2727 static inline
LagrangeLeapFrog(Domain & domain)2728 void LagrangeLeapFrog(Domain& domain)
2729 {
2730 #ifdef SEDOV_SYNC_POS_VEL_LATE
2731    Domain_member fieldData[6] ;
2732 #endif
2733 
2734    /* calculate nodal forces, accelerations, velocities, positions, with
2735     * applied boundary conditions and slide surface considerations */
2736    LagrangeNodal(domain);
2737 
2738 
2739 #ifdef SEDOV_SYNC_POS_VEL_LATE
2740 #endif
2741 
2742    /* calculate element quantities (i.e. velocity gradient & q), and update
2743     * material states */
2744    LagrangeElements(domain, domain.numElem());
2745 
2746 #if USE_MPI
2747 #ifdef SEDOV_SYNC_POS_VEL_LATE
2748    CommRecv(domain, MSG_SYNC_POS_VEL, 6,
2749             domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
2750             false, false) ;
2751 
2752    fieldData[0] = &Domain::x ;
2753    fieldData[1] = &Domain::y ;
2754    fieldData[2] = &Domain::z ;
2755    fieldData[3] = &Domain::xd ;
2756    fieldData[4] = &Domain::yd ;
2757    fieldData[5] = &Domain::zd ;
2758 
2759    CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData,
2760             domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
2761             false, false) ;
2762 #endif
2763 #endif
2764 
2765    CalcTimeConstraintsForElems(domain);
2766 
2767 #if USE_MPI
2768 #ifdef SEDOV_SYNC_POS_VEL_LATE
2769    CommSyncPosVel(domain) ;
2770 #endif
2771 #endif
2772 }
2773 
2774 
2775 /******************************************/
2776 
main(int argc,char * argv[])2777 int main(int argc, char *argv[])
2778 {
2779   Domain *locDom ;
2780    Int_t numRanks ;
2781    Int_t myRank ;
2782    struct cmdLineOpts opts;
2783 
2784 #if USE_MPI
2785    Domain_member fieldData ;
2786 
2787    MPI_Init(&argc, &argv) ;
2788    MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
2789    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
2790 #else
2791    numRanks = 1;
2792    myRank = 0;
2793 #endif
2794 
2795    /* Set defaults that can be overridden by command line opts */
2796    opts.its = 9999999;
2797    opts.nx  = 30;
2798    opts.numReg = 11;
2799    opts.numFiles = (int)(numRanks+10)/9;
2800    opts.showProg = 0;
2801    opts.quiet = 0;
2802    opts.viz = 0;
2803    opts.balance = 1;
2804    opts.cost = 1;
2805 
2806    ParseCommandLineOptions(argc, argv, myRank, &opts);
2807 
2808    if ((myRank == 0) && (opts.quiet == 0)) {
2809       printf("Running problem size %d^3 per domain until completion\n", opts.nx);
2810       printf("Num processors: %d\n", numRanks);
2811 #if LULESH_USE_OPENMP
2812       printf("Num threads: %d\n", omp_get_max_threads());
2813 #endif
2814       printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
2815       printf("To run other sizes, use -s <integer>.\n");
2816       printf("To run a fixed number of iterations, use -i <integer>.\n");
2817       printf("To run a more or less balanced region set, use -b <integer>.\n");
2818       printf("To change the relative costs of regions, use -c <integer>.\n");
2819       printf("To print out progress, use -p\n");
2820       printf("To write an output file for VisIt, use -v\n");
2821       printf("See help (-h) for more options\n\n");
2822    }
2823 
2824    // Set up the mesh and decompose. Assumes regular cubes for now
2825    Int_t col, row, plane, side;
2826    InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
2827 
2828    // Build the main data structure and initialize it
2829    locDom = new Domain(numRanks, myRank, col, row, plane, opts.nx,
2830                        side, opts.numReg, opts.balance, opts.cost) ;
2831 
2832 
2833 #if USE_MPI
2834    fieldData = &Domain::nodalMass ;
2835 
2836    // Initial domain boundary communication
2837    CommRecv(*locDom, MSG_COMM_SBN, 1,
2838             locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
2839             true, false) ;
2840    CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
2841             locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
2842             true, false) ;
2843    CommSBN(*locDom, 1, &fieldData) ;
2844 
2845    // End initialization
2846    MPI_Barrier(MPI_COMM_WORLD);
2847 #endif
2848 /*--------------------------------------------------------------------------
2849  *--------------------------------------------------------------------------
2850  * Begin Ascent Integration
2851  *--------------------------------------------------------------------------
2852  *--------------------------------------------------------------------------*/
2853    //
2854    // setup Ascent In-situ rendering.
2855    //
2856     Ascent ascent;
2857     Node ascent_opts;
2858 
2859 #if USE_MPI
2860     ascent_opts["mpi_comm"] = MPI_Comm_c2f(MPI_COMM_WORLD);
2861 #endif
2862 
2863     ascent.open(ascent_opts);
2864    // BEGIN timestep to solution */
2865 #if USE_MPI
2866    double start = MPI_Wtime();
2867 #else
2868    timeval start;
2869    gettimeofday(&start, NULL) ;
2870 #endif
2871 
2872 //debug to see region sizes
2873 //   for(Int_t i = 0; i < locDom->numReg(); i++)
2874 //      std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
2875 
2876    conduit::Node scenes;
2877    scenes["s1/plots/p1/type"]  = "pseudocolor";
2878    scenes["s1/plots/p1/field"] = "e";
2879    double vec3[3];
2880    vec3[0] = -0.6; vec3[1] = -0.6; vec3[2] = -0.8;
2881    scenes["s1/renders/r1/camera/position"].set_float64_ptr(vec3,3);
2882    scenes["s1/renders/r1/image_prefix"] = "lulesh_image";
2883 
2884    conduit::Node actions;
2885    conduit::Node &add_plots = actions.append();
2886    add_plots["action"] = "add_scenes";
2887    add_plots["scenes"] = scenes;
2888    conduit::Node &execute = actions.append();
2889    execute["action"] = "execute";
2890 
2891    conduit::Node &reset_action = actions.append();
2892    reset_action["action"] = "reset";
2893 
2894    while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
2895       {
2896          ASCENT_BLOCK_TIMER(LULESH_MAIN_LOOP)
2897          TimeIncrement(*locDom) ;
2898          LagrangeLeapFrog(*locDom) ;
2899       }
2900       if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
2901          printf("cycle = %d, time = %e, dt=%e\n",
2902                 locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
2903       }
2904       //-- begin calls to ascent -- //
2905       if ( (locDom->cycle() % 1 == 0) || (locDom->cycle() == 0)) {
2906          char outFileName[30];
2907          sprintf(outFileName,"lulesh_image%03d",locDom->cycle());
2908          //
2909          // Create the actions.
2910          //
2911          ascent.publish(locDom->visitNode());
2912          ascent.execute(actions);
2913       }
2914    }
2915    ascent.close();
2916 
2917    /*--------------------------------------------------------------------------
2918     *--------------------------------------------------------------------------
2919     * End Ascent Integration
2920     *--------------------------------------------------------------------------
2921     *--------------------------------------------------------------------------*/
2922 
2923    // Use reduced max elapsed time
2924    double elapsed_time;
2925 #if USE_MPI
2926    elapsed_time = MPI_Wtime() - start;
2927 #else
2928    timeval end;
2929    gettimeofday(&end, NULL) ;
2930    elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
2931 #endif
2932    double elapsed_timeG;
2933 #if USE_MPI
2934    MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
2935               MPI_MAX, 0, MPI_COMM_WORLD);
2936 #else
2937    elapsed_timeG = elapsed_time;
2938 #endif
2939 
2940 
2941 
2942    // Write out final viz file */
2943    if (opts.viz) {
2944       DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
2945    }
2946 
2947    if ((myRank == 0) && (opts.quiet == 0)) {
2948       VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
2949    }
2950 
2951 
2952 #if USE_MPI
2953    MPI_Barrier(MPI_COMM_WORLD);
2954    MPI_Finalize() ;
2955 #endif
2956 
2957    return 0 ;
2958 }
2959