1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved.                      *
3 * This file is part of the LIBXSMM library.                                   *
4 *                                                                             *
5 * For information on the license, see the LICENSE file.                       *
6 * Further information: https://github.com/hfp/libxsmm/                        *
7 * SPDX-License-Identifier: BSD-3-Clause                                       *
8 ******************************************************************************/
9 #ifndef PROXY_SEISSOL_INTEGRATORS_HPP
10 #define PROXY_SEISSOL_INTEGRATORS_HPP
11 
12 #if defined(_OPENMP)
13 # include <omp.h>
14 #endif
15 
computeAderIntegration()16 void computeAderIntegration() {
17 #ifdef _OPENMP
18 # pragma omp parallel
19   {
20 #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512
21   //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY];
22   GlobalData* l_globalData = m_globalDataArray[0];
23 #else
24   GlobalData* l_globalData = m_globalData;
25 #endif
26   #pragma omp for schedule(static)
27 #else
28   GlobalData* l_globalData = m_globalData;
29 #endif
30   for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) {
31     m_timeKernel.computeAder(              m_timeStepWidthSimulation,
32                                            l_globalData->stiffnessMatricesTransposed,
33                                            m_cells->dofs[l_cell],
34                                            m_cellData->localIntegration[l_cell].starMatrices,
35                                            m_cells->buffers[l_cell],
36                                            m_cells->derivatives[l_cell] );
37   }
38 #ifdef _OPENMP
39   }
40 #endif
41 }
42 
computeVolumeIntegration()43 void computeVolumeIntegration() {
44 #ifdef _OPENMP
45 # pragma omp parallel
46   {
47 #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512
48   //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY];
49   GlobalData* l_globalData = m_globalDataArray[0];
50 #else
51   GlobalData* l_globalData = m_globalData;
52 #endif
53   #pragma omp for schedule(static)
54 #else
55   GlobalData* l_globalData = m_globalData;
56 #endif
57   for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) {
58     m_volumeKernel.computeIntegral(        l_globalData->stiffnessMatrices,
59                                            m_cells->buffers[l_cell],
60                                            m_cellData->localIntegration[l_cell].starMatrices,
61                                            m_cells->dofs[l_cell] );
62   }
63 #ifdef _OPENMP
64   }
65 #endif
66 }
67 
computeLocalBoundaryIntegration()68 void computeLocalBoundaryIntegration() {
69 #ifdef _OPENMP
70   #pragma omp parallel
71   {
72 #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512
73   //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY];
74   GlobalData* l_globalData = m_globalDataArray[0];
75 #else
76   GlobalData* l_globalData = m_globalData;
77 #endif
78   #pragma omp for schedule(static)
79 #else
80   GlobalData* l_globalData = m_globalData;
81 #endif
82   for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) {
83     m_boundaryKernel.computeLocalIntegral( m_cellInformation[l_cell].faceTypes,
84                                            l_globalData->fluxMatrices,
85                                            m_cells->buffers[l_cell],
86                                            m_cellData->localIntegration[l_cell].nApNm1,
87 #ifdef ENABLE_STREAM_MATRIX_PREFETCH
88                                            m_cells->dofs[l_cell],
89                                            m_cells->buffers[l_cell+1],
90                                            m_cells->dofs[l_cell+1] );
91 #else
92                                            m_cells->dofs[l_cell] );
93 #endif
94   }
95 #ifdef _OPENMP
96   }
97 #endif
98 }
99 
computeLocalIntegration()100 void computeLocalIntegration() {
101 #ifdef _OPENMP
102   #pragma omp parallel
103   {
104 #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512
105   //GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY];
106   GlobalData* l_globalData = m_globalDataArray[0];
107 #else
108   GlobalData* l_globalData = m_globalData;
109 #endif
110   #pragma omp for schedule(static)
111 #else
112   GlobalData* l_globalData = m_globalData;
113 #endif
114   for( unsigned int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) {
115     m_timeKernel.computeAder(      (double)m_timeStepWidthSimulation,
116                                            l_globalData->stiffnessMatricesTransposed,
117                                            m_cells->dofs[l_cell],
118                                            m_cellData->localIntegration[l_cell].starMatrices,
119                                            m_cells->buffers[l_cell],
120                                            m_cells->derivatives[l_cell] );
121 
122     m_volumeKernel.computeIntegral(        l_globalData->stiffnessMatrices,
123                                            m_cells->buffers[l_cell],
124                                            m_cellData->localIntegration[l_cell].starMatrices,
125                                            m_cells->dofs[l_cell] );
126 
127     m_boundaryKernel.computeLocalIntegral( m_cellInformation[l_cell].faceTypes,
128                                            l_globalData->fluxMatrices,
129                                            m_cells->buffers[l_cell],
130                                            m_cellData->localIntegration[l_cell].nApNm1,
131 #ifdef ENABLE_STREAM_MATRIX_PREFETCH
132                                            m_cells->dofs[l_cell],
133                                            m_cells->buffers[l_cell+1],
134                                            m_cells->dofs[l_cell+1] );
135 #else
136                                            m_cells->dofs[l_cell] );
137 #endif
138   }
139 #ifdef _OPENMP
140   }
141 #endif
142 }
143 
computeNeighboringIntegration()144 void computeNeighboringIntegration() {
145   real  l_integrationBuffer[4][NUMBER_OF_ALIGNED_DOFS] __attribute__((aligned(4096)));
146   real *l_timeIntegrated[4];
147 #ifdef ENABLE_MATRIX_PREFETCH
148   real *l_faceNeighbors_prefetch[4];
149   real *l_fluxMatricies_prefetch[4];
150 #endif
151 
152 #ifdef _OPENMP
153 #ifdef ENABLE_MATRIX_PREFETCH
154   #pragma omp parallel private(l_integrationBuffer, l_timeIntegrated, l_faceNeighbors_prefetch, l_fluxMatricies_prefetch)
155 #else
156   #pragma omp parallel private(l_integrationBuffer, l_timeIntegrated)
157 #endif
158   {
159 #if NUMBER_OF_THREADS_PER_GLOBALDATA_COPY < 512
160   GlobalData* l_globalData = m_globalDataArray[omp_get_thread_num()/NUMBER_OF_THREADS_PER_GLOBALDATA_COPY];
161 #else
162   GlobalData* l_globalData = m_globalData;
163 #endif
164   #pragma omp for schedule(static)
165 #else
166   GlobalData* l_globalData = m_globalData;
167 #endif
168   for( int l_cell = 0; l_cell < m_cells->numberOfCells; l_cell++ ) {
169     m_timeKernel.computeIntegrals(             m_cellInformation[l_cell].ltsSetup,
170                                                m_cellInformation[l_cell].faceTypes,
171                                                m_cellInformation[l_cell].currentTime,
172                                        (double)m_timeStepWidthSimulation,
173                                                m_cells->faceNeighbors[l_cell],
174                                                l_integrationBuffer,
175                                                l_timeIntegrated );
176 
177 #ifdef ENABLE_MATRIX_PREFETCH
178 #pragma message("the current prefetch structure (flux matrices and tDOFs is tuned for higher order and shouldn't be harmful for lower orders")
179     int l_face = 1;
180     l_faceNeighbors_prefetch[0] = m_cells->faceNeighbors[l_cell][l_face];
181     l_fluxMatricies_prefetch[0] = l_globalData->fluxMatrices[4+(l_face*12)
182                                                              +(m_cellInformation[l_cell].faceRelations[l_face][0]*3)
183                                                              +(m_cellInformation[l_cell].faceRelations[l_face][1])];
184     l_face = 2;
185     l_faceNeighbors_prefetch[1] = m_cells->faceNeighbors[l_cell][l_face];
186     l_fluxMatricies_prefetch[1] = l_globalData->fluxMatrices[4+(l_face*12)
187                                                              +(m_cellInformation[l_cell].faceRelations[l_face][0]*3)
188                                                              +(m_cellInformation[l_cell].faceRelations[l_face][1])];
189     l_face = 3;
190     l_faceNeighbors_prefetch[2] = m_cells->faceNeighbors[l_cell][l_face];
191     l_fluxMatricies_prefetch[2] = l_globalData->fluxMatrices[4+(l_face*12)
192                                                              +(m_cellInformation[l_cell].faceRelations[l_face][0]*3)
193                                                              +(m_cellInformation[l_cell].faceRelations[l_face][1])];
194     l_face = 0;
195     if (l_cell < (m_cells->numberOfCells-1) ) {
196       l_faceNeighbors_prefetch[3] = m_cells->faceNeighbors[l_cell+1][l_face];
197       l_fluxMatricies_prefetch[3] = l_globalData->fluxMatrices[4+(l_face*12)
198                                                                +(m_cellInformation[l_cell+1].faceRelations[l_face][0]*3)
199                                                                +(m_cellInformation[l_cell+1].faceRelations[l_face][1])];
200     } else {
201       l_faceNeighbors_prefetch[3] = m_cells->faceNeighbors[l_cell][3];
202       l_fluxMatricies_prefetch[3] = l_globalData->fluxMatrices[4+(3*12)
203                                                                +(m_cellInformation[l_cell].faceRelations[l_face][0]*3)
204                                                                +(m_cellInformation[l_cell].faceRelations[l_face][1])];
205     }
206 #endif
207 
208     m_boundaryKernel.computeNeighborsIntegral( m_cellInformation[l_cell].faceTypes,
209                                                m_cellInformation[l_cell].faceRelations,
210                                                l_globalData->fluxMatrices,
211                                                l_timeIntegrated,
212                                                m_cellData->neighboringIntegration[l_cell].nAmNm1,
213 #ifdef ENABLE_MATRIX_PREFETCH
214                                                m_cells->dofs[l_cell],
215                                                l_faceNeighbors_prefetch,
216                                                l_fluxMatricies_prefetch );
217 #else
218                                                m_cells->dofs[l_cell]);
219 #endif
220   }
221 
222 #ifdef _OPENMP
223   }
224 #endif
225 }
226 
227 #endif /*PROXY_SEISSOL_INTEGRATORS_HPP*/
228 
229