1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2
3 #if defined(PETSC_HAVE_OPENCL)
4
PetscFEDestroy_OpenCL(PetscFE fem)5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6 {
7 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
8 PetscErrorCode ierr;
9
10 PetscFunctionBegin;
11 ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr);
12 ocl->queue_id = 0;
13 ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr);
14 ocl->ctx_id = 0;
15 ierr = PetscFree(ocl);CHKERRQ(ierr);
16 PetscFunctionReturn(0);
17 }
18
19 #define CHKSTRINGERROR(MSG) do {CHKERRQ(ierr); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, MSG);} while (0)
20 enum {LAPLACIAN = 0, ELASTICITY = 1};
21
22 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
23 /* dim Number of spatial dimensions: 2 */
24 /* N_b Number of basis functions: generated */
25 /* N_{bt} Number of total basis functions: N_b * N_{comp} */
26 /* N_q Number of quadrature points: generated */
27 /* N_{bs} Number of block cells LCM(N_b, N_q) */
28 /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
29 /* N_{bl} Number of concurrent blocks generated */
30 /* N_t Number of threads: N_{bl} * N_{bs} */
31 /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
32 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
33 /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
34 /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
35 /* N_{cb} Number of serial cell batches: input */
36 /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
PetscFEOpenCLGenerateIntegrationCode(PetscFE fem,char ** string_buffer,PetscInt buffer_length,PetscBool useAux,PetscInt N_bl)37 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
38 {
39 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
40 PetscQuadrature q;
41 char *string_tail = *string_buffer;
42 char *end_of_buffer = *string_buffer + buffer_length;
43 char float_str[] = "float", double_str[] = "double";
44 char *numeric_str = &(float_str[0]);
45 PetscInt op = ocl->op;
46 PetscBool useField = PETSC_FALSE;
47 PetscBool useFieldDer = PETSC_TRUE;
48 PetscBool useFieldAux = useAux;
49 PetscBool useFieldDerAux= PETSC_FALSE;
50 PetscBool useF0 = PETSC_TRUE;
51 PetscBool useF1 = PETSC_TRUE;
52 const PetscReal *points, *weights;
53 PetscTabulation T;
54 PetscInt dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
55 size_t count;
56 PetscErrorCode ierr;
57
58 PetscFunctionBegin;
59 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
60 ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
61 ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr);
62 ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
63 ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
64 if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
65 N_t = N_b * N_c * N_q * N_bl;
66 /* Enable device extension for double precision */
67 if (ocl->realType == PETSC_DOUBLE) {
68 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
69 "#if defined(cl_khr_fp64)\n"
70 "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
71 "#elif defined(cl_amd_fp64)\n"
72 "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
73 "#endif\n",
74 &count);CHKSTRINGERROR("Message to short");
75 numeric_str = &(double_str[0]);
76 }
77 /* Kernel API */
78 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
79 "\n"
80 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
81 "{\n",
82 &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);CHKSTRINGERROR("Message to short");
83 /* Quadrature */
84 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
85 " /* Quadrature points\n"
86 " - (x1,y1,x2,y2,...) */\n"
87 " const %s points[%d] = {\n",
88 &count, numeric_str, N_q*dim);CHKSTRINGERROR("Message to short");
89 for (p = 0; p < N_q; ++p) {
90 for (d = 0; d < dim; ++d) {
91 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);CHKSTRINGERROR("Message to short");
92 }
93 }
94 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
95 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
96 " /* Quadrature weights\n"
97 " - (v1,v2,...) */\n"
98 " const %s weights[%d] = {\n",
99 &count, numeric_str, N_q);CHKSTRINGERROR("Message to short");
100 for (p = 0; p < N_q; ++p) {
101 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);CHKSTRINGERROR("Message to short");
102 }
103 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
104 /* Basis Functions */
105 ierr = PetscFEGetCellTabulation(fem, &T);CHKERRQ(ierr);
106 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107 " /* Nodal basis function evaluations\n"
108 " - basis component is fastest varying, the basis function, then point */\n"
109 " const %s Basis[%d] = {\n",
110 &count, numeric_str, N_q*N_b*N_c);CHKSTRINGERROR("Message to short");
111 for (p = 0; p < N_q; ++p) {
112 for (b = 0; b < N_b; ++b) {
113 for (c = 0; c < N_c; ++c) {
114 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);CHKSTRINGERROR("Message to short");
115 }
116 }
117 }
118 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
119 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
120 "\n"
121 " /* Nodal basis function derivative evaluations,\n"
122 " - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
123 " const %s%d BasisDerivatives[%d] = {\n",
124 &count, numeric_str, dim, N_q*N_b*N_c);CHKSTRINGERROR("Message to short");
125 for (p = 0; p < N_q; ++p) {
126 for (b = 0; b < N_b; ++b) {
127 for (c = 0; c < N_c; ++c) {
128 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
129 for (d = 0; d < dim; ++d) {
130 if (d > 0) {
131 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKSTRINGERROR("Message to short");
132 } else {
133 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKSTRINGERROR("Message to short");
134 }
135 }
136 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);CHKSTRINGERROR("Message to short");
137 }
138 }
139 }
140 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
141 /* Sizes */
142 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
143 " const int dim = %d; // The spatial dimension\n"
144 " const int N_bl = %d; // The number of concurrent blocks\n"
145 " const int N_b = %d; // The number of basis functions\n"
146 " const int N_comp = %d; // The number of basis function components\n"
147 " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
148 " const int N_q = %d; // The number of quadrature points\n"
149 " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
150 " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
151 " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
152 " const int N_sbc = N_bst / (N_q * N_comp);\n"
153 " const int N_sqc = N_bst / N_bt;\n"
154 " /*const int N_c = N_cb * N_bc;*/\n"
155 "\n"
156 " /* Calculated indices */\n"
157 " /*const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
158 " const int tidx = get_local_id(0);\n"
159 " const int blidx = tidx / N_bst; // Block number for this thread\n"
160 " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
161 " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
162 " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
163 " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
164 " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
165 " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
166 " const int Goffset = gidx*N_cb*N_bc;\n",
167 &count, dim, N_bl, N_b, N_c, N_q);CHKSTRINGERROR("Message to short");
168 /* Local memory */
169 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
170 "\n"
171 " /* Quadrature data */\n"
172 " %s w; // $w_q$, Quadrature weight at $x_q$\n"
173 " __local %s phi_i[%d]; //[N_bt*N_q]; // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
174 " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
175 " /* Geometric data */\n"
176 " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
177 " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
178 &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
179 numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);CHKSTRINGERROR("Message to short");
180 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
181 " /* FEM data */\n"
182 " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
183 &count, numeric_str, N_t*N_b*N_c);CHKSTRINGERROR("Message to short");
184 if (useAux) {
185 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186 " __local %s a_i[%d]; //[N_t]; // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
187 &count, numeric_str, N_t);CHKSTRINGERROR("Message to short");
188 }
189 if (useF0) {
190 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
191 " /* Intermediate calculations */\n"
192 " __local %s f_0[%d]; //[N_t*N_sqc]; // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
193 &count, numeric_str, N_t*N_q);CHKSTRINGERROR("Message to short");
194 }
195 if (useF1) {
196 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197 " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
198 &count, numeric_str, dim, N_t*N_q);CHKSTRINGERROR("Message to short");
199 }
200 /* TODO: If using elasticity, put in mu/lambda coefficients */
201 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202 " /* Output data */\n"
203 " %s e_i; // Coefficient $e_i$ of the residual\n\n",
204 &count, numeric_str);CHKSTRINGERROR("Message to short");
205 /* One-time loads */
206 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
207 " /* These should be generated inline */\n"
208 " /* Load quadrature weights */\n"
209 " w = weights[qidx];\n"
210 " /* Load basis tabulation \\phi_i for this cell */\n"
211 " if (tidx < N_bt*N_q) {\n"
212 " phi_i[tidx] = Basis[tidx];\n"
213 " phiDer_i[tidx] = BasisDerivatives[tidx];\n"
214 " }\n\n",
215 &count);CHKSTRINGERROR("Message to short");
216 /* Batch loads */
217 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
218 " for (int batch = 0; batch < N_cb; ++batch) {\n"
219 " /* Load geometry */\n"
220 " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
221 " for (int n = 0; n < dim*dim; ++n) {\n"
222 " const int offset = n*N_t;\n"
223 " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
224 " }\n"
225 " /* Load coefficients u_i for this cell */\n"
226 " for (int n = 0; n < N_bt; ++n) {\n"
227 " const int offset = n*N_t;\n"
228 " u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
229 " }\n",
230 &count);CHKSTRINGERROR("Message to short");
231 if (useAux) {
232 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
233 " /* Load coefficients a_i for this cell */\n"
234 " /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
235 " a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
236 &count);CHKSTRINGERROR("Message to short");
237 }
238 /* Quadrature phase */
239 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
240 " barrier(CLK_LOCAL_MEM_FENCE);\n"
241 "\n"
242 " /* Map coefficients to values at quadrature points */\n"
243 " for (int c = 0; c < N_sqc; ++c) {\n"
244 " const int cell = c*N_bl*N_b + blqidx;\n"
245 " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n",
246 &count);CHKSTRINGERROR("Message to short");
247 if (useField) {
248 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
249 " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n",
250 &count, numeric_str, N_c);CHKSTRINGERROR("Message to short");
251 }
252 if (useFieldDer) {
253 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
254 " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
255 &count, numeric_str, dim, N_c);CHKSTRINGERROR("Message to short");
256 }
257 if (useFieldAux) {
258 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
259 " %s a[%d]; //[1]; // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
260 &count, numeric_str, 1);CHKSTRINGERROR("Message to short");
261 }
262 if (useFieldDerAux) {
263 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264 " %s%d gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
265 &count, numeric_str, dim, 1);CHKSTRINGERROR("Message to short");
266 }
267 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
268 "\n"
269 " for (int comp = 0; comp < N_comp; ++comp) {\n",
270 &count);CHKSTRINGERROR("Message to short");
271 if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " u[comp] = 0.0;\n", &count);CHKSTRINGERROR("Message to short");}
272 if (useFieldDer) {
273 switch (dim) {
274 case 1:
275 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
276 case 2:
277 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
278 case 3:
279 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
280 }
281 }
282 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
283 " }\n",
284 &count);CHKSTRINGERROR("Message to short");
285 if (useFieldAux) {
286 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " a[0] = 0.0;\n", &count);CHKSTRINGERROR("Message to short");
287 }
288 if (useFieldDerAux) {
289 switch (dim) {
290 case 1:
291 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
292 case 2:
293 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
294 case 3:
295 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
296 }
297 }
298 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
299 " /* Get field and derivatives at this quadrature point */\n"
300 " for (int i = 0; i < N_b; ++i) {\n"
301 " for (int comp = 0; comp < N_comp; ++comp) {\n"
302 " const int b = i*N_comp+comp;\n"
303 " const int pidx = qidx*N_bt + b;\n"
304 " const int uidx = cell*N_bt + b;\n"
305 " %s%d realSpaceDer;\n\n",
306 &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
307 if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);CHKSTRINGERROR("Message to short");}
308 if (useFieldDer) {
309 switch (dim) {
310 case 2:
311 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
313 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
314 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
315 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
316 &count);CHKSTRINGERROR("Message to short");break;
317 case 3:
318 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
319 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
320 " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
321 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
322 " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
323 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
324 " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
325 &count);CHKSTRINGERROR("Message to short");break;
326 }
327 }
328 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
329 " }\n"
330 " }\n",
331 &count);CHKSTRINGERROR("Message to short");
332 if (useFieldAux) {
333 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," a[0] += a_i[cell];\n", &count);CHKSTRINGERROR("Message to short");
334 }
335 /* Calculate residual at quadrature points: Should be generated by an weak form egine */
336 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
337 " /* Process values at quadrature points */\n",
338 &count);CHKSTRINGERROR("Message to short");
339 switch (op) {
340 case LAPLACIAN:
341 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);CHKSTRINGERROR("Message to short");}
342 if (useF1) {
343 if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = a[0]*gradU[cidx];\n", &count);CHKSTRINGERROR("Message to short");}
344 else {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n", &count);CHKSTRINGERROR("Message to short");}
345 }
346 break;
347 case ELASTICITY:
348 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, " f_0[fidx] = 4.0;\n", &count);CHKSTRINGERROR("Message to short");}
349 if (useF1) {
350 switch (dim) {
351 case 2:
352 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
353 " switch (cidx) {\n"
354 " case 0:\n"
355 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
356 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
357 " break;\n"
358 " case 1:\n"
359 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
360 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
361 " }\n",
362 &count);CHKSTRINGERROR("Message to short");break;
363 case 3:
364 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
365 " switch (cidx) {\n"
366 " case 0:\n"
367 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
368 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
369 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
370 " break;\n"
371 " case 1:\n"
372 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
373 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
374 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
375 " break;\n"
376 " case 2:\n"
377 " f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
378 " f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
379 " f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
380 " }\n",
381 &count);CHKSTRINGERROR("Message to short");break;
382 }}
383 break;
384 default:
385 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
386 }
387 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_0[fidx] *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");}
388 if (useF1) {
389 switch (dim) {
390 case 1:
391 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
392 case 2:
393 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
394 case 3:
395 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
396 }
397 }
398 /* Thread transpose */
399 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
400 " }\n\n"
401 " /* ==== TRANSPOSE THREADS ==== */\n"
402 " barrier(CLK_LOCAL_MEM_FENCE);\n\n",
403 &count);CHKSTRINGERROR("Message to short");
404 /* Basis phase */
405 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
406 " /* Map values at quadrature points to coefficients */\n"
407 " for (int c = 0; c < N_sbc; ++c) {\n"
408 " const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
409 "\n"
410 " e_i = 0.0;\n"
411 " for (int q = 0; q < N_q; ++q) {\n"
412 " const int pidx = q*N_bt + bidx;\n"
413 " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
414 " %s%d realSpaceDer;\n\n",
415 &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
416
417 if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail," e_i += phi_i[pidx]*f_0[fidx];\n", &count);CHKSTRINGERROR("Message to short");}
418 if (useF1) {
419 switch (dim) {
420 case 2:
421 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
423 " e_i += realSpaceDer.x*f_1[fidx].x;\n"
424 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
425 " e_i += realSpaceDer.y*f_1[fidx].y;\n",
426 &count);CHKSTRINGERROR("Message to short");break;
427 case 3:
428 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
429 " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
430 " e_i += realSpaceDer.x*f_1[fidx].x;\n"
431 " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
432 " e_i += realSpaceDer.y*f_1[fidx].y;\n"
433 " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
434 " e_i += realSpaceDer.z*f_1[fidx].z;\n",
435 &count);CHKSTRINGERROR("Message to short");break;
436 }
437 }
438 ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
439 " }\n"
440 " /* Write element vector for N_{cbc} cells at a time */\n"
441 " elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
442 " }\n"
443 " /* ==== Could do one write per batch ==== */\n"
444 " }\n"
445 " return;\n"
446 "}\n",
447 &count);CHKSTRINGERROR("Message to short");
448 PetscFunctionReturn(0);
449 }
450
PetscFEOpenCLGetIntegrationKernel(PetscFE fem,PetscBool useAux,cl_program * ocl_prog,cl_kernel * ocl_kernel)451 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
452 {
453 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
454 PetscInt dim, N_bl;
455 PetscBool flg;
456 char *buffer;
457 size_t len;
458 char errMsg[8192];
459 cl_int err;
460 PetscErrorCode ierr;
461
462 PetscFunctionBegin;
463 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
464 ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr);
465 ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr);
466 ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr);
467 ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr);
468 if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);}
469 ierr = PetscStrlen(buffer,&len);CHKERRQ(ierr);
470 *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);CHKERRQ(err);
471 err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
472 if (err != CL_SUCCESS) {
473 err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);CHKERRQ(ierr);
474 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
475 }
476 ierr = PetscFree(buffer);CHKERRQ(ierr);
477 *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);
478 PetscFunctionReturn(0);
479 }
480
PetscFEOpenCLCalculateGrid(PetscFE fem,PetscInt N,PetscInt blockSize,size_t * x,size_t * y,size_t * z)481 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
482 {
483 const PetscInt Nblocks = N/blockSize;
484
485 PetscFunctionBegin;
486 if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
487 *z = 1;
488 *y = 1;
489 for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
490 *y = Nblocks / *x;
491 if (*x * *y == Nblocks) break;
492 }
493 if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
494 PetscFunctionReturn(0);
495 }
496
PetscFEOpenCLLogResidual(PetscFE fem,PetscLogDouble time,PetscLogDouble flops)497 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
498 {
499 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
500 PetscStageLog stageLog;
501 PetscEventPerfLog eventLog = NULL;
502 int stage;
503 PetscErrorCode ierr;
504
505 PetscFunctionBegin;
506 ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
507 ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
508 ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
509 /* Log performance info */
510 eventLog->eventInfo[ocl->residualEvent].count++;
511 eventLog->eventInfo[ocl->residualEvent].time += time;
512 eventLog->eventInfo[ocl->residualEvent].flops += flops;
513 PetscFunctionReturn(0);
514 }
515
PetscFEIntegrateResidual_OpenCL(PetscDS prob,PetscInt field,PetscInt Ne,PetscFEGeom * cgeom,const PetscScalar coefficients[],const PetscScalar coefficients_t[],PetscDS probAux,const PetscScalar coefficientsAux[],PetscReal t,PetscScalar elemVec[])516 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom,
517 const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
518 {
519 /* Nbc = batchSize */
520 PetscFE fem;
521 PetscFE_OpenCL *ocl;
522 PetscPointFunc f0_func;
523 PetscPointFunc f1_func;
524 PetscQuadrature q;
525 PetscInt dim, qNc;
526 PetscInt N_b; /* The number of basis functions */
527 PetscInt N_comp; /* The number of basis function components */
528 PetscInt N_bt; /* The total number of scalar basis functions */
529 PetscInt N_q; /* The number of quadrature points */
530 PetscInt N_bst; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
531 PetscInt N_t; /* The number of threads, N_bst * N_bl */
532 PetscInt N_bl; /* The number of blocks */
533 PetscInt N_bc; /* The batch size, N_bl*N_q*N_b */
534 PetscInt N_cb; /* The number of batches */
535 PetscInt numFlops, f0Flops = 0, f1Flops = 0;
536 PetscBool useAux = probAux ? PETSC_TRUE : PETSC_FALSE;
537 PetscBool useField = PETSC_FALSE;
538 PetscBool useFieldDer = PETSC_TRUE;
539 PetscBool useF0 = PETSC_TRUE;
540 PetscBool useF1 = PETSC_TRUE;
541 /* OpenCL variables */
542 cl_program ocl_prog;
543 cl_kernel ocl_kernel;
544 cl_event ocl_ev; /* The event for tracking kernel execution */
545 cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
546 cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
547 cl_mem o_jacobianInverses, o_jacobianDeterminants;
548 cl_mem o_coefficients, o_coefficientsAux, o_elemVec;
549 float *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
550 double *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
551 PetscReal *r_invJ = NULL, *r_detJ = NULL;
552 void *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
553 size_t local_work_size[3], global_work_size[3];
554 size_t realSize, x, y, z;
555 const PetscReal *points, *weights;
556 PetscErrorCode ierr;
557
558 PetscFunctionBegin;
559 ierr = PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);CHKERRQ(ierr);
560 ocl = (PetscFE_OpenCL *) fem->data;
561 if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);}
562 ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
563 ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
564 ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
565 if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
566 ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
567 ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr);
568 ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr);
569 ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr);
570 N_bt = N_b*N_comp;
571 N_bst = N_bt*N_q;
572 N_t = N_bst*N_bl;
573 if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
574 /* Calculate layout */
575 if (Ne % (N_cb*N_bc)) { /* Remainder cells */
576 ierr = PetscFEIntegrateResidual_Basic(prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr);
577 PetscFunctionReturn(0);
578 }
579 ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr);
580 local_work_size[0] = N_bc*N_comp;
581 local_work_size[1] = 1;
582 local_work_size[2] = 1;
583 global_work_size[0] = x * local_work_size[0];
584 global_work_size[1] = y * local_work_size[1];
585 global_work_size[2] = z * local_work_size[2];
586 ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr);
587 ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
588 /* Generate code */
589 if (probAux) {
590 PetscSpace P;
591 PetscInt NfAux, order, f;
592
593 ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr);
594 for (f = 0; f < NfAux; ++f) {
595 PetscFE feAux;
596
597 ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr);
598 ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr);
599 ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr);
600 if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
601 }
602 }
603 ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr);
604 /* Create buffers on the device and send data over */
605 ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr);
606 if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
607 if (sizeof(PetscReal) != realSize) {
608 switch (ocl->realType) {
609 case PETSC_FLOAT:
610 {
611 PetscInt c, b, d;
612
613 ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr);
614 for (c = 0; c < Ne; ++c) {
615 f_detJ[c] = (float) cgeom->detJ[c];
616 for (d = 0; d < dim*dim; ++d) {
617 f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
618 }
619 for (b = 0; b < N_bt; ++b) {
620 f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
621 }
622 }
623 if (coefficientsAux) { /* Assume P0 */
624 for (c = 0; c < Ne; ++c) {
625 f_coeffAux[c] = (float) coefficientsAux[c];
626 }
627 }
628 oclCoeff = (void *) f_coeff;
629 if (coefficientsAux) {
630 oclCoeffAux = (void *) f_coeffAux;
631 } else {
632 oclCoeffAux = NULL;
633 }
634 oclInvJ = (void *) f_invJ;
635 oclDetJ = (void *) f_detJ;
636 }
637 break;
638 case PETSC_DOUBLE:
639 {
640 PetscInt c, b, d;
641
642 ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr);
643 for (c = 0; c < Ne; ++c) {
644 d_detJ[c] = (double) cgeom->detJ[c];
645 for (d = 0; d < dim*dim; ++d) {
646 d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
647 }
648 for (b = 0; b < N_bt; ++b) {
649 d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
650 }
651 }
652 if (coefficientsAux) { /* Assume P0 */
653 for (c = 0; c < Ne; ++c) {
654 d_coeffAux[c] = (double) coefficientsAux[c];
655 }
656 }
657 oclCoeff = (void *) d_coeff;
658 if (coefficientsAux) {
659 oclCoeffAux = (void *) d_coeffAux;
660 } else {
661 oclCoeffAux = NULL;
662 }
663 oclInvJ = (void *) d_invJ;
664 oclDetJ = (void *) d_detJ;
665 }
666 break;
667 default:
668 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
669 }
670 } else {
671 PetscInt c, d;
672
673 ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr);
674 for (c = 0; c < Ne; ++c) {
675 r_detJ[c] = cgeom->detJ[c];
676 for (d = 0; d < dim*dim; ++d) {
677 r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
678 }
679 }
680 oclCoeff = (void *) coefficients;
681 oclCoeffAux = (void *) coefficientsAux;
682 oclInvJ = (void *) r_invJ;
683 oclDetJ = (void *) r_detJ;
684 }
685 o_coefficients = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt * realSize, oclCoeff, &ierr);CHKERRQ(ierr);
686 if (coefficientsAux) {
687 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
688 } else {
689 o_coefficientsAux = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY, Ne * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
690 }
691 o_jacobianInverses = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ, &ierr);CHKERRQ(ierr);
692 o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne * realSize, oclDetJ, &ierr);CHKERRQ(ierr);
693 o_elemVec = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY, Ne*N_bt * realSize, NULL, &ierr);CHKERRQ(ierr);
694 /* Kernel launch */
695 ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr);
696 ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr);
697 ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr);
698 ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr);
699 ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr);
700 ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr);
701 ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);
702 /* Read data back from device */
703 if (sizeof(PetscReal) != realSize) {
704 switch (ocl->realType) {
705 case PETSC_FLOAT:
706 {
707 float *elem;
708 PetscInt c, b;
709
710 ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr);
711 ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
712 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
713 for (c = 0; c < Ne; ++c) {
714 for (b = 0; b < N_bt; ++b) {
715 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
716 }
717 }
718 ierr = PetscFree(elem);CHKERRQ(ierr);
719 }
720 break;
721 case PETSC_DOUBLE:
722 {
723 double *elem;
724 PetscInt c, b;
725
726 ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr);
727 ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
728 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
729 for (c = 0; c < Ne; ++c) {
730 for (b = 0; b < N_bt; ++b) {
731 elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
732 }
733 }
734 ierr = PetscFree(elem);CHKERRQ(ierr);
735 }
736 break;
737 default:
738 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
739 }
740 } else {
741 ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr);
742 ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr);
743 }
744 /* Log performance */
745 ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
746 ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr);
747 f0Flops = 0;
748 switch (ocl->op) {
749 case LAPLACIAN:
750 f1Flops = useAux ? dim : 0;break;
751 case ELASTICITY:
752 f1Flops = 2*dim*dim;break;
753 }
754 numFlops = Ne*(
755 N_q*(
756 N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
757 /*+
758 N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
759 +
760 N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
761 +
762 N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
763 ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr);
764 /* Cleanup */
765 ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr);
766 ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr);
767 ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr);
768 ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr);
769 ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr);
770 ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
771 ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
772 PetscFunctionReturn(0);
773 }
774
775 PETSC_EXTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
776 PETSC_EXTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
777
PetscFEInitialize_OpenCL(PetscFE fem)778 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
779 {
780 PetscFunctionBegin;
781 fem->ops->setfromoptions = NULL;
782 fem->ops->setup = PetscFESetUp_Basic;
783 fem->ops->view = NULL;
784 fem->ops->destroy = PetscFEDestroy_OpenCL;
785 fem->ops->getdimension = PetscFEGetDimension_Basic;
786 fem->ops->createtabulation = PetscFECreateTabulation_Basic;
787 fem->ops->integrateresidual = PetscFEIntegrateResidual_OpenCL;
788 fem->ops->integratebdresidual = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
789 fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
790 fem->ops->integratejacobian = PetscFEIntegrateJacobian_Basic;
791 PetscFunctionReturn(0);
792 }
793
794 /*MC
795 PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
796
797 Level: intermediate
798
799 .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
800 M*/
801
PetscFECreate_OpenCL(PetscFE fem)802 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
803 {
804 PetscFE_OpenCL *ocl;
805 cl_uint num_platforms;
806 cl_platform_id platform_ids[42];
807 cl_uint num_devices;
808 cl_device_id device_ids[42];
809 cl_int err;
810 PetscErrorCode ierr;
811
812 PetscFunctionBegin;
813 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
814 ierr = PetscNewLog(fem,&ocl);CHKERRQ(ierr);
815 fem->data = ocl;
816
817 /* Init Platform */
818 ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr);
819 if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
820 ocl->pf_id = platform_ids[0];
821 /* Init Device */
822 ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr);
823 if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
824 ocl->dev_id = device_ids[0];
825 /* Create context with one command queue */
826 ocl->ctx_id = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);CHKERRQ(err);
827 ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);CHKERRQ(err);
828 /* Types */
829 ocl->realType = PETSC_FLOAT;
830 /* Register events */
831 ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr);
832 /* Equation handling */
833 ocl->op = LAPLACIAN;
834
835 ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr);
836 PetscFunctionReturn(0);
837 }
838
839 /*@
840 PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
841
842 Input Parameters:
843 + fem - The PetscFE
844 - realType - The scalar type
845
846 Level: developer
847
848 .seealso: PetscFEOpenCLGetRealType()
849 @*/
PetscFEOpenCLSetRealType(PetscFE fem,PetscDataType realType)850 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
851 {
852 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
853
854 PetscFunctionBegin;
855 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
856 ocl->realType = realType;
857 PetscFunctionReturn(0);
858 }
859
860 /*@
861 PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
862
863 Input Parameter:
864 . fem - The PetscFE
865
866 Output Parameter:
867 . realType - The scalar type
868
869 Level: developer
870
871 .seealso: PetscFEOpenCLSetRealType()
872 @*/
PetscFEOpenCLGetRealType(PetscFE fem,PetscDataType * realType)873 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
874 {
875 PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
876
877 PetscFunctionBegin;
878 PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
879 PetscValidPointer(realType, 2);
880 *realType = ocl->realType;
881 PetscFunctionReturn(0);
882 }
883
884 #endif /* PETSC_HAVE_OPENCL */
885