1 #include <petsc/private/petscfeimpl.h> /*I "petscfe.h" I*/
2 
3 #if defined(PETSC_HAVE_OPENCL)
4 
PetscFEDestroy_OpenCL(PetscFE fem)5 static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
6 {
7   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
8   PetscErrorCode  ierr;
9 
10   PetscFunctionBegin;
11   ierr = clReleaseCommandQueue(ocl->queue_id);CHKERRQ(ierr);
12   ocl->queue_id = 0;
13   ierr = clReleaseContext(ocl->ctx_id);CHKERRQ(ierr);
14   ocl->ctx_id = 0;
15   ierr = PetscFree(ocl);CHKERRQ(ierr);
16   PetscFunctionReturn(0);
17 }
18 
19 #define CHKSTRINGERROR(MSG) do {CHKERRQ(ierr); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, MSG);} while (0)
20 enum {LAPLACIAN = 0, ELASTICITY = 1};
21 
22 /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
23 /* dim     Number of spatial dimensions:          2                   */
24 /* N_b     Number of basis functions:             generated           */
25 /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
26 /* N_q     Number of quadrature points:           generated           */
27 /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
28 /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
29 /* N_{bl}  Number of concurrent blocks            generated           */
30 /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
31 /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
32 /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
33 /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
34 /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
35 /* N_{cb}  Number of serial cell batches:         input               */
36 /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
PetscFEOpenCLGenerateIntegrationCode(PetscFE fem,char ** string_buffer,PetscInt buffer_length,PetscBool useAux,PetscInt N_bl)37 static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
38 {
39   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
40   PetscQuadrature q;
41   char           *string_tail   = *string_buffer;
42   char           *end_of_buffer = *string_buffer + buffer_length;
43   char            float_str[]   = "float", double_str[]  = "double";
44   char           *numeric_str   = &(float_str[0]);
45   PetscInt        op            = ocl->op;
46   PetscBool       useField      = PETSC_FALSE;
47   PetscBool       useFieldDer   = PETSC_TRUE;
48   PetscBool       useFieldAux   = useAux;
49   PetscBool       useFieldDerAux= PETSC_FALSE;
50   PetscBool       useF0         = PETSC_TRUE;
51   PetscBool       useF1         = PETSC_TRUE;
52   const PetscReal *points, *weights;
53   PetscTabulation T;
54   PetscInt        dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
55   size_t          count;
56   PetscErrorCode  ierr;
57 
58   PetscFunctionBegin;
59   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
60   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
61   ierr = PetscFEGetNumComponents(fem, &N_c);CHKERRQ(ierr);
62   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
63   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
64   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
65   N_t  = N_b * N_c * N_q * N_bl;
66   /* Enable device extension for double precision */
67   if (ocl->realType == PETSC_DOUBLE) {
68     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
69 "#if defined(cl_khr_fp64)\n"
70 "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
71 "#elif defined(cl_amd_fp64)\n"
72 "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
73 "#endif\n",
74                               &count);CHKSTRINGERROR("Message to short");
75     numeric_str  = &(double_str[0]);
76   }
77   /* Kernel API */
78   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
79 "\n"
80 "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
81 "{\n",
82                        &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);CHKSTRINGERROR("Message to short");
83   /* Quadrature */
84   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
85 "  /* Quadrature points\n"
86 "   - (x1,y1,x2,y2,...) */\n"
87 "  const %s points[%d] = {\n",
88                        &count, numeric_str, N_q*dim);CHKSTRINGERROR("Message to short");
89   for (p = 0; p < N_q; ++p) {
90     for (d = 0; d < dim; ++d) {
91       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);CHKSTRINGERROR("Message to short");
92     }
93   }
94   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
95   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
96 "  /* Quadrature weights\n"
97 "   - (v1,v2,...) */\n"
98 "  const %s weights[%d] = {\n",
99                        &count, numeric_str, N_q);CHKSTRINGERROR("Message to short");
100   for (p = 0; p < N_q; ++p) {
101     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);CHKSTRINGERROR("Message to short");
102   }
103   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
104   /* Basis Functions */
105   ierr = PetscFEGetCellTabulation(fem, &T);CHKERRQ(ierr);
106   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107 "  /* Nodal basis function evaluations\n"
108 "    - basis component is fastest varying, the basis function, then point */\n"
109 "  const %s Basis[%d] = {\n",
110                        &count, numeric_str, N_q*N_b*N_c);CHKSTRINGERROR("Message to short");
111   for (p = 0; p < N_q; ++p) {
112     for (b = 0; b < N_b; ++b) {
113       for (c = 0; c < N_c; ++c) {
114         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);CHKSTRINGERROR("Message to short");
115       }
116     }
117   }
118   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
119   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
120 "\n"
121 "  /* Nodal basis function derivative evaluations,\n"
122 "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
123 "  const %s%d BasisDerivatives[%d] = {\n",
124                        &count, numeric_str, dim, N_q*N_b*N_c);CHKSTRINGERROR("Message to short");
125   for (p = 0; p < N_q; ++p) {
126     for (b = 0; b < N_b; ++b) {
127       for (c = 0; c < N_c; ++c) {
128         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
129         for (d = 0; d < dim; ++d) {
130           if (d > 0) {
131             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKSTRINGERROR("Message to short");
132           } else {
133             ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKSTRINGERROR("Message to short");
134           }
135         }
136         ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);CHKSTRINGERROR("Message to short");
137       }
138     }
139   }
140   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKSTRINGERROR("Message to short");
141   /* Sizes */
142   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
143 "  const int dim    = %d;                           // The spatial dimension\n"
144 "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
145 "  const int N_b    = %d;                           // The number of basis functions\n"
146 "  const int N_comp = %d;                           // The number of basis function components\n"
147 "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
148 "  const int N_q    = %d;                           // The number of quadrature points\n"
149 "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
150 "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
151 "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
152 "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
153 "  const int N_sqc  = N_bst / N_bt;\n"
154 "  /*const int N_c    = N_cb * N_bc;*/\n"
155 "\n"
156 "  /* Calculated indices */\n"
157 "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
158 "  const int tidx    = get_local_id(0);\n"
159 "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
160 "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
161 "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
162 "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
163 "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
164 "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
165 "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
166 "  const int Goffset = gidx*N_cb*N_bc;\n",
167                             &count, dim, N_bl, N_b, N_c, N_q);CHKSTRINGERROR("Message to short");
168   /* Local memory */
169   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
170 "\n"
171 "  /* Quadrature data */\n"
172 "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
173 "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
174 "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
175 "  /* Geometric data */\n"
176 "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
177 "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
178                             &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
179                             numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);CHKSTRINGERROR("Message to short");
180   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
181 "  /* FEM data */\n"
182 "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
183                             &count, numeric_str, N_t*N_b*N_c);CHKSTRINGERROR("Message to short");
184   if (useAux) {
185     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186 "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
187                             &count, numeric_str, N_t);CHKSTRINGERROR("Message to short");
188   }
189   if (useF0) {
190     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
191 "  /* Intermediate calculations */\n"
192 "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
193                               &count, numeric_str, N_t*N_q);CHKSTRINGERROR("Message to short");
194   }
195   if (useF1) {
196     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197 "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
198                               &count, numeric_str, dim, N_t*N_q);CHKSTRINGERROR("Message to short");
199   }
200   /* TODO: If using elasticity, put in mu/lambda coefficients */
201   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202 "  /* Output data */\n"
203 "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
204                             &count, numeric_str);CHKSTRINGERROR("Message to short");
205   /* One-time loads */
206   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
207 "  /* These should be generated inline */\n"
208 "  /* Load quadrature weights */\n"
209 "  w = weights[qidx];\n"
210 "  /* Load basis tabulation \\phi_i for this cell */\n"
211 "  if (tidx < N_bt*N_q) {\n"
212 "    phi_i[tidx]    = Basis[tidx];\n"
213 "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
214 "  }\n\n",
215                        &count);CHKSTRINGERROR("Message to short");
216   /* Batch loads */
217   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
218 "  for (int batch = 0; batch < N_cb; ++batch) {\n"
219 "    /* Load geometry */\n"
220 "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
221 "    for (int n = 0; n < dim*dim; ++n) {\n"
222 "      const int offset = n*N_t;\n"
223 "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
224 "    }\n"
225 "    /* Load coefficients u_i for this cell */\n"
226 "    for (int n = 0; n < N_bt; ++n) {\n"
227 "      const int offset = n*N_t;\n"
228 "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
229 "    }\n",
230                        &count);CHKSTRINGERROR("Message to short");
231   if (useAux) {
232   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
233 "    /* Load coefficients a_i for this cell */\n"
234 "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
235 "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
236                             &count);CHKSTRINGERROR("Message to short");
237   }
238   /* Quadrature phase */
239   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
240 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
241 "\n"
242 "    /* Map coefficients to values at quadrature points */\n"
243 "    for (int c = 0; c < N_sqc; ++c) {\n"
244 "      const int cell          = c*N_bl*N_b + blqidx;\n"
245 "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
246                        &count);CHKSTRINGERROR("Message to short");
247   if (useField) {
248     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
249 "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n",
250                               &count, numeric_str, N_c);CHKSTRINGERROR("Message to short");
251   }
252   if (useFieldDer) {
253     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
254 "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
255                               &count, numeric_str, dim, N_c);CHKSTRINGERROR("Message to short");
256   }
257   if (useFieldAux) {
258     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
259 "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
260                               &count, numeric_str, 1);CHKSTRINGERROR("Message to short");
261   }
262   if (useFieldDerAux) {
263     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264 "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
265                               &count, numeric_str, dim, 1);CHKSTRINGERROR("Message to short");
266   }
267   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
268 "\n"
269 "      for (int comp = 0; comp < N_comp; ++comp) {\n",
270                             &count);CHKSTRINGERROR("Message to short");
271   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count);CHKSTRINGERROR("Message to short");}
272   if (useFieldDer) {
273     switch (dim) {
274     case 1:
275       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
276     case 2:
277       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
278     case 3:
279       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
280     }
281   }
282   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
283 "      }\n",
284                             &count);CHKSTRINGERROR("Message to short");
285   if (useFieldAux) {
286     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count);CHKSTRINGERROR("Message to short");
287   }
288   if (useFieldDerAux) {
289     switch (dim) {
290     case 1:
291       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
292     case 2:
293       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
294     case 3:
295       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);CHKSTRINGERROR("Message to short");break;
296     }
297   }
298   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
299 "      /* Get field and derivatives at this quadrature point */\n"
300 "      for (int i = 0; i < N_b; ++i) {\n"
301 "        for (int comp = 0; comp < N_comp; ++comp) {\n"
302 "          const int b    = i*N_comp+comp;\n"
303 "          const int pidx = qidx*N_bt + b;\n"
304 "          const int uidx = cell*N_bt + b;\n"
305 "          %s%d   realSpaceDer;\n\n",
306                             &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
307   if (useField) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);CHKSTRINGERROR("Message to short");}
308   if (useFieldDer) {
309     switch (dim) {
310     case 2:
311       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312 "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
313 "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
314 "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
315 "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
316                            &count);CHKSTRINGERROR("Message to short");break;
317     case 3:
318       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
319 "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
320 "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
321 "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
322 "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
323 "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
324 "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
325                            &count);CHKSTRINGERROR("Message to short");break;
326     }
327   }
328   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
329 "        }\n"
330 "      }\n",
331                             &count);CHKSTRINGERROR("Message to short");
332   if (useFieldAux) {
333     ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          a[0] += a_i[cell];\n", &count);CHKSTRINGERROR("Message to short");
334   }
335   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
336   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
337 "      /* Process values at quadrature points */\n",
338                             &count);CHKSTRINGERROR("Message to short");
339   switch (op) {
340   case LAPLACIAN:
341     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKSTRINGERROR("Message to short");}
342     if (useF1) {
343       if (useAux) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count);CHKSTRINGERROR("Message to short");}
344       else        {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count);CHKSTRINGERROR("Message to short");}
345     }
346     break;
347   case ELASTICITY:
348     if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKSTRINGERROR("Message to short");}
349     if (useF1) {
350     switch (dim) {
351     case 2:
352       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
353 "      switch (cidx) {\n"
354 "      case 0:\n"
355 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
356 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
357 "        break;\n"
358 "      case 1:\n"
359 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
360 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
361 "      }\n",
362                            &count);CHKSTRINGERROR("Message to short");break;
363     case 3:
364       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
365 "      switch (cidx) {\n"
366 "      case 0:\n"
367 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
368 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
369 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
370 "        break;\n"
371 "      case 1:\n"
372 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
373 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
374 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
375 "        break;\n"
376 "      case 2:\n"
377 "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
378 "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
379 "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
380 "      }\n",
381                            &count);CHKSTRINGERROR("Message to short");break;
382     }}
383     break;
384   default:
385     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
386   }
387   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_0[fidx] *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");}
388   if (useF1) {
389     switch (dim) {
390     case 1:
391       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
392     case 2:
393       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
394     case 3:
395       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);CHKSTRINGERROR("Message to short");break;
396     }
397   }
398   /* Thread transpose */
399   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
400 "    }\n\n"
401 "    /* ==== TRANSPOSE THREADS ==== */\n"
402 "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
403                        &count);CHKSTRINGERROR("Message to short");
404   /* Basis phase */
405   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
406 "    /* Map values at quadrature points to coefficients */\n"
407 "    for (int c = 0; c < N_sbc; ++c) {\n"
408 "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
409 "\n"
410 "      e_i = 0.0;\n"
411 "      for (int q = 0; q < N_q; ++q) {\n"
412 "        const int pidx = q*N_bt + bidx;\n"
413 "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
414 "        %s%d   realSpaceDer;\n\n",
415                        &count, numeric_str, dim);CHKSTRINGERROR("Message to short");
416 
417   if (useF0) {ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"        e_i += phi_i[pidx]*f_0[fidx];\n", &count);CHKSTRINGERROR("Message to short");}
418   if (useF1) {
419     switch (dim) {
420     case 2:
421       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422 "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
423 "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
424 "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
425 "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
426                            &count);CHKSTRINGERROR("Message to short");break;
427     case 3:
428       ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
429 "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
430 "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
431 "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
432 "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
433 "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
434 "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
435                            &count);CHKSTRINGERROR("Message to short");break;
436     }
437   }
438   ierr = PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
439 "      }\n"
440 "      /* Write element vector for N_{cbc} cells at a time */\n"
441 "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
442 "    }\n"
443 "    /* ==== Could do one write per batch ==== */\n"
444 "  }\n"
445 "  return;\n"
446 "}\n",
447                        &count);CHKSTRINGERROR("Message to short");
448   PetscFunctionReturn(0);
449 }
450 
PetscFEOpenCLGetIntegrationKernel(PetscFE fem,PetscBool useAux,cl_program * ocl_prog,cl_kernel * ocl_kernel)451 static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
452 {
453   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
454   PetscInt        dim, N_bl;
455   PetscBool       flg;
456   char           *buffer;
457   size_t          len;
458   char            errMsg[8192];
459   cl_int          err;
460   PetscErrorCode  ierr;
461 
462   PetscFunctionBegin;
463   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
464   ierr = PetscMalloc1(8192, &buffer);CHKERRQ(ierr);
465   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);CHKERRQ(ierr);
466   ierr = PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);CHKERRQ(ierr);
467   ierr = PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);CHKERRQ(ierr);
468   if (flg) {ierr = PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);CHKERRQ(ierr);}
469   ierr = PetscStrlen(buffer,&len);CHKERRQ(ierr);
470   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);CHKERRQ(err);
471   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
472   if (err != CL_SUCCESS) {
473     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);CHKERRQ(ierr);
474     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
475   }
476   ierr = PetscFree(buffer);CHKERRQ(ierr);
477   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
PetscFEOpenCLCalculateGrid(PetscFE fem,PetscInt N,PetscInt blockSize,size_t * x,size_t * y,size_t * z)481 static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
482 {
483   const PetscInt Nblocks = N/blockSize;
484 
485   PetscFunctionBegin;
486   if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
487   *z = 1;
488   *y = 1;
489   for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
490     *y = Nblocks / *x;
491     if (*x * *y == Nblocks) break;
492   }
493   if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
494   PetscFunctionReturn(0);
495 }
496 
PetscFEOpenCLLogResidual(PetscFE fem,PetscLogDouble time,PetscLogDouble flops)497 static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
498 {
499   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
500   PetscStageLog     stageLog;
501   PetscEventPerfLog eventLog = NULL;
502   int               stage;
503   PetscErrorCode    ierr;
504 
505   PetscFunctionBegin;
506   ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
507   ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
508   ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
509     /* Log performance info */
510   eventLog->eventInfo[ocl->residualEvent].count++;
511   eventLog->eventInfo[ocl->residualEvent].time  += time;
512   eventLog->eventInfo[ocl->residualEvent].flops += flops;
513   PetscFunctionReturn(0);
514 }
515 
PetscFEIntegrateResidual_OpenCL(PetscDS prob,PetscInt field,PetscInt Ne,PetscFEGeom * cgeom,const PetscScalar coefficients[],const PetscScalar coefficients_t[],PetscDS probAux,const PetscScalar coefficientsAux[],PetscReal t,PetscScalar elemVec[])516 static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscInt field, PetscInt Ne, PetscFEGeom *cgeom,
517                                                       const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
518 {
519   /* Nbc = batchSize */
520   PetscFE           fem;
521   PetscFE_OpenCL   *ocl;
522   PetscPointFunc    f0_func;
523   PetscPointFunc    f1_func;
524   PetscQuadrature   q;
525   PetscInt          dim, qNc;
526   PetscInt          N_b;    /* The number of basis functions */
527   PetscInt          N_comp; /* The number of basis function components */
528   PetscInt          N_bt;   /* The total number of scalar basis functions */
529   PetscInt          N_q;    /* The number of quadrature points */
530   PetscInt          N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
531   PetscInt          N_t;    /* The number of threads, N_bst * N_bl */
532   PetscInt          N_bl;   /* The number of blocks */
533   PetscInt          N_bc;   /* The batch size, N_bl*N_q*N_b */
534   PetscInt          N_cb;   /* The number of batches */
535   PetscInt          numFlops, f0Flops = 0, f1Flops = 0;
536   PetscBool         useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
537   PetscBool         useField    = PETSC_FALSE;
538   PetscBool         useFieldDer = PETSC_TRUE;
539   PetscBool         useF0       = PETSC_TRUE;
540   PetscBool         useF1       = PETSC_TRUE;
541   /* OpenCL variables */
542   cl_program        ocl_prog;
543   cl_kernel         ocl_kernel;
544   cl_event          ocl_ev;         /* The event for tracking kernel execution */
545   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
546   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */
547   cl_mem            o_jacobianInverses, o_jacobianDeterminants;
548   cl_mem            o_coefficients, o_coefficientsAux, o_elemVec;
549   float            *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
550   double           *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
551   PetscReal        *r_invJ = NULL, *r_detJ = NULL;
552   void             *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
553   size_t            local_work_size[3], global_work_size[3];
554   size_t            realSize, x, y, z;
555   const PetscReal   *points, *weights;
556   PetscErrorCode    ierr;
557 
558   PetscFunctionBegin;
559   ierr = PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);CHKERRQ(ierr);
560   ocl  = (PetscFE_OpenCL *) fem->data;
561   if (!Ne) {ierr = PetscFEOpenCLLogResidual(fem, 0.0, 0.0);CHKERRQ(ierr); PetscFunctionReturn(0);}
562   ierr = PetscFEGetSpatialDimension(fem, &dim);CHKERRQ(ierr);
563   ierr = PetscFEGetQuadrature(fem, &q);CHKERRQ(ierr);
564   ierr = PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);CHKERRQ(ierr);
565   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
566   ierr = PetscFEGetDimension(fem, &N_b);CHKERRQ(ierr);
567   ierr = PetscFEGetNumComponents(fem, &N_comp);CHKERRQ(ierr);
568   ierr = PetscDSGetResidual(prob, field, &f0_func, &f1_func);CHKERRQ(ierr);
569   ierr = PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);CHKERRQ(ierr);
570   N_bt  = N_b*N_comp;
571   N_bst = N_bt*N_q;
572   N_t   = N_bst*N_bl;
573   if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
574   /* Calculate layout */
575   if (Ne % (N_cb*N_bc)) { /* Remainder cells */
576     ierr = PetscFEIntegrateResidual_Basic(prob, field, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);CHKERRQ(ierr);
577     PetscFunctionReturn(0);
578   }
579   ierr = PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);CHKERRQ(ierr);
580   local_work_size[0]  = N_bc*N_comp;
581   local_work_size[1]  = 1;
582   local_work_size[2]  = 1;
583   global_work_size[0] = x * local_work_size[0];
584   global_work_size[1] = y * local_work_size[1];
585   global_work_size[2] = z * local_work_size[2];
586   ierr = PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);CHKERRQ(ierr);
587   ierr = PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
588   /* Generate code */
589   if (probAux) {
590     PetscSpace P;
591     PetscInt   NfAux, order, f;
592 
593     ierr = PetscDSGetNumFields(probAux, &NfAux);CHKERRQ(ierr);
594     for (f = 0; f < NfAux; ++f) {
595       PetscFE feAux;
596 
597       ierr = PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);CHKERRQ(ierr);
598       ierr = PetscFEGetBasisSpace(feAux, &P);CHKERRQ(ierr);
599       ierr = PetscSpaceGetDegree(P, &order, NULL);CHKERRQ(ierr);
600       if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
601     }
602   }
603   ierr = PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);CHKERRQ(ierr);
604   /* Create buffers on the device and send data over */
605   ierr = PetscDataTypeGetSize(ocl->realType, &realSize);CHKERRQ(ierr);
606   if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
607   if (sizeof(PetscReal) != realSize) {
608     switch (ocl->realType) {
609     case PETSC_FLOAT:
610     {
611       PetscInt c, b, d;
612 
613       ierr = PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);CHKERRQ(ierr);
614       for (c = 0; c < Ne; ++c) {
615         f_detJ[c] = (float) cgeom->detJ[c];
616         for (d = 0; d < dim*dim; ++d) {
617           f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
618         }
619         for (b = 0; b < N_bt; ++b) {
620           f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
621         }
622       }
623       if (coefficientsAux) { /* Assume P0 */
624         for (c = 0; c < Ne; ++c) {
625           f_coeffAux[c] = (float) coefficientsAux[c];
626         }
627       }
628       oclCoeff      = (void *) f_coeff;
629       if (coefficientsAux) {
630         oclCoeffAux = (void *) f_coeffAux;
631       } else {
632         oclCoeffAux = NULL;
633       }
634       oclInvJ       = (void *) f_invJ;
635       oclDetJ       = (void *) f_detJ;
636     }
637     break;
638     case PETSC_DOUBLE:
639     {
640       PetscInt c, b, d;
641 
642       ierr = PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);CHKERRQ(ierr);
643       for (c = 0; c < Ne; ++c) {
644         d_detJ[c] = (double) cgeom->detJ[c];
645         for (d = 0; d < dim*dim; ++d) {
646           d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
647         }
648         for (b = 0; b < N_bt; ++b) {
649           d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
650         }
651       }
652       if (coefficientsAux) { /* Assume P0 */
653         for (c = 0; c < Ne; ++c) {
654           d_coeffAux[c] = (double) coefficientsAux[c];
655         }
656       }
657       oclCoeff      = (void *) d_coeff;
658       if (coefficientsAux) {
659         oclCoeffAux = (void *) d_coeffAux;
660       } else {
661         oclCoeffAux = NULL;
662       }
663       oclInvJ       = (void *) d_invJ;
664       oclDetJ       = (void *) d_detJ;
665     }
666     break;
667     default:
668       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
669     }
670   } else {
671     PetscInt c, d;
672 
673     ierr = PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);CHKERRQ(ierr);
674     for (c = 0; c < Ne; ++c) {
675       r_detJ[c] = cgeom->detJ[c];
676       for (d = 0; d < dim*dim; ++d) {
677         r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
678       }
679     }
680     oclCoeff    = (void *) coefficients;
681     oclCoeffAux = (void *) coefficientsAux;
682     oclInvJ     = (void *) r_invJ;
683     oclDetJ     = (void *) r_detJ;
684   }
685   o_coefficients         = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * realSize, oclCoeff,    &ierr);CHKERRQ(ierr);
686   if (coefficientsAux) {
687     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
688   } else {
689     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY,                        Ne         * realSize, oclCoeffAux, &ierr);CHKERRQ(ierr);
690   }
691   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ,     &ierr);CHKERRQ(ierr);
692   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclDetJ,     &ierr);CHKERRQ(ierr);
693   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY,                       Ne*N_bt    * realSize, NULL,        &ierr);CHKERRQ(ierr);
694   /* Kernel launch */
695   ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);CHKERRQ(ierr);
696   ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);CHKERRQ(ierr);
697   ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);CHKERRQ(ierr);
698   ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);CHKERRQ(ierr);
699   ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);CHKERRQ(ierr);
700   ierr = clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);CHKERRQ(ierr);
701   ierr = clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);
702   /* Read data back from device */
703   if (sizeof(PetscReal) != realSize) {
704     switch (ocl->realType) {
705     case PETSC_FLOAT:
706     {
707       float   *elem;
708       PetscInt c, b;
709 
710       ierr = PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);CHKERRQ(ierr);
711       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
712       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
713       for (c = 0; c < Ne; ++c) {
714         for (b = 0; b < N_bt; ++b) {
715           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
716         }
717       }
718       ierr = PetscFree(elem);CHKERRQ(ierr);
719     }
720     break;
721     case PETSC_DOUBLE:
722     {
723       double  *elem;
724       PetscInt c, b;
725 
726       ierr = PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);CHKERRQ(ierr);
727       ierr = PetscMalloc1(Ne*N_bt, &elem);CHKERRQ(ierr);
728       ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);CHKERRQ(ierr);
729       for (c = 0; c < Ne; ++c) {
730         for (b = 0; b < N_bt; ++b) {
731           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
732         }
733       }
734       ierr = PetscFree(elem);CHKERRQ(ierr);
735     }
736     break;
737     default:
738       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
739     }
740   } else {
741     ierr = PetscFree2(r_invJ,r_detJ);CHKERRQ(ierr);
742     ierr = clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);CHKERRQ(ierr);
743   }
744   /* Log performance */
745   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
746   ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);CHKERRQ(ierr);
747   f0Flops = 0;
748   switch (ocl->op) {
749   case LAPLACIAN:
750     f1Flops = useAux ? dim : 0;break;
751   case ELASTICITY:
752     f1Flops = 2*dim*dim;break;
753   }
754   numFlops = Ne*(
755     N_q*(
756       N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
757       /*+
758        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
759       +
760       N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
761     +
762     N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
763   ierr = PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);CHKERRQ(ierr);
764   /* Cleanup */
765   ierr = clReleaseMemObject(o_coefficients);CHKERRQ(ierr);
766   ierr = clReleaseMemObject(o_coefficientsAux);CHKERRQ(ierr);
767   ierr = clReleaseMemObject(o_jacobianInverses);CHKERRQ(ierr);
768   ierr = clReleaseMemObject(o_jacobianDeterminants);CHKERRQ(ierr);
769   ierr = clReleaseMemObject(o_elemVec);CHKERRQ(ierr);
770   ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
771   ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 PETSC_EXTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
776 PETSC_EXTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);
777 
PetscFEInitialize_OpenCL(PetscFE fem)778 static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
779 {
780   PetscFunctionBegin;
781   fem->ops->setfromoptions          = NULL;
782   fem->ops->setup                   = PetscFESetUp_Basic;
783   fem->ops->view                    = NULL;
784   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
785   fem->ops->getdimension            = PetscFEGetDimension_Basic;
786   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
787   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
788   fem->ops->integratebdresidual     = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
789   fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
790   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
791   PetscFunctionReturn(0);
792 }
793 
794 /*MC
795   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation
796 
797   Level: intermediate
798 
799 .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
800 M*/
801 
PetscFECreate_OpenCL(PetscFE fem)802 PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
803 {
804   PetscFE_OpenCL *ocl;
805   cl_uint         num_platforms;
806   cl_platform_id  platform_ids[42];
807   cl_uint         num_devices;
808   cl_device_id    device_ids[42];
809   cl_int          err;
810   PetscErrorCode  ierr;
811 
812   PetscFunctionBegin;
813   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
814   ierr      = PetscNewLog(fem,&ocl);CHKERRQ(ierr);
815   fem->data = ocl;
816 
817   /* Init Platform */
818   ierr = clGetPlatformIDs(42, platform_ids, &num_platforms);CHKERRQ(ierr);
819   if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
820   ocl->pf_id = platform_ids[0];
821   /* Init Device */
822   ierr = clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);CHKERRQ(ierr);
823   if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
824   ocl->dev_id = device_ids[0];
825   /* Create context with one command queue */
826   ocl->ctx_id   = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);CHKERRQ(err);
827   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);CHKERRQ(err);
828   /* Types */
829   ocl->realType = PETSC_FLOAT;
830   /* Register events */
831   ierr = PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);CHKERRQ(ierr);
832   /* Equation handling */
833   ocl->op = LAPLACIAN;
834 
835   ierr = PetscFEInitialize_OpenCL(fem);CHKERRQ(ierr);
836   PetscFunctionReturn(0);
837 }
838 
839 /*@
840   PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator
841 
842   Input Parameters:
843 + fem      - The PetscFE
844 - realType - The scalar type
845 
846   Level: developer
847 
848 .seealso: PetscFEOpenCLGetRealType()
849 @*/
PetscFEOpenCLSetRealType(PetscFE fem,PetscDataType realType)850 PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
851 {
852   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
853 
854   PetscFunctionBegin;
855   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
856   ocl->realType = realType;
857   PetscFunctionReturn(0);
858 }
859 
860 /*@
861   PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator
862 
863   Input Parameter:
864 . fem      - The PetscFE
865 
866   Output Parameter:
867 . realType - The scalar type
868 
869   Level: developer
870 
871 .seealso: PetscFEOpenCLSetRealType()
872 @*/
PetscFEOpenCLGetRealType(PetscFE fem,PetscDataType * realType)873 PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
874 {
875   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
876 
877   PetscFunctionBegin;
878   PetscValidHeaderSpecific(fem, PETSCFE_CLASSID, 1);
879   PetscValidPointer(realType, 2);
880   *realType = ocl->realType;
881   PetscFunctionReturn(0);
882 }
883 
884 #endif /* PETSC_HAVE_OPENCL */
885