1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2016 - 2020 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE.md at
12 // the top level directory of deal.II.
13 //
14 // ---------------------------------------------------------------------
15 
16 #ifndef dealii_cuda_fe_evaluation_h
17 #define dealii_cuda_fe_evaluation_h
18 
19 #include <deal.II/base/config.h>
20 
21 #ifdef DEAL_II_COMPILER_CUDA_AWARE
22 
23 #  include <deal.II/base/tensor.h>
24 #  include <deal.II/base/utilities.h>
25 
26 #  include <deal.II/lac/cuda_atomic.h>
27 #  include <deal.II/lac/cuda_vector.h>
28 
29 #  include <deal.II/matrix_free/cuda_hanging_nodes_internal.h>
30 #  include <deal.II/matrix_free/cuda_matrix_free.h>
31 #  include <deal.II/matrix_free/cuda_matrix_free.templates.h>
32 #  include <deal.II/matrix_free/cuda_tensor_product_kernels.h>
33 
34 DEAL_II_NAMESPACE_OPEN
35 
36 /**
37  * Namespace for the CUDA wrappers
38  */
39 namespace CUDAWrappers
40 {
41   namespace internal
42   {
43     /**
44      * Compute the dof/quad index for a given thread id, dimension, and
45      * number of points in each space dimensions.
46      */
47     template <int dim, int n_points_1d>
48     __device__ inline unsigned int
compute_index()49     compute_index()
50     {
51       return (dim == 1 ?
52                 threadIdx.x % n_points_1d :
53                 dim == 2 ?
54                 threadIdx.x % n_points_1d + n_points_1d * threadIdx.y :
55                 threadIdx.x % n_points_1d +
56                     n_points_1d * (threadIdx.y + n_points_1d * threadIdx.z));
57     }
58   } // namespace internal
59 
60   /**
61    * This class provides all the functions necessary to evaluate functions at
62    * quadrature points and cell integrations. In functionality, this class is
63    * similar to FEValues<dim>.
64    *
65    * This class has five template arguments:
66    *
67    * @tparam dim Dimension in which this class is to be used
68    *
69    * @tparam fe_degree Degree of the tensor prodict finite element with fe_degree+1
70    * degrees of freedom per coordinate direction
71    *
72    * @tparam n_q_points_1d Number of points in the quadrature formular in 1D,
73    * defaults to fe_degree+1
74    *
75    * @tparam n_components Number of vector components when solving a system of
76    * PDEs. If the same operation is applied to several components of a PDE (e.g.
77    * a vector Laplace equation), they can be applied simultaneously with one
78    * call (and often more efficiently). Defaults to 1
79    *
80    * @tparam Number Number format, @p double or @p float. Defaults to @p
81    * double.
82    *
83    * @ingroup CUDAWrappers
84    */
85   template <int dim,
86             int fe_degree,
87             int n_q_points_1d = fe_degree + 1,
88             int n_components_ = 1,
89             typename Number   = double>
90   class FEEvaluation
91   {
92   public:
93     /**
94      * An alias for scalar quantities.
95      */
96     using value_type = Number;
97 
98     /**
99      * An alias for vectorial quantities.
100      */
101     using gradient_type = Tensor<1, dim, Number>;
102 
103     /**
104      * An alias to kernel specific information.
105      */
106     using data_type = typename MatrixFree<dim, Number>::Data;
107 
108     /**
109      * Dimension.
110      */
111     static constexpr unsigned int dimension = dim;
112 
113     /**
114      * Number of components.
115      */
116     static constexpr unsigned int n_components = n_components_;
117 
118     /**
119      * Number of quadrature points per cell.
120      */
121     static constexpr unsigned int n_q_points =
122       Utilities::pow(n_q_points_1d, dim);
123 
124     /**
125      * Number of tensor degrees of freedoms per cell.
126      */
127     static constexpr unsigned int tensor_dofs_per_cell =
128       Utilities::pow(fe_degree + 1, dim);
129 
130     /**
131      * Constructor.
132      */
133     __device__
134     FEEvaluation(const unsigned int       cell_id,
135                  const data_type *        data,
136                  SharedData<dim, Number> *shdata);
137 
138     /**
139      * For the vector @p src, read out the values on the degrees of freedom of
140      * the current cell, and store them internally. Similar functionality as
141      * the function DoFAccessor::get_interpolated_dof_values when no
142      * constraints are present, but it also includes constraints from hanging
143      * nodes, so once can see it as a similar function to
144      * AffineConstraints::read_dof_valuess as well.
145      */
146     __device__ void
147     read_dof_values(const Number *src);
148 
149     /**
150      * Take the value stored internally on dof values of the current cell and
151      * sum them into the vector @p dst. The function also applies constraints
152      * during the write operation. The functionality is hence similar to the
153      * function AffineConstraints::distribute_local_to_global.
154      */
155     __device__ void
156     distribute_local_to_global(Number *dst) const;
157 
158     /**
159      * Evaluate the function values and the gradients of the FE function given
160      * at the DoF values in the input vector at the quadrature points on the
161      * unit cell. The function arguments specify which parts shall actually be
162      * computed. This function needs to be called before the functions
163      * @p get_value() or @p get_gradient() give useful information.
164      */
165     __device__ void
166     evaluate(const bool evaluate_val, const bool evaluate_grad);
167 
168     /**
169      * This function takes the values and/or gradients that are stored on
170      * quadrature points, tests them by all the basis functions/gradients on
171      * the cell and performs the cell integration. The two function arguments
172      * @p integrate_val and @p integrate_grad are used to enable/disable some
173      * of the values or the gradients.
174      */
175     __device__ void
176     integrate(const bool integrate_val, const bool integrate_grad);
177 
178     /**
179      * Return the value of a finite element function at quadrature point
180      * number @p q_point after a call to @p evaluate(true,...).
181      *
182      * @deprecated Use the version without parameters instead.
183      */
184     DEAL_II_DEPRECATED __device__ value_type
185                                   get_value(const unsigned int q_point) const;
186 
187     /**
188      * Same as above, except that the quadrature point is computed from thread
189      * id.
190      */
191     __device__ value_type
192                get_value() const;
193 
194     /**
195      * Return the value of a finite element function at degree of freedom
196      * @p dof after a call to integrate() or before a call to evaluate().
197      *
198      * @deprecated Use the version without parameters instead.
199      */
200     DEAL_II_DEPRECATED __device__ value_type
201                                   get_dof_value(const unsigned int dof) const;
202 
203     /**
204      * Same as above, except that the local dof index is computed from the
205      * thread id.
206      */
207     __device__ value_type
208                get_dof_value() const;
209 
210     /**
211      * Write a value to the field containing the values on quadrature points
212      * with component @p q_point. Access to the same fields as through @p
213      * get_value(). This specifies the value which is tested by all basis
214      * function on the current cell and integrated over.
215      *
216      * @deprecated Use the version without parameters instead.
217      */
218     DEAL_II_DEPRECATED __device__ void
219                        submit_value(const value_type &val_in, const unsigned int q_point);
220 
221     /**
222      * Same as above, except that the quadrature point is computed from the
223      * thread id.
224      */
225     __device__ void
226     submit_value(const value_type &val_in);
227 
228     /**
229      * Write a value to the field containing the values for the degree of
230      * freedom with index @p dof after a call to integrate() or before
231      * calling evaluate(). Access through the same fields as through
232      * get_dof_value().
233      *
234      * @deprecated Use the version without parameters instead.
235      */
236     DEAL_II_DEPRECATED __device__ void
237                        submit_dof_value(const value_type &val_in, const unsigned int dof);
238 
239     /**
240      * Same as above, except that the local dof index is computed from the
241      * thread id.
242      */
243     __device__ void
244     submit_dof_value(const value_type &val_in);
245 
246     /**
247      * Return the gradient of a finite element function at quadrature point
248      * number @p q_point after a call to @p evaluate(...,true).
249      *
250      * @deprecated Use the version without parameters instead.
251      */
252     DEAL_II_DEPRECATED __device__ gradient_type
253                                   get_gradient(const unsigned int q_point) const;
254 
255     /**
256      * Same as above, except that the quadrature point is computed from the
257      * thread id.
258      */
259     __device__ gradient_type
260                get_gradient() const;
261 
262     /**
263      * Write a contribution that is tested by the gradient to the field
264      * containing the values on quadrature points with component @p q_point.
265      *
266      * @deprecated Use the version without parameters instead.
267      */
268     DEAL_II_DEPRECATED __device__ void
269                        submit_gradient(const gradient_type &grad_in, const unsigned int q_point);
270 
271 
272     /**
273      * Same as above, except that the quadrature point is computed from the
274      * thread id.
275      */
276     __device__ void
277     submit_gradient(const gradient_type &grad_in);
278 
279     // clang-format off
280     /**
281      * Apply the functor @p func on every quadrature point.
282      *
283      * @p func needs to define
284      * \code
285      * __device__ void operator()(
286      *   CUDAWrappers::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval,
287      *   const unsigned int                                                               q_point) const;
288      * \endcode
289      *
290      * @deprecated Use apply_for_each_quad_point() instead.
291      */
292     // clang-format on
293     template <typename Functor>
294     DEAL_II_DEPRECATED __device__ void
295                        apply_quad_point_operations(const Functor &func);
296 
297     // clang-format off
298     /**
299      * Same as above, except that the functor @p func only takes a single input
300      * argument (fe_eval) and computes the quadrature point from the thread id.
301      *
302      * @p func needs to define
303      * \code
304      * __device__ void operator()(
305      *   CUDAWrappers::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval) const;
306      * \endcode
307      */
308     // clang-format on
309     template <typename Functor>
310     __device__ void
311     apply_for_each_quad_point(const Functor &func);
312 
313   private:
314     types::global_dof_index *local_to_global;
315     unsigned int             n_cells;
316     unsigned int             padding_length;
317     const unsigned int       mf_object_id;
318 
319     const unsigned int constraint_mask;
320 
321     const bool use_coloring;
322 
323     Number *inv_jac;
324     Number *JxW;
325 
326     // Internal buffer
327     Number *values;
328     Number *gradients[dim];
329   };
330 
331 
332 
333   template <int dim,
334             int fe_degree,
335             int n_q_points_1d,
336             int n_components_,
337             typename Number>
338   __device__
339   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
FEEvaluation(const unsigned int cell_id,const data_type * data,SharedData<dim,Number> * shdata)340     FEEvaluation(const unsigned int       cell_id,
341                  const data_type *        data,
342                  SharedData<dim, Number> *shdata)
343     : n_cells(data->n_cells)
344     , padding_length(data->padding_length)
345     , mf_object_id(data->id)
346     , constraint_mask(data->constraint_mask[cell_id])
347     , use_coloring(data->use_coloring)
348     , values(shdata->values)
349   {
350     local_to_global = data->local_to_global + padding_length * cell_id;
351     inv_jac         = data->inv_jacobian + padding_length * cell_id;
352     JxW             = data->JxW + padding_length * cell_id;
353 
354     for (unsigned int i = 0; i < dim; ++i)
355       gradients[i] = shdata->gradients[i];
356   }
357 
358 
359 
360   template <int dim,
361             int fe_degree,
362             int n_q_points_1d,
363             int n_components_,
364             typename Number>
365   __device__ void
366   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
read_dof_values(const Number * src)367     read_dof_values(const Number *src)
368   {
369     static_assert(n_components_ == 1, "This function only supports FE with one \
370                   components");
371     const unsigned int idx = internal::compute_index<dim, n_q_points_1d>();
372 
373     const types::global_dof_index src_idx = local_to_global[idx];
374     // Use the read-only data cache.
375     values[idx] = __ldg(&src[src_idx]);
376 
377     __syncthreads();
378 
379     internal::resolve_hanging_nodes<dim, fe_degree, false>(constraint_mask,
380                                                            values);
381   }
382 
383 
384 
385   template <int dim,
386             int fe_degree,
387             int n_q_points_1d,
388             int n_components_,
389             typename Number>
390   __device__ void
391   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
distribute_local_to_global(Number * dst)392     distribute_local_to_global(Number *dst) const
393   {
394     static_assert(n_components_ == 1, "This function only supports FE with one \
395                   components");
396     internal::resolve_hanging_nodes<dim, fe_degree, true>(constraint_mask,
397                                                           values);
398 
399     const unsigned int idx = internal::compute_index<dim, n_q_points_1d>();
400 
401     const types::global_dof_index destination_idx = local_to_global[idx];
402 
403     if (use_coloring)
404       dst[destination_idx] += values[idx];
405     else
406       atomicAdd(&dst[destination_idx], values[idx]);
407   }
408 
409 
410 
411   template <int dim,
412             int fe_degree,
413             int n_q_points_1d,
414             int n_components_,
415             typename Number>
416   __device__ void
evaluate(const bool evaluate_val,const bool evaluate_grad)417   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::evaluate(
418     const bool evaluate_val,
419     const bool evaluate_grad)
420   {
421     // First evaluate the gradients because it requires values that will be
422     // changed if evaluate_val is true
423     internal::EvaluatorTensorProduct<
424       internal::EvaluatorVariant::evaluate_general,
425       dim,
426       fe_degree,
427       n_q_points_1d,
428       Number>
429       evaluator_tensor_product(mf_object_id);
430     if (evaluate_val == true && evaluate_grad == true)
431       {
432         evaluator_tensor_product.value_and_gradient_at_quad_pts(values,
433                                                                 gradients);
434         __syncthreads();
435       }
436     else if (evaluate_grad == true)
437       {
438         evaluator_tensor_product.gradient_at_quad_pts(values, gradients);
439         __syncthreads();
440       }
441     else if (evaluate_val == true)
442       {
443         evaluator_tensor_product.value_at_quad_pts(values);
444         __syncthreads();
445       }
446   }
447 
448 
449 
450   template <int dim,
451             int fe_degree,
452             int n_q_points_1d,
453             int n_components_,
454             typename Number>
455   __device__ void
integrate(const bool integrate_val,const bool integrate_grad)456   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::integrate(
457     const bool integrate_val,
458     const bool integrate_grad)
459   {
460     internal::EvaluatorTensorProduct<
461       internal::EvaluatorVariant::evaluate_general,
462       dim,
463       fe_degree,
464       n_q_points_1d,
465       Number>
466       evaluator_tensor_product(mf_object_id);
467     if (integrate_val == true && integrate_grad == true)
468       {
469         evaluator_tensor_product.integrate_value_and_gradient(values,
470                                                               gradients);
471       }
472     else if (integrate_val == true)
473       {
474         evaluator_tensor_product.integrate_value(values);
475         __syncthreads();
476       }
477     else if (integrate_grad == true)
478       {
479         evaluator_tensor_product.integrate_gradient<false>(values, gradients);
480         __syncthreads();
481       }
482   }
483 
484 
485 
486   template <int dim,
487             int fe_degree,
488             int n_q_points_1d,
489             int n_components_,
490             typename Number>
491   __device__ typename FEEvaluation<dim,
492                                    fe_degree,
493                                    n_q_points_1d,
494                                    n_components_,
495                                    Number>::value_type
get_value(const unsigned int q_point)496   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::get_value(
497     const unsigned int q_point) const
498   {
499     return values[q_point];
500   }
501 
502 
503 
504   template <int dim,
505             int fe_degree,
506             int n_q_points_1d,
507             int n_components_,
508             typename Number>
509   __device__ typename FEEvaluation<dim,
510                                    fe_degree,
511                                    n_q_points_1d,
512                                    n_components_,
513                                    Number>::value_type
514   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
get_value()515     get_value() const
516   {
517     const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
518     return values[q_point];
519   }
520 
521 
522 
523   template <int dim,
524             int fe_degree,
525             int n_q_points_1d,
526             int n_components_,
527             typename Number>
528   __device__ typename FEEvaluation<dim,
529                                    fe_degree,
530                                    n_q_points_1d,
531                                    n_components_,
532                                    Number>::value_type
533   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
get_dof_value(const unsigned int dof)534     get_dof_value(const unsigned int dof) const
535   {
536     return values[dof];
537   }
538 
539 
540 
541   template <int dim,
542             int fe_degree,
543             int n_q_points_1d,
544             int n_components_,
545             typename Number>
546   __device__ typename FEEvaluation<dim,
547                                    fe_degree,
548                                    n_q_points_1d,
549                                    n_components_,
550                                    Number>::value_type
551   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
get_dof_value()552     get_dof_value() const
553   {
554     const unsigned int dof = internal::compute_index<dim, fe_degree + 1>();
555     return values[dof];
556   }
557 
558 
559 
560   template <int dim,
561             int fe_degree,
562             int n_q_points_1d,
563             int n_components_,
564             typename Number>
565   __device__ void
566   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_value(const value_type & val_in,const unsigned int q_point)567     submit_value(const value_type &val_in, const unsigned int q_point)
568   {
569     values[q_point] = val_in * JxW[q_point];
570   }
571 
572 
573 
574   template <int dim,
575             int fe_degree,
576             int n_q_points_1d,
577             int n_components_,
578             typename Number>
579   __device__ void
580   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_value(const value_type & val_in)581     submit_value(const value_type &val_in)
582   {
583     const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
584     values[q_point]            = val_in * JxW[q_point];
585   }
586 
587 
588 
589   template <int dim,
590             int fe_degree,
591             int n_q_points_1d,
592             int n_components_,
593             typename Number>
594   __device__ void
595   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_dof_value(const value_type & val_in,const unsigned int dof)596     submit_dof_value(const value_type &val_in, const unsigned int dof)
597   {
598     values[dof] = val_in;
599   }
600 
601 
602 
603   template <int dim,
604             int fe_degree,
605             int n_q_points_1d,
606             int n_components_,
607             typename Number>
608   __device__ void
609   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_dof_value(const value_type & val_in)610     submit_dof_value(const value_type &val_in)
611   {
612     const unsigned int dof = internal::compute_index<dim, fe_degree + 1>();
613     values[dof]            = val_in;
614   }
615 
616 
617 
618   template <int dim,
619             int fe_degree,
620             int n_q_points_1d,
621             int n_components_,
622             typename Number>
623   __device__ typename FEEvaluation<dim,
624                                    fe_degree,
625                                    n_q_points_1d,
626                                    n_components_,
627                                    Number>::gradient_type
628   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
get_gradient(const unsigned int q_point)629     get_gradient(const unsigned int q_point) const
630   {
631     static_assert(n_components_ == 1, "This function only supports FE with one \
632                   components");
633     // TODO optimize if the mesh is uniform
634     const Number *inv_jacobian = &inv_jac[q_point];
635     gradient_type grad;
636     for (int d_1 = 0; d_1 < dim; ++d_1)
637       {
638         Number tmp = 0.;
639         for (int d_2 = 0; d_2 < dim; ++d_2)
640           tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] *
641                  gradients[d_2][q_point];
642         grad[d_1] = tmp;
643       }
644 
645     return grad;
646   }
647 
648 
649 
650   template <int dim,
651             int fe_degree,
652             int n_q_points_1d,
653             int n_components_,
654             typename Number>
655   __device__ typename FEEvaluation<dim,
656                                    fe_degree,
657                                    n_q_points_1d,
658                                    n_components_,
659                                    Number>::gradient_type
660   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
get_gradient()661     get_gradient() const
662   {
663     static_assert(n_components_ == 1, "This function only supports FE with one \
664                   components");
665 
666     // TODO optimize if the mesh is uniform
667     const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
668     const Number *     inv_jacobian = &inv_jac[q_point];
669     gradient_type      grad;
670     for (int d_1 = 0; d_1 < dim; ++d_1)
671       {
672         Number tmp = 0.;
673         for (int d_2 = 0; d_2 < dim; ++d_2)
674           tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] *
675                  gradients[d_2][q_point];
676         grad[d_1] = tmp;
677       }
678 
679     return grad;
680   }
681 
682 
683 
684   template <int dim,
685             int fe_degree,
686             int n_q_points_1d,
687             int n_components_,
688             typename Number>
689   __device__ void
690   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_gradient(const gradient_type & grad_in,const unsigned int q_point)691     submit_gradient(const gradient_type &grad_in, const unsigned int q_point)
692   {
693     // TODO optimize if the mesh is uniform
694     const Number *inv_jacobian = &inv_jac[q_point];
695     for (int d_1 = 0; d_1 < dim; ++d_1)
696       {
697         Number tmp = 0.;
698         for (int d_2 = 0; d_2 < dim; ++d_2)
699           tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] *
700                  grad_in[d_2];
701         gradients[d_1][q_point] = tmp * JxW[q_point];
702       }
703   }
704 
705 
706 
707   template <int dim,
708             int fe_degree,
709             int n_q_points_1d,
710             int n_components_,
711             typename Number>
712   __device__ void
713   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
submit_gradient(const gradient_type & grad_in)714     submit_gradient(const gradient_type &grad_in)
715   {
716     // TODO optimize if the mesh is uniform
717     const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>();
718     const Number *     inv_jacobian = &inv_jac[q_point];
719     for (int d_1 = 0; d_1 < dim; ++d_1)
720       {
721         Number tmp = 0.;
722         for (int d_2 = 0; d_2 < dim; ++d_2)
723           tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] *
724                  grad_in[d_2];
725         gradients[d_1][q_point] = tmp * JxW[q_point];
726       }
727   }
728 
729 
730 
731   template <int dim,
732             int fe_degree,
733             int n_q_points_1d,
734             int n_components_,
735             typename Number>
736   template <typename Functor>
737   __device__ void
738   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
apply_quad_point_operations(const Functor & func)739     apply_quad_point_operations(const Functor &func)
740   {
741     func(this, internal::compute_index<dim, n_q_points_1d>());
742 
743     __syncthreads();
744   }
745 
746 
747 
748   template <int dim,
749             int fe_degree,
750             int n_q_points_1d,
751             int n_components_,
752             typename Number>
753   template <typename Functor>
754   __device__ void
755   FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
apply_for_each_quad_point(const Functor & func)756     apply_for_each_quad_point(const Functor &func)
757   {
758     func(this);
759 
760     __syncthreads();
761   }
762 } // namespace CUDAWrappers
763 
764 DEAL_II_NAMESPACE_CLOSE
765 
766 #endif
767 
768 #endif
769