1 // --------------------------------------------------------------------- 2 // 3 // Copyright (C) 2016 - 2020 by the deal.II authors 4 // 5 // This file is part of the deal.II library. 6 // 7 // The deal.II library is free software; you can use it, redistribute 8 // it, and/or modify it under the terms of the GNU Lesser General 9 // Public License as published by the Free Software Foundation; either 10 // version 2.1 of the License, or (at your option) any later version. 11 // The full text of the license can be found in the file LICENSE.md at 12 // the top level directory of deal.II. 13 // 14 // --------------------------------------------------------------------- 15 16 #ifndef dealii_cuda_fe_evaluation_h 17 #define dealii_cuda_fe_evaluation_h 18 19 #include <deal.II/base/config.h> 20 21 #ifdef DEAL_II_COMPILER_CUDA_AWARE 22 23 # include <deal.II/base/tensor.h> 24 # include <deal.II/base/utilities.h> 25 26 # include <deal.II/lac/cuda_atomic.h> 27 # include <deal.II/lac/cuda_vector.h> 28 29 # include <deal.II/matrix_free/cuda_hanging_nodes_internal.h> 30 # include <deal.II/matrix_free/cuda_matrix_free.h> 31 # include <deal.II/matrix_free/cuda_matrix_free.templates.h> 32 # include <deal.II/matrix_free/cuda_tensor_product_kernels.h> 33 34 DEAL_II_NAMESPACE_OPEN 35 36 /** 37 * Namespace for the CUDA wrappers 38 */ 39 namespace CUDAWrappers 40 { 41 namespace internal 42 { 43 /** 44 * Compute the dof/quad index for a given thread id, dimension, and 45 * number of points in each space dimensions. 46 */ 47 template <int dim, int n_points_1d> 48 __device__ inline unsigned int compute_index()49 compute_index() 50 { 51 return (dim == 1 ? 52 threadIdx.x % n_points_1d : 53 dim == 2 ? 54 threadIdx.x % n_points_1d + n_points_1d * threadIdx.y : 55 threadIdx.x % n_points_1d + 56 n_points_1d * (threadIdx.y + n_points_1d * threadIdx.z)); 57 } 58 } // namespace internal 59 60 /** 61 * This class provides all the functions necessary to evaluate functions at 62 * quadrature points and cell integrations. In functionality, this class is 63 * similar to FEValues<dim>. 64 * 65 * This class has five template arguments: 66 * 67 * @tparam dim Dimension in which this class is to be used 68 * 69 * @tparam fe_degree Degree of the tensor prodict finite element with fe_degree+1 70 * degrees of freedom per coordinate direction 71 * 72 * @tparam n_q_points_1d Number of points in the quadrature formular in 1D, 73 * defaults to fe_degree+1 74 * 75 * @tparam n_components Number of vector components when solving a system of 76 * PDEs. If the same operation is applied to several components of a PDE (e.g. 77 * a vector Laplace equation), they can be applied simultaneously with one 78 * call (and often more efficiently). Defaults to 1 79 * 80 * @tparam Number Number format, @p double or @p float. Defaults to @p 81 * double. 82 * 83 * @ingroup CUDAWrappers 84 */ 85 template <int dim, 86 int fe_degree, 87 int n_q_points_1d = fe_degree + 1, 88 int n_components_ = 1, 89 typename Number = double> 90 class FEEvaluation 91 { 92 public: 93 /** 94 * An alias for scalar quantities. 95 */ 96 using value_type = Number; 97 98 /** 99 * An alias for vectorial quantities. 100 */ 101 using gradient_type = Tensor<1, dim, Number>; 102 103 /** 104 * An alias to kernel specific information. 105 */ 106 using data_type = typename MatrixFree<dim, Number>::Data; 107 108 /** 109 * Dimension. 110 */ 111 static constexpr unsigned int dimension = dim; 112 113 /** 114 * Number of components. 115 */ 116 static constexpr unsigned int n_components = n_components_; 117 118 /** 119 * Number of quadrature points per cell. 120 */ 121 static constexpr unsigned int n_q_points = 122 Utilities::pow(n_q_points_1d, dim); 123 124 /** 125 * Number of tensor degrees of freedoms per cell. 126 */ 127 static constexpr unsigned int tensor_dofs_per_cell = 128 Utilities::pow(fe_degree + 1, dim); 129 130 /** 131 * Constructor. 132 */ 133 __device__ 134 FEEvaluation(const unsigned int cell_id, 135 const data_type * data, 136 SharedData<dim, Number> *shdata); 137 138 /** 139 * For the vector @p src, read out the values on the degrees of freedom of 140 * the current cell, and store them internally. Similar functionality as 141 * the function DoFAccessor::get_interpolated_dof_values when no 142 * constraints are present, but it also includes constraints from hanging 143 * nodes, so once can see it as a similar function to 144 * AffineConstraints::read_dof_valuess as well. 145 */ 146 __device__ void 147 read_dof_values(const Number *src); 148 149 /** 150 * Take the value stored internally on dof values of the current cell and 151 * sum them into the vector @p dst. The function also applies constraints 152 * during the write operation. The functionality is hence similar to the 153 * function AffineConstraints::distribute_local_to_global. 154 */ 155 __device__ void 156 distribute_local_to_global(Number *dst) const; 157 158 /** 159 * Evaluate the function values and the gradients of the FE function given 160 * at the DoF values in the input vector at the quadrature points on the 161 * unit cell. The function arguments specify which parts shall actually be 162 * computed. This function needs to be called before the functions 163 * @p get_value() or @p get_gradient() give useful information. 164 */ 165 __device__ void 166 evaluate(const bool evaluate_val, const bool evaluate_grad); 167 168 /** 169 * This function takes the values and/or gradients that are stored on 170 * quadrature points, tests them by all the basis functions/gradients on 171 * the cell and performs the cell integration. The two function arguments 172 * @p integrate_val and @p integrate_grad are used to enable/disable some 173 * of the values or the gradients. 174 */ 175 __device__ void 176 integrate(const bool integrate_val, const bool integrate_grad); 177 178 /** 179 * Return the value of a finite element function at quadrature point 180 * number @p q_point after a call to @p evaluate(true,...). 181 * 182 * @deprecated Use the version without parameters instead. 183 */ 184 DEAL_II_DEPRECATED __device__ value_type 185 get_value(const unsigned int q_point) const; 186 187 /** 188 * Same as above, except that the quadrature point is computed from thread 189 * id. 190 */ 191 __device__ value_type 192 get_value() const; 193 194 /** 195 * Return the value of a finite element function at degree of freedom 196 * @p dof after a call to integrate() or before a call to evaluate(). 197 * 198 * @deprecated Use the version without parameters instead. 199 */ 200 DEAL_II_DEPRECATED __device__ value_type 201 get_dof_value(const unsigned int dof) const; 202 203 /** 204 * Same as above, except that the local dof index is computed from the 205 * thread id. 206 */ 207 __device__ value_type 208 get_dof_value() const; 209 210 /** 211 * Write a value to the field containing the values on quadrature points 212 * with component @p q_point. Access to the same fields as through @p 213 * get_value(). This specifies the value which is tested by all basis 214 * function on the current cell and integrated over. 215 * 216 * @deprecated Use the version without parameters instead. 217 */ 218 DEAL_II_DEPRECATED __device__ void 219 submit_value(const value_type &val_in, const unsigned int q_point); 220 221 /** 222 * Same as above, except that the quadrature point is computed from the 223 * thread id. 224 */ 225 __device__ void 226 submit_value(const value_type &val_in); 227 228 /** 229 * Write a value to the field containing the values for the degree of 230 * freedom with index @p dof after a call to integrate() or before 231 * calling evaluate(). Access through the same fields as through 232 * get_dof_value(). 233 * 234 * @deprecated Use the version without parameters instead. 235 */ 236 DEAL_II_DEPRECATED __device__ void 237 submit_dof_value(const value_type &val_in, const unsigned int dof); 238 239 /** 240 * Same as above, except that the local dof index is computed from the 241 * thread id. 242 */ 243 __device__ void 244 submit_dof_value(const value_type &val_in); 245 246 /** 247 * Return the gradient of a finite element function at quadrature point 248 * number @p q_point after a call to @p evaluate(...,true). 249 * 250 * @deprecated Use the version without parameters instead. 251 */ 252 DEAL_II_DEPRECATED __device__ gradient_type 253 get_gradient(const unsigned int q_point) const; 254 255 /** 256 * Same as above, except that the quadrature point is computed from the 257 * thread id. 258 */ 259 __device__ gradient_type 260 get_gradient() const; 261 262 /** 263 * Write a contribution that is tested by the gradient to the field 264 * containing the values on quadrature points with component @p q_point. 265 * 266 * @deprecated Use the version without parameters instead. 267 */ 268 DEAL_II_DEPRECATED __device__ void 269 submit_gradient(const gradient_type &grad_in, const unsigned int q_point); 270 271 272 /** 273 * Same as above, except that the quadrature point is computed from the 274 * thread id. 275 */ 276 __device__ void 277 submit_gradient(const gradient_type &grad_in); 278 279 // clang-format off 280 /** 281 * Apply the functor @p func on every quadrature point. 282 * 283 * @p func needs to define 284 * \code 285 * __device__ void operator()( 286 * CUDAWrappers::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval, 287 * const unsigned int q_point) const; 288 * \endcode 289 * 290 * @deprecated Use apply_for_each_quad_point() instead. 291 */ 292 // clang-format on 293 template <typename Functor> 294 DEAL_II_DEPRECATED __device__ void 295 apply_quad_point_operations(const Functor &func); 296 297 // clang-format off 298 /** 299 * Same as above, except that the functor @p func only takes a single input 300 * argument (fe_eval) and computes the quadrature point from the thread id. 301 * 302 * @p func needs to define 303 * \code 304 * __device__ void operator()( 305 * CUDAWrappers::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval) const; 306 * \endcode 307 */ 308 // clang-format on 309 template <typename Functor> 310 __device__ void 311 apply_for_each_quad_point(const Functor &func); 312 313 private: 314 types::global_dof_index *local_to_global; 315 unsigned int n_cells; 316 unsigned int padding_length; 317 const unsigned int mf_object_id; 318 319 const unsigned int constraint_mask; 320 321 const bool use_coloring; 322 323 Number *inv_jac; 324 Number *JxW; 325 326 // Internal buffer 327 Number *values; 328 Number *gradients[dim]; 329 }; 330 331 332 333 template <int dim, 334 int fe_degree, 335 int n_q_points_1d, 336 int n_components_, 337 typename Number> 338 __device__ 339 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: FEEvaluation(const unsigned int cell_id,const data_type * data,SharedData<dim,Number> * shdata)340 FEEvaluation(const unsigned int cell_id, 341 const data_type * data, 342 SharedData<dim, Number> *shdata) 343 : n_cells(data->n_cells) 344 , padding_length(data->padding_length) 345 , mf_object_id(data->id) 346 , constraint_mask(data->constraint_mask[cell_id]) 347 , use_coloring(data->use_coloring) 348 , values(shdata->values) 349 { 350 local_to_global = data->local_to_global + padding_length * cell_id; 351 inv_jac = data->inv_jacobian + padding_length * cell_id; 352 JxW = data->JxW + padding_length * cell_id; 353 354 for (unsigned int i = 0; i < dim; ++i) 355 gradients[i] = shdata->gradients[i]; 356 } 357 358 359 360 template <int dim, 361 int fe_degree, 362 int n_q_points_1d, 363 int n_components_, 364 typename Number> 365 __device__ void 366 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: read_dof_values(const Number * src)367 read_dof_values(const Number *src) 368 { 369 static_assert(n_components_ == 1, "This function only supports FE with one \ 370 components"); 371 const unsigned int idx = internal::compute_index<dim, n_q_points_1d>(); 372 373 const types::global_dof_index src_idx = local_to_global[idx]; 374 // Use the read-only data cache. 375 values[idx] = __ldg(&src[src_idx]); 376 377 __syncthreads(); 378 379 internal::resolve_hanging_nodes<dim, fe_degree, false>(constraint_mask, 380 values); 381 } 382 383 384 385 template <int dim, 386 int fe_degree, 387 int n_q_points_1d, 388 int n_components_, 389 typename Number> 390 __device__ void 391 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: distribute_local_to_global(Number * dst)392 distribute_local_to_global(Number *dst) const 393 { 394 static_assert(n_components_ == 1, "This function only supports FE with one \ 395 components"); 396 internal::resolve_hanging_nodes<dim, fe_degree, true>(constraint_mask, 397 values); 398 399 const unsigned int idx = internal::compute_index<dim, n_q_points_1d>(); 400 401 const types::global_dof_index destination_idx = local_to_global[idx]; 402 403 if (use_coloring) 404 dst[destination_idx] += values[idx]; 405 else 406 atomicAdd(&dst[destination_idx], values[idx]); 407 } 408 409 410 411 template <int dim, 412 int fe_degree, 413 int n_q_points_1d, 414 int n_components_, 415 typename Number> 416 __device__ void evaluate(const bool evaluate_val,const bool evaluate_grad)417 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::evaluate( 418 const bool evaluate_val, 419 const bool evaluate_grad) 420 { 421 // First evaluate the gradients because it requires values that will be 422 // changed if evaluate_val is true 423 internal::EvaluatorTensorProduct< 424 internal::EvaluatorVariant::evaluate_general, 425 dim, 426 fe_degree, 427 n_q_points_1d, 428 Number> 429 evaluator_tensor_product(mf_object_id); 430 if (evaluate_val == true && evaluate_grad == true) 431 { 432 evaluator_tensor_product.value_and_gradient_at_quad_pts(values, 433 gradients); 434 __syncthreads(); 435 } 436 else if (evaluate_grad == true) 437 { 438 evaluator_tensor_product.gradient_at_quad_pts(values, gradients); 439 __syncthreads(); 440 } 441 else if (evaluate_val == true) 442 { 443 evaluator_tensor_product.value_at_quad_pts(values); 444 __syncthreads(); 445 } 446 } 447 448 449 450 template <int dim, 451 int fe_degree, 452 int n_q_points_1d, 453 int n_components_, 454 typename Number> 455 __device__ void integrate(const bool integrate_val,const bool integrate_grad)456 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::integrate( 457 const bool integrate_val, 458 const bool integrate_grad) 459 { 460 internal::EvaluatorTensorProduct< 461 internal::EvaluatorVariant::evaluate_general, 462 dim, 463 fe_degree, 464 n_q_points_1d, 465 Number> 466 evaluator_tensor_product(mf_object_id); 467 if (integrate_val == true && integrate_grad == true) 468 { 469 evaluator_tensor_product.integrate_value_and_gradient(values, 470 gradients); 471 } 472 else if (integrate_val == true) 473 { 474 evaluator_tensor_product.integrate_value(values); 475 __syncthreads(); 476 } 477 else if (integrate_grad == true) 478 { 479 evaluator_tensor_product.integrate_gradient<false>(values, gradients); 480 __syncthreads(); 481 } 482 } 483 484 485 486 template <int dim, 487 int fe_degree, 488 int n_q_points_1d, 489 int n_components_, 490 typename Number> 491 __device__ typename FEEvaluation<dim, 492 fe_degree, 493 n_q_points_1d, 494 n_components_, 495 Number>::value_type get_value(const unsigned int q_point)496 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::get_value( 497 const unsigned int q_point) const 498 { 499 return values[q_point]; 500 } 501 502 503 504 template <int dim, 505 int fe_degree, 506 int n_q_points_1d, 507 int n_components_, 508 typename Number> 509 __device__ typename FEEvaluation<dim, 510 fe_degree, 511 n_q_points_1d, 512 n_components_, 513 Number>::value_type 514 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: get_value()515 get_value() const 516 { 517 const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>(); 518 return values[q_point]; 519 } 520 521 522 523 template <int dim, 524 int fe_degree, 525 int n_q_points_1d, 526 int n_components_, 527 typename Number> 528 __device__ typename FEEvaluation<dim, 529 fe_degree, 530 n_q_points_1d, 531 n_components_, 532 Number>::value_type 533 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: get_dof_value(const unsigned int dof)534 get_dof_value(const unsigned int dof) const 535 { 536 return values[dof]; 537 } 538 539 540 541 template <int dim, 542 int fe_degree, 543 int n_q_points_1d, 544 int n_components_, 545 typename Number> 546 __device__ typename FEEvaluation<dim, 547 fe_degree, 548 n_q_points_1d, 549 n_components_, 550 Number>::value_type 551 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: get_dof_value()552 get_dof_value() const 553 { 554 const unsigned int dof = internal::compute_index<dim, fe_degree + 1>(); 555 return values[dof]; 556 } 557 558 559 560 template <int dim, 561 int fe_degree, 562 int n_q_points_1d, 563 int n_components_, 564 typename Number> 565 __device__ void 566 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_value(const value_type & val_in,const unsigned int q_point)567 submit_value(const value_type &val_in, const unsigned int q_point) 568 { 569 values[q_point] = val_in * JxW[q_point]; 570 } 571 572 573 574 template <int dim, 575 int fe_degree, 576 int n_q_points_1d, 577 int n_components_, 578 typename Number> 579 __device__ void 580 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_value(const value_type & val_in)581 submit_value(const value_type &val_in) 582 { 583 const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>(); 584 values[q_point] = val_in * JxW[q_point]; 585 } 586 587 588 589 template <int dim, 590 int fe_degree, 591 int n_q_points_1d, 592 int n_components_, 593 typename Number> 594 __device__ void 595 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_dof_value(const value_type & val_in,const unsigned int dof)596 submit_dof_value(const value_type &val_in, const unsigned int dof) 597 { 598 values[dof] = val_in; 599 } 600 601 602 603 template <int dim, 604 int fe_degree, 605 int n_q_points_1d, 606 int n_components_, 607 typename Number> 608 __device__ void 609 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_dof_value(const value_type & val_in)610 submit_dof_value(const value_type &val_in) 611 { 612 const unsigned int dof = internal::compute_index<dim, fe_degree + 1>(); 613 values[dof] = val_in; 614 } 615 616 617 618 template <int dim, 619 int fe_degree, 620 int n_q_points_1d, 621 int n_components_, 622 typename Number> 623 __device__ typename FEEvaluation<dim, 624 fe_degree, 625 n_q_points_1d, 626 n_components_, 627 Number>::gradient_type 628 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: get_gradient(const unsigned int q_point)629 get_gradient(const unsigned int q_point) const 630 { 631 static_assert(n_components_ == 1, "This function only supports FE with one \ 632 components"); 633 // TODO optimize if the mesh is uniform 634 const Number *inv_jacobian = &inv_jac[q_point]; 635 gradient_type grad; 636 for (int d_1 = 0; d_1 < dim; ++d_1) 637 { 638 Number tmp = 0.; 639 for (int d_2 = 0; d_2 < dim; ++d_2) 640 tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] * 641 gradients[d_2][q_point]; 642 grad[d_1] = tmp; 643 } 644 645 return grad; 646 } 647 648 649 650 template <int dim, 651 int fe_degree, 652 int n_q_points_1d, 653 int n_components_, 654 typename Number> 655 __device__ typename FEEvaluation<dim, 656 fe_degree, 657 n_q_points_1d, 658 n_components_, 659 Number>::gradient_type 660 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: get_gradient()661 get_gradient() const 662 { 663 static_assert(n_components_ == 1, "This function only supports FE with one \ 664 components"); 665 666 // TODO optimize if the mesh is uniform 667 const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>(); 668 const Number * inv_jacobian = &inv_jac[q_point]; 669 gradient_type grad; 670 for (int d_1 = 0; d_1 < dim; ++d_1) 671 { 672 Number tmp = 0.; 673 for (int d_2 = 0; d_2 < dim; ++d_2) 674 tmp += inv_jacobian[padding_length * n_cells * (dim * d_2 + d_1)] * 675 gradients[d_2][q_point]; 676 grad[d_1] = tmp; 677 } 678 679 return grad; 680 } 681 682 683 684 template <int dim, 685 int fe_degree, 686 int n_q_points_1d, 687 int n_components_, 688 typename Number> 689 __device__ void 690 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_gradient(const gradient_type & grad_in,const unsigned int q_point)691 submit_gradient(const gradient_type &grad_in, const unsigned int q_point) 692 { 693 // TODO optimize if the mesh is uniform 694 const Number *inv_jacobian = &inv_jac[q_point]; 695 for (int d_1 = 0; d_1 < dim; ++d_1) 696 { 697 Number tmp = 0.; 698 for (int d_2 = 0; d_2 < dim; ++d_2) 699 tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] * 700 grad_in[d_2]; 701 gradients[d_1][q_point] = tmp * JxW[q_point]; 702 } 703 } 704 705 706 707 template <int dim, 708 int fe_degree, 709 int n_q_points_1d, 710 int n_components_, 711 typename Number> 712 __device__ void 713 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: submit_gradient(const gradient_type & grad_in)714 submit_gradient(const gradient_type &grad_in) 715 { 716 // TODO optimize if the mesh is uniform 717 const unsigned int q_point = internal::compute_index<dim, n_q_points_1d>(); 718 const Number * inv_jacobian = &inv_jac[q_point]; 719 for (int d_1 = 0; d_1 < dim; ++d_1) 720 { 721 Number tmp = 0.; 722 for (int d_2 = 0; d_2 < dim; ++d_2) 723 tmp += inv_jacobian[n_cells * padding_length * (dim * d_1 + d_2)] * 724 grad_in[d_2]; 725 gradients[d_1][q_point] = tmp * JxW[q_point]; 726 } 727 } 728 729 730 731 template <int dim, 732 int fe_degree, 733 int n_q_points_1d, 734 int n_components_, 735 typename Number> 736 template <typename Functor> 737 __device__ void 738 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: apply_quad_point_operations(const Functor & func)739 apply_quad_point_operations(const Functor &func) 740 { 741 func(this, internal::compute_index<dim, n_q_points_1d>()); 742 743 __syncthreads(); 744 } 745 746 747 748 template <int dim, 749 int fe_degree, 750 int n_q_points_1d, 751 int n_components_, 752 typename Number> 753 template <typename Functor> 754 __device__ void 755 FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>:: apply_for_each_quad_point(const Functor & func)756 apply_for_each_quad_point(const Functor &func) 757 { 758 func(this); 759 760 __syncthreads(); 761 } 762 } // namespace CUDAWrappers 763 764 DEAL_II_NAMESPACE_CLOSE 765 766 #endif 767 768 #endif 769