1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License.  See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 //    Lawrence Livermore National Laboratory
9 //
10 // File created by:
11 // Miguel A. Morales, moralessilva2@llnl.gov
12 //    Lawrence Livermore National Laboratory
13 ////////////////////////////////////////////////////////////////////////////////
14 
15 #include <cassert>
16 #include <complex>
17 #include <cuda.h>
18 #include <thrust/complex.h>
19 #include <cuda_runtime.h>
20 #include "AFQMC/Numerics/detail/CUDA/Kernels/cuda_settings.h"
21 #define ENABLE_CUDA 1
22 #include "AFQMC/Memory/CUDA/cuda_utilities.h"
23 
24 namespace kernels
25 {
26 // simple
27 // A[k][i] = B[k][i][i]
28 template<typename T>
kernel_get_diagonal_strided(int nk,int ni,thrust::complex<T> const * B,int ldb,int stride,thrust::complex<T> * A,int lda)29 __global__ void kernel_get_diagonal_strided(int nk,
30                                             int ni,
31                                             thrust::complex<T> const* B,
32                                             int ldb,
33                                             int stride,
34                                             thrust::complex<T>* A,
35                                             int lda)
36 {
37   int k = blockIdx.y;
38   int i = blockIdx.x * blockDim.x + threadIdx.x;
39   if ((i < ni) && (k < nk))
40     A[k * lda + i] = B[k * stride + i * ldb + i];
41 }
42 
43 // A[k][i] = B[k][i][i]
get_diagonal_strided(int nk,int ni,std::complex<double> const * B,int ldb,int stride,std::complex<double> * A,int lda)44 void get_diagonal_strided(int nk,
45                           int ni,
46                           std::complex<double> const* B,
47                           int ldb,
48                           int stride,
49                           std::complex<double>* A,
50                           int lda)
51 {
52   size_t nthr = 32;
53   size_t nbks = (ni + nthr - 1) / nthr;
54   dim3 grid_dim(nbks, nk, 1);
55   kernel_get_diagonal_strided<<<grid_dim, nthr>>>(nk, ni, reinterpret_cast<thrust::complex<double> const*>(B), ldb,
56                                                   stride, reinterpret_cast<thrust::complex<double>*>(A), lda);
57   qmc_cuda::cuda_check(cudaGetLastError());
58   qmc_cuda::cuda_check(cudaDeviceSynchronize());
59 }
60 
get_diagonal_strided(int nk,int ni,std::complex<float> const * B,int ldb,int stride,std::complex<float> * A,int lda)61 void get_diagonal_strided(int nk,
62                           int ni,
63                           std::complex<float> const* B,
64                           int ldb,
65                           int stride,
66                           std::complex<float>* A,
67                           int lda)
68 {
69   size_t nthr = 32;
70   size_t nbks = (ni + nthr - 1) / nthr;
71   dim3 grid_dim(nbks, nk, 1);
72   kernel_get_diagonal_strided<<<grid_dim, nthr>>>(nk, ni, reinterpret_cast<thrust::complex<float> const*>(B), ldb,
73                                                   stride, reinterpret_cast<thrust::complex<float>*>(A), lda);
74   qmc_cuda::cuda_check(cudaGetLastError());
75   qmc_cuda::cuda_check(cudaDeviceSynchronize());
76 }
77 
78 } // namespace kernels
79