1 //////////////////////////////////////////////////////////////////////
2 // This file is distributed under the University of Illinois/NCSA Open Source
3 // License. See LICENSE file in top directory for details.
4 //
5 // Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
6 //
7 // File developed by:
8 // Lawrence Livermore National Laboratory
9 //
10 // File created by:
11 // Miguel A. Morales, moralessilva2@llnl.gov
12 // Lawrence Livermore National Laboratory
13 ////////////////////////////////////////////////////////////////////////////////
14
15 #include <cassert>
16 #include <complex>
17 #include <cuda.h>
18 #include <thrust/complex.h>
19 #include <cuda_runtime.h>
20 #include "AFQMC/Numerics/detail/CUDA/Kernels/cuda_settings.h"
21 #define ENABLE_CUDA 1
22 #include "AFQMC/Memory/CUDA/cuda_utilities.h"
23
24 namespace kernels
25 {
26 // simple
27 // A[k][i] = B[k][i][i]
28 template<typename T>
kernel_get_diagonal_strided(int nk,int ni,thrust::complex<T> const * B,int ldb,int stride,thrust::complex<T> * A,int lda)29 __global__ void kernel_get_diagonal_strided(int nk,
30 int ni,
31 thrust::complex<T> const* B,
32 int ldb,
33 int stride,
34 thrust::complex<T>* A,
35 int lda)
36 {
37 int k = blockIdx.y;
38 int i = blockIdx.x * blockDim.x + threadIdx.x;
39 if ((i < ni) && (k < nk))
40 A[k * lda + i] = B[k * stride + i * ldb + i];
41 }
42
43 // A[k][i] = B[k][i][i]
get_diagonal_strided(int nk,int ni,std::complex<double> const * B,int ldb,int stride,std::complex<double> * A,int lda)44 void get_diagonal_strided(int nk,
45 int ni,
46 std::complex<double> const* B,
47 int ldb,
48 int stride,
49 std::complex<double>* A,
50 int lda)
51 {
52 size_t nthr = 32;
53 size_t nbks = (ni + nthr - 1) / nthr;
54 dim3 grid_dim(nbks, nk, 1);
55 kernel_get_diagonal_strided<<<grid_dim, nthr>>>(nk, ni, reinterpret_cast<thrust::complex<double> const*>(B), ldb,
56 stride, reinterpret_cast<thrust::complex<double>*>(A), lda);
57 qmc_cuda::cuda_check(cudaGetLastError());
58 qmc_cuda::cuda_check(cudaDeviceSynchronize());
59 }
60
get_diagonal_strided(int nk,int ni,std::complex<float> const * B,int ldb,int stride,std::complex<float> * A,int lda)61 void get_diagonal_strided(int nk,
62 int ni,
63 std::complex<float> const* B,
64 int ldb,
65 int stride,
66 std::complex<float>* A,
67 int lda)
68 {
69 size_t nthr = 32;
70 size_t nbks = (ni + nthr - 1) / nthr;
71 dim3 grid_dim(nbks, nk, 1);
72 kernel_get_diagonal_strided<<<grid_dim, nthr>>>(nk, ni, reinterpret_cast<thrust::complex<float> const*>(B), ldb,
73 stride, reinterpret_cast<thrust::complex<float>*>(A), lda);
74 qmc_cuda::cuda_check(cudaGetLastError());
75 qmc_cuda::cuda_check(cudaDeviceSynchronize());
76 }
77
78 } // namespace kernels
79