//////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License. See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by:
// Lawrence Livermore National Laboratory
//
// File created by:
// Miguel A. Morales, moralessilva2@llnl.gov
// Lawrence Livermore National Laboratory
////////////////////////////////////////////////////////////////////////////////

#include <cassert>
#include <complex>
#include <cuda.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#define ENABLE_CUDA 1
#include "AFQMC/Memory/CUDA/cuda_utilities.h"

namespace kernels
{
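// Batched complex axpy: for each batch entry m, one thread block computes
// b[m][i*incb] += x[m] * a[m][i*inca] for i in [0, n), with the block's
// threads striding over i by blockDim.x.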
template<typename T>
__global__ void kernel_axpy_batched(int n,
                                    thrust::complex<T>* x,
                                    thrust::complex<T>** a,
                                    int inca,
                                    thrust::complex<T>** b,
                                    int incb,
                                    int batchSize)
{
  int batch = blockIdx.x;
  if (batch >= batchSize)
    return;

  thrust::complex<T>* a_(a[batch]);
  thrust::complex<T>* b_(b[batch]);
  thrust::complex<T> x_(x[batch]);

  int i = threadIdx.x;
  while (i < n)
  {
    b_[i * incb] = b_[i * incb] + x_ * a_[i * inca];
    i += blockDim.x;
  }
}

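// Variant of the batched axpy for the case where several batch entries may
// accumulate into the same output buffer. Block j claims the walker index
// (b0 + j) % nw and serially processes every batch entry m that maps to that
// same index, so no two blocks update the same destination concurrently.
// (The "walker" reading follows the b0/nw naming; the index arithmetic below
// is what the kernel actually enforces.)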
template<typename T>
__global__ void kernel_sumGw_batched(int n,
                                     thrust::complex<T>* x,
                                     thrust::complex<T>** a,
                                     int inca,
                                     thrust::complex<T>** b,
                                     int incb,
                                     int b0,
                                     int nw,
                                     int batchSize)
{
  if (blockIdx.x >= batchSize)
    return;

  int my_iw = (b0 + blockIdx.x) % nw;

  for (int m = 0; m < batchSize; ++m)
  {
    if ((b0 + m) % nw != my_iw)
      continue;

    thrust::complex<T>* a_(a[m]);
    thrust::complex<T>* b_(b[m]);
    thrust::complex<T> x_(x[m]);

    int i = threadIdx.x;
    while (i < n)
    {
      b_[i * incb] = b_[i * incb] + x_ * a_[i * inca];
      i += blockDim.x;
    }
  }
}

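// Host-side wrapper: stages the scalar array and the pointer arrays on the
// device, launches kernel_axpy_batched with one 128-thread block per batch
// entry, checks the launch and synchronization, and frees the temporaries.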
void axpy_batched_gpu(int n,
                      std::complex<double>* x,
                      const std::complex<double>** a,
                      int inca,
                      std::complex<double>** b,
                      int incb,
                      int batchSize)
{
  // Device-side staging buffers for the batch scalars and pointer arrays.
  thrust::complex<double>*x_, **a_, **b_;
  qmc_cuda::cuda_check(cudaMalloc((void**)&a_, batchSize * sizeof(*a_)));
  qmc_cuda::cuda_check(cudaMalloc((void**)&b_, batchSize * sizeof(*b_)));
  qmc_cuda::cuda_check(cudaMalloc((void**)&x_, batchSize * sizeof(*x_)));
  qmc_cuda::cuda_check(cudaMemcpy(a_, a, batchSize * sizeof(*a), cudaMemcpyHostToDevice));
  qmc_cuda::cuda_check(cudaMemcpy(b_, b, batchSize * sizeof(*b), cudaMemcpyHostToDevice));
  qmc_cuda::cuda_check(cudaMemcpy(x_, x, batchSize * sizeof(*x), cudaMemcpyHostToDevice));
  kernel_axpy_batched<<<batchSize, 128>>>(n, x_, a_, inca, b_, incb, batchSize);
  qmc_cuda::cuda_check(cudaGetLastError());
  qmc_cuda::cuda_check(cudaDeviceSynchronize());
  cudaFree(a_);
  cudaFree(b_);
  cudaFree(x_);
}

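// Host-side wrapper for kernel_sumGw_batched. The grid is limited to
// min(nw, batchSize) blocks, since each block handles all batch entries that
// share a walker index.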
void sumGw_batched_gpu(int n,
                       std::complex<double>* x,
                       const std::complex<double>** a,
                       int inca,
                       std::complex<double>** b,
                       int incb,
                       int b0,
                       int nw,
                       int batchSize)
{
  // Device-side staging buffers for the batch scalars and pointer arrays.
  thrust::complex<double>*x_, **a_, **b_;
  qmc_cuda::cuda_check(cudaMalloc((void**)&a_, batchSize * sizeof(*a_)));
  qmc_cuda::cuda_check(cudaMalloc((void**)&b_, batchSize * sizeof(*b_)));
  qmc_cuda::cuda_check(cudaMalloc((void**)&x_, batchSize * sizeof(*x_)));
  qmc_cuda::cuda_check(cudaMemcpy(a_, a, batchSize * sizeof(*a), cudaMemcpyHostToDevice));
  qmc_cuda::cuda_check(cudaMemcpy(b_, b, batchSize * sizeof(*b), cudaMemcpyHostToDevice));
  qmc_cuda::cuda_check(cudaMemcpy(x_, x, batchSize * sizeof(*x), cudaMemcpyHostToDevice));
  // One block per distinct walker index, at most batchSize blocks.
  int nb_(nw > batchSize ? batchSize : nw);
  kernel_sumGw_batched<<<nb_, 256>>>(n, x_, a_, inca, b_, incb, b0, nw, batchSize);
  qmc_cuda::cuda_check(cudaGetLastError());
  qmc_cuda::cuda_check(cudaDeviceSynchronize());
  cudaFree(a_);
  cudaFree(b_);
  cudaFree(x_);
}

} // namespace kernels
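
// A minimal usage sketch (illustrative only; host_alpha, host_Aptrs, and
// host_Bptrs are hypothetical host-side arrays not defined in this file,
// with each Aptrs[m]/Bptrs[m] pointing to device memory):
//
//   // b[m] += alpha[m] * a[m] for nbatch vectors of length n, unit stride
//   kernels::axpy_batched_gpu(n, host_alpha, host_Aptrs, 1, host_Bptrs, 1, nbatch);
//
// Both wrappers copy these host arrays into temporary device buffers before
// the launch, so only the pointed-to vectors themselves need to live on the
// device.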