////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source
// License.  See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Miguel A. Morales, moralessilva2@llnl.gov,
//    Lawrence Livermore National Laboratory
//
// File created by: Miguel A. Morales, moralessilva2@llnl.gov,
//    Lawrence Livermore National Laboratory
////////////////////////////////////////////////////////////////////////////////

#include <cassert>
#include <complex>
#include <cuda.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#define ENABLE_CUDA 1
#include "AFQMC/Memory/CUDA/cuda_utilities.h"

namespace kernels
{
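// Batched complex axpy: one CUDA block per batch entry. Each block reads its
// scalar x[batch] and pointer pair (a[batch], b[batch]), and its threads
// stride over the n elements computing b[i * incb] += x * a[i * inca].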
template<typename T>
__global__ void kernel_axpy_batched(int n,
                                    thrust::complex<T>* x,
                                    thrust::complex<T>** a,
                                    int inca,
                                    thrust::complex<T>** b,
                                    int incb,
                                    int batchSize)
{
  int batch = blockIdx.x;
  if (batch >= batchSize)
    return;

  thrust::complex<T>* a_(a[batch]);
  thrust::complex<T>* b_(b[batch]);
  thrust::complex<T> x_(x[batch]);

  int i = threadIdx.x;
  while (i < n)
  {
    b_[i * incb] = b_[i * incb] + x_ * a_[i * inca];
    i += blockDim.x;
  }
}

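// Walker-grouped batched axpy (sumGw): block blockIdx.x owns walker index
// (b0 + blockIdx.x) % nw and serially accumulates every batch entry m that
// maps to that walker, so entries sharing an output buffer are handled by a
// single block and write races are avoided.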
template<typename T>
__global__ void kernel_sumGw_batched(int n,
                                     thrust::complex<T>* x,
                                     thrust::complex<T>** a,
                                     int inca,
                                     thrust::complex<T>** b,
                                     int incb,
                                     int b0,
                                     int nw,
                                     int batchSize)
{
  if (blockIdx.x >= batchSize)
    return;

  int my_iw = (b0 + blockIdx.x) % nw;

  for (int m = 0; m < batchSize; ++m)
  {
    if ((b0 + m) % nw != my_iw)
      continue;

    thrust::complex<T>* a_(a[m]);
    thrust::complex<T>* b_(b[m]);
    thrust::complex<T> x_(x[m]);

    int i = threadIdx.x;
    while (i < n)
    {
      b_[i * incb] = b_[i * incb] + x_ * a_[i * inca];
      i += blockDim.x;
    }
  }
}

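// Host wrapper: stages the batch of scalars and device pointers in temporary
// device buffers, launches kernel_axpy_batched with one block per batch entry
// (128 threads each), synchronizes, and frees the temporaries.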
void axpy_batched_gpu(int n,
                      std::complex<double>* x,
                      const std::complex<double>** a,
                      int inca,
                      std::complex<double>** b,
                      int incb,
                      int batchSize)
{
  thrust::complex<double>*x_, **a_, **b_;
  cudaMalloc((void**)&a_, batchSize * sizeof(*a_));
  cudaMalloc((void**)&b_, batchSize * sizeof(*b_));
  cudaMalloc((void**)&x_, batchSize * sizeof(*x_));
  cudaMemcpy(a_, a, batchSize * sizeof(*a), cudaMemcpyHostToDevice);
  cudaMemcpy(b_, b, batchSize * sizeof(*b), cudaMemcpyHostToDevice);
  cudaMemcpy(x_, x, batchSize * sizeof(*x), cudaMemcpyHostToDevice);
  kernel_axpy_batched<<<batchSize, 128>>>(n, x_, a_, inca, b_, incb, batchSize);
  qmc_cuda::cuda_check(cudaGetLastError());
  qmc_cuda::cuda_check(cudaDeviceSynchronize());
  cudaFree(a_);
  cudaFree(b_);
  cudaFree(x_);
}

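// Host wrapper for the walker-grouped accumulation: same staging of scalars
// and pointers as above, but the grid is capped at min(nw, batchSize) blocks
// (256 threads each), since one block serves each walker.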
void sumGw_batched_gpu(int n,
                       std::complex<double>* x,
                       const std::complex<double>** a,
                       int inca,
                       std::complex<double>** b,
                       int incb,
                       int b0,
                       int nw,
                       int batchSize)
{
  thrust::complex<double>*x_, **a_, **b_;
  cudaMalloc((void**)&a_, batchSize * sizeof(*a_));
  cudaMalloc((void**)&b_, batchSize * sizeof(*b_));
  cudaMalloc((void**)&x_, batchSize * sizeof(*x_));
  cudaMemcpy(a_, a, batchSize * sizeof(*a), cudaMemcpyHostToDevice);
  cudaMemcpy(b_, b, batchSize * sizeof(*b), cudaMemcpyHostToDevice);
  cudaMemcpy(x_, x, batchSize * sizeof(*x), cudaMemcpyHostToDevice);
  int nb_(nw > batchSize ? batchSize : nw);
  kernel_sumGw_batched<<<nb_, 256>>>(n, x_, a_, inca, b_, incb, b0, nw, batchSize);
  qmc_cuda::cuda_check(cudaGetLastError());
  qmc_cuda::cuda_check(cudaDeviceSynchronize());
  cudaFree(a_);
  cudaFree(b_);
  cudaFree(x_);
}

} // namespace kernels