1 #ifdef COMPILATION
2 /usr/local/cuda/bin/nvcc -DBOOST_ALL_NO_LIB -DBOOST_SERIALIZATION_DYN_LINK -DUSING_Libxc -I/home/correaa/prj/inq/external_libs/libxc/src -I/home/correaa/prj/inq/build/cuda11/external_libs/libxc -I/home/correaa/prj/inq/build/cuda11/external_libs/libxc/gen_funcidx -I/home/correaa/prj/inq/build/cuda11 -I/home/correaa/prj/inq/build/cuda11/external_libs/pseudopod -I/home/correaa/prj/inq/src/. -I/home/correaa/prj/inq/src/../external_libs -DNDEBUG -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -I/usr/lib/x86_64-linux-gnu/openmpi/include -g -pg -D_DISABLE_CUDA_SLOW -O3 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-g,-pg,-std=c++14,-Wall,-Wfatal-errors -g -x cu $0 -o $0x&&$0x&&rm $0x;exit
#/usr/local/cuda/bin/nvcc -g -pg -O3 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-g,-pg,-Wall,-Wfatal-errors -g $0 -o $0x &&$0x&&rm $0x; exit
4 #endif
5
#include "../../cuda/managed/allocator.hpp"
#include "../../../../array.hpp"

#include <cuda.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
10
// 1D launch trampoline: applies the callable `k` to every index in [0, n).
// Expects a 1D grid of 1D blocks; out-of-range tail threads are guarded out.
template<class Kernel>
__global__ void cuda_run_kernel_1(unsigned n, Kernel k){
	auto const idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx >= n) return;  // grid rarely divides n evenly
	k(idx);
}
16
// Launches `k(i)` for each i in [0, n) on the GPU and blocks until completion.
// Aborts the process with a diagnostic on any CUDA error.
// NOTE: launch-configuration errors surface via cudaGetLastError() right after
// the launch, but in-kernel faults (e.g. illegal address) only surface at the
// next synchronizing call — so the result of cudaDeviceSynchronize() must be
// checked too (the original silently discarded it).
template<class Kernel>
void run(std::size_t n, Kernel k){
	constexpr int CUDA_BLOCK_SIZE = 256;

	unsigned nblock = (n + CUDA_BLOCK_SIZE - 1)/CUDA_BLOCK_SIZE;  // ceil-div

	cuda_run_kernel_1<<<nblock, CUDA_BLOCK_SIZE>>>(static_cast<unsigned>(n), k);

	// catch launch errors (bad grid/block configuration)
	auto code = cudaGetLastError();
	if(code != cudaSuccess){
		fprintf(stderr, "GPU assert: %s %s %d\n", cudaGetErrorString(code), __FILE__, __LINE__);
		exit(-1);
	}

	// catch asynchronous execution errors surfaced at synchronization
	code = cudaDeviceSynchronize();
	if(code != cudaSuccess){
		fprintf(stderr, "GPU assert: %s %s %d\n", cudaGetErrorString(code), __FILE__, __LINE__);
		exit(-1);
	}
}
33
34 #include<thrust/complex.h>
35
36 namespace multi = boost::multi;
37 namespace cuda = multi::memory::cuda;
38
39 using complex = thrust::complex<double>;
40
int main(){

	int set_size = 32;
	int basis_size = 100;

	// phi1/phi2: set_size x basis_size matrices of ones in CUDA managed memory
	multi::array<complex, 2, cuda::managed::allocator<complex>> phi1({set_size, basis_size}, 1.);
	multi::array<complex, 2, cuda::managed::allocator<complex>> phi2({set_size, basis_size}, 1.);

	// one overlap value per state (row)
	multi::array<complex, 1, cuda::managed::allocator<complex>> overlap(set_size);

#if 0 // using iterators
	auto phi1_p = begin(phi1);
	auto phi2_p = begin(phi2);

	auto overlap_p = begin(overlap);

	run(set_size,
		[phi1_p, phi2_p, overlap_p, basis_size] __device__ (int ist){
			complex a = 0.;
			for(int ip = 0; ip < basis_size; ip++){
				auto p1 = phi1_p[ist][ip];
				auto p2 = phi2_p[ist][ip];
				a += conj(p1)*p2;
			}
			overlap_p[ist] = a;
		}
	);
#endif
#if 0
	run(set_size,
		[phi1_p = begin(phi1), phi2_p = begin(phi2), overlap_p = begin(overlap), basis_size] __device__ (int ist){
			complex a = 0.;
			for(int ip = 0; ip < basis_size; ip++){
				auto p1 = phi1_p[ist][ip];
				auto p2 = phi2_p[ist][ip];
				a += conj(p1)*p2;
			}
			overlap_p[ist] = a;
		}
	);
#endif

	// overlap[ist] = sum_ip conj(phi1[ist][ip]) * phi2[ist][ip]
	// BUG FIX: the loop must run over the basis (inner) dimension. The original
	// iterated `ip < overlap_p->size()` (= set_size = 32), not basis_size (= 100),
	// so the assert below could only pass because -DNDEBUG disabled it.
	// Also accumulate in a register-local value instead of read-modify-writing
	// managed memory on every iteration.
	run(set_size,
		[phi1_p = &phi1(), phi2_p = &phi2(), overlap_p = &overlap(), basis_size] __device__ (int ist){
			complex a = 0.;
			for(int ip = 0; ip < basis_size; ip++)
				a += conj((*phi1_p)[ist][ip])*(*phi2_p)[ist][ip];
			(*overlap_p)[ist] = a;
		}
	);

	// with all-ones inputs each dot product equals the basis size
	assert( overlap[21] == double(basis_size) );

#if 0
	{
		auto npoints = phi1.basis().part().local_size();
		auto vol_element = phi1.basis().volume_element();
		auto phi1p = begin(phi1.matrix());
		auto phi2p = begin(phi2.matrix());
		auto overlap = begin(overlap_vector);

		//OPTIMIZATION: here we should parallelize over points as well
		gpu::run(phi1.set_size(),
			[phi1p, phi2p, overlap, npoints, vol_element] __host__ __device__ (int ist){
				type aa = 0.0;
				for(int ip = 0; ip < npoints; ip++){
					phi1p[ip];
					// auto p1 = phi1p[ip][ist];
					// auto p2 = phi2p[ip][ist];
					// aa += conj(p1)*p2;
				}

				overlap[ist] = vol_element*aa;
			});
	}
#endif
	return 0;
}
120
121