#ifdef COMPILATION
/usr/local/cuda/bin/nvcc -DBOOST_ALL_NO_LIB -DBOOST_SERIALIZATION_DYN_LINK -DUSING_Libxc -I/home/correaa/prj/inq/external_libs/libxc/src -I/home/correaa/prj/inq/build/cuda11/external_libs/libxc -I/home/correaa/prj/inq/build/cuda11/external_libs/libxc/gen_funcidx -I/home/correaa/prj/inq/build/cuda11 -I/home/correaa/prj/inq/build/cuda11/external_libs/pseudopod -I/home/correaa/prj/inq/src/. -I/home/correaa/prj/inq/src/../external_libs  -DNDEBUG -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -I/usr/lib/x86_64-linux-gnu/openmpi/include  -g -pg -D_DISABLE_CUDA_SLOW -O3 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-g,-pg,-std=c++14,-Wall,-Wfatal-errors -g   -x cu $0 -o $0x&&$0x&&rm $0x;exit
#/usr/local/cuda/bin/nvcc -g -pg -O3 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-g,-pg,-Wall,-Wfatal-errors -g $0 -o $0x &&$0x&&rm $0x; exit
#endif
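// The block above makes this file self-building: when the file is executed as a shell
// script, the nvcc line compiles the file itself ($0) into $0x, runs the binary, and
// removes it. The C++ preprocessor skips the block because COMPILATION is never defined.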

#include "../../cuda/managed/allocator.hpp"
#include "../../../../array.hpp"

#include <cuda.h>

#include <cassert>  // assert in main
#include <cstdio>   // fprintf in the launch-error check below
#include <cstdlib>  // exit on launch failure

// Trampoline kernel: thread i calls k(i); threads past n (in the last, partially
// filled block) do nothing.
template<class Kernel>
__global__ void cuda_run_kernel_1(unsigned n, Kernel k){
	auto i = blockIdx.x*blockDim.x + threadIdx.x;
	if(i < n) k(i);
}
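
// Illustrative variant (not part of the original test and not used below): a grid-stride
// loop lets a fixed-size grid cover an arbitrarily large n. The kernel name
// cuda_run_kernel_1_stride is made up for this sketch.
template<class Kernel>
__global__ void cuda_run_kernel_1_stride(unsigned n, Kernel k){
	for(unsigned i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += blockDim.x*gridDim.x){
		k(i);  // each thread handles i, i + blockDim.x*gridDim.x, i + 2*blockDim.x*gridDim.x, ...
	}
}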

// Launch helper: covers n indices with 256-thread blocks, forwards the callable to the
// kernel, checks the launch status, and synchronizes so results are visible on the host
// when run() returns.
template<class Kernel>
void run(std::size_t n, Kernel k){
	constexpr int CUDA_BLOCK_SIZE = 256;

	unsigned nblock = (n + CUDA_BLOCK_SIZE - 1)/CUDA_BLOCK_SIZE;  // round up so all n indices are covered

	cuda_run_kernel_1<<<nblock, CUDA_BLOCK_SIZE>>>(n, k);

	auto code = cudaGetLastError();
	if(code != cudaSuccess){
		fprintf(stderr, "GPU assert: %s %s %d\n", cudaGetErrorString(code), __FILE__, __LINE__);
		exit(-1);
	}

	cudaDeviceSynchronize();
}

#include <thrust/complex.h>

namespace multi = boost::multi;
namespace cuda = multi::memory::cuda;

using complex = thrust::complex<double>;
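
// Usage sketch (illustrative only, not exercised by this test): run() combines with
// Multi's managed-memory arrays for simple element-wise work, using the same
// pointer-capture pattern as main() below. The helper name scale_in_place and the use of
// double elements are assumptions of this sketch.
void scale_in_place(multi::array<double, 1, cuda::managed::allocator<double>>& arr, double factor){
	run(arr.size(), [arr_p = &arr(), factor] __device__ (int i){
		(*arr_p)[i] *= factor;  // managed memory: the scaled elements are later readable on the host
	});
}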

int main(){

	int set_size = 32;
	int basis_size = 100;

	// two set_size x basis_size arrays in CUDA managed memory, initialized to 1
	multi::array<complex, 2, cuda::managed::allocator<complex>> phi1({set_size, basis_size}, 1.);
	multi::array<complex, 2, cuda::managed::allocator<complex>> phi2({set_size, basis_size}, 1.);

	// one overlap value per element of the set
	multi::array<complex, 1, cuda::managed::allocator<complex>> overlap(set_size);

#if 0 // disabled alternative: iterators captured by value
	auto phi1_p = begin(phi1);
	auto phi2_p = begin(phi2);

	auto overlap_p = begin(overlap);

	run(set_size,
		[phi1_p, phi2_p, overlap_p, basis_size] __device__ (int ist){
			complex a = 0.;
			for(int ip = 0; ip < basis_size; ip++){
				auto p1 = phi1_p[ist][ip];
				auto p2 = phi2_p[ist][ip];
				a += conj(p1)*p2;
			}
			overlap_p[ist] = a;
		}
	);
#endif
#if 0 // disabled alternative: iterators taken directly in the init-captures
	run(set_size,
		[phi1_p = begin(phi1), phi2_p = begin(phi2), overlap_p = begin(overlap), basis_size] __device__ (int ist){
			complex a = 0.;
			for(int ip = 0; ip < basis_size; ip++){
				auto p1 = phi1_p[ist][ip];
				auto p2 = phi2_p[ist][ip];
				a += conj(p1)*p2;
			}
			overlap_p[ist] = a;
		}
	);
#endif

	// Enabled path: one GPU thread per state computes
	// overlap[ist] = sum_ip conj(phi1[ist][ip]) * phi2[ist][ip].
	// The init-captures take pointers to the whole-array views (&phi1(), etc.), which can
	// be dereferenced inside the __device__ lambda because the underlying memory is managed.
	run(set_size,
		[phi1_p = &phi1(), phi2_p = &phi2(), overlap_p = &overlap(), basis_size] __device__ (int ist){
			(*overlap_p)[ist] = 0;
			for(int ip = 0; ip < basis_size; ip++)  // sum over the basis (not over overlap_p->size(), which is set_size)
				(*overlap_p)[ist] += conj((*phi1_p)[ist][ip])*(*phi2_p)[ist][ip];
		}
	);

	assert( overlap[21] == double(basis_size) );
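
	// Additional check: since both inputs are filled with 1, every overlap entry (not just
	// the sampled index 21) should equal basis_size.
	for(int ist = 0; ist < set_size; ist++) assert( overlap[ist] == double(basis_size) );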

#if 0 // disabled sketch against a different interface (phi1.basis(), gpu::run); it does
      // not compile against the declarations in this file and is kept for reference only
	{
		auto npoints = phi1.basis().part().local_size();
		auto vol_element = phi1.basis().volume_element();
		auto phi1p = begin(phi1.matrix());
		auto phi2p = begin(phi2.matrix());
		auto overlap = begin(overlap_vector);

		// OPTIMIZATION: here we should parallelize over points as well
		gpu::run(phi1.set_size(),
			[phi1p, phi2p, overlap, npoints, vol_element] __host__ __device__ (int ist){
				type aa = 0.0;
				for(int ip = 0; ip < npoints; ip++){
					phi1p[ip];
					// auto p1 = phi1p[ip][ist];
					// auto p2 = phi2p[ip][ist];
					// aa += conj(p1)*p2;
				}

				overlap[ist] = vol_element*aa;
			});
	}
#endif
	return 0;
}