1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 
21 #ifndef JDFTX_CORE_GPUKERNELUTILS_H
22 #define JDFTX_CORE_GPUKERNELUTILS_H
23 
24 #include <algorithm>
25 #include <cuda_runtime.h>
26 #include <cublas_v2.h>
27 #include <driver_types.h>
28 #include <vector_types.h>
29 #include <core/vector3.h>
30 
31 //! @addtogroup Utilities
32 //! @{
33 
34 //! @file GpuKernelUtils.h
35 //! @brief Common utility functions/macros for the gpu kernels and launchers in the .cu files
36 
37 extern cudaDeviceProp cudaDevProps; //!< cached properties of currently running device (defined in GpuUtil.cpp)
38 extern cublasHandle_t cublasHandle; //!< global handle to cublas (defined in GpuUtil.cpp)
39 #ifdef CUSOLVER_ENABLED
40 #include <cusolverDn.h>
41 extern cusolverDnHandle_t cusolverHandle;  //!< global handle to cusolverDn (defined in GpuUtil.cpp)
42 #endif
43 
44 //! Base-class for launch configuration for gpu kernels
45 struct GpuLaunchConfig
46 {	cudaFuncAttributes attr; //!< attributes of the function
47 
48 	//! Initialize the device and function properties
GpuLaunchConfigGpuLaunchConfig49 	template<typename GpuKernel> GpuLaunchConfig(GpuKernel* gpuKernel)
50 	{	cudaFuncGetAttributes(&attr, gpuKernel);
51 	}
52 };
53 
54 
55 //Get the logical index of the kernel (dir is x, y, or z)
56 #define kernelIndex(dir) (blockIdx.dir * blockDim.dir + threadIdx.dir)
57 
58 //Get the logical 1D index, even if the grid is 2D (required for very large 1D kernels)
59 #define kernelIndex1D() ((blockIdx.y*gridDim.x+blockIdx.x) * blockDim.x + threadIdx.x)
60 
61 
62 //! 1D launch configuration
63 struct GpuLaunchConfig1D : public GpuLaunchConfig
64 {	dim3 nPerBlock; //!< dimension of block
65 	dim3 nBlocks; //!< dimension of grid (note nBlocks could be 3D for really large kernels)
66 
67 	//! Set up blocks and grid for a 1D operation over N data points
GpuLaunchConfig1DGpuLaunchConfig1D68 	template<typename GpuKernel> GpuLaunchConfig1D(GpuKernel* gpuKernel, int N)
69 	: GpuLaunchConfig(gpuKernel),
70 	nPerBlock(attr.maxThreadsPerBlock,1,1),
71 	nBlocks(ceildiv(N, int(nPerBlock.x)),1,1)
72 	{	//If the grid is too big, make it 2D:
73 		while(int(nBlocks.x) > cudaDevProps.maxGridSize[0])
74 		{	nBlocks.x = ceildiv(int(nBlocks.x),2);
75 			nBlocks.y *= 2;
76 		}
77 	}
78 };
79 
80 //! 3D launch configuration
81 struct GpuLaunchConfig3D : public GpuLaunchConfig
82 {	dim3 nPerBlock; //!< dimension of block
83 	dim3 nBlocks; //!< dimension of grid (note nBlocks could be 3D for really large kernels)
84 	int zBlockMax; //!< Grids are 2D, so need to loop over last dim
85 
86 	//! Set up blocks and grid for a 1D operation over N data points
GpuLaunchConfig3DGpuLaunchConfig3D87 	template<typename GpuKernel> GpuLaunchConfig3D(GpuKernel* gpuKernel, vector3<int> S)
88 	: GpuLaunchConfig(gpuKernel)
89 	{	// Try to minimize zBlockMax and maximize block size within constraint:
90 		zBlockMax = ceildiv(S[0], std::min(attr.maxThreadsPerBlock, cudaDevProps.maxThreadsDim[2]));
91 		nPerBlock.z = ceildiv(S[0], zBlockMax);
92 		// For the chosen z configuration, maximize x block size within constraint
93 		int maxBlockXY = attr.maxThreadsPerBlock/nPerBlock.z;
94 		nBlocks.x = ceildiv(S[2], std::min(maxBlockXY,cudaDevProps.maxThreadsDim[0]));
95 		nPerBlock.x = ceildiv(S[2], int(nBlocks.x));
96 		// For the chosen x and z configuration, maximize y block size within constraint
97 		int maxBlockY = attr.maxThreadsPerBlock/(nPerBlock.z*nPerBlock.x);
98 		nBlocks.y = ceildiv(S[1], std::min(maxBlockY,cudaDevProps.maxThreadsDim[1]));
99 		nPerBlock.y = ceildiv(S[1], int(nBlocks.y));
100 	}
101 };
102 
103 //! 3D launch configuration for symmetry-reduced G-space loops (z dimension folded for real data sets)
104 struct GpuLaunchConfigHalf3D : public GpuLaunchConfig3D
105 {	//!Just use the above after reducing the z-dimension to half
GpuLaunchConfigHalf3DGpuLaunchConfigHalf3D106 	template<typename GpuKernel> GpuLaunchConfigHalf3D(GpuKernel* gpuKernel, vector3<int> S)
107 	: GpuLaunchConfig3D(gpuKernel, vector3<int>(S[0], S[1], S[2]/2+1))
108 	{
109 	}
110 };
111 
112 //! Check for gpu errors and print a useful message (implemented in GpuUtils.cpp)
113 void gpuErrorCheck();
114 
115 //! @}
116 #endif // JDFTX_CORE_GPUKERNELUTILS_H
117