1 /*------------------------------------------------------------------- 2 Copyright 2011 Ravishankar Sundararaman 3 4 This file is part of JDFTx. 5 6 JDFTx is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 JDFTx is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>. 18 -------------------------------------------------------------------*/ 19 20 21 #ifndef JDFTX_CORE_GPUKERNELUTILS_H 22 #define JDFTX_CORE_GPUKERNELUTILS_H 23 24 #include <algorithm> 25 #include <cuda_runtime.h> 26 #include <cublas_v2.h> 27 #include <driver_types.h> 28 #include <vector_types.h> 29 #include <core/vector3.h> 30 31 //! @addtogroup Utilities 32 //! @{ 33 34 //! @file GpuKernelUtils.h 35 //! @brief Common utility functions/macros for the gpu kernels and launchers in the .cu files 36 37 extern cudaDeviceProp cudaDevProps; //!< cached properties of currently running device (defined in GpuUtil.cpp) 38 extern cublasHandle_t cublasHandle; //!< global handle to cublas (defined in GpuUtil.cpp) 39 #ifdef CUSOLVER_ENABLED 40 #include <cusolverDn.h> 41 extern cusolverDnHandle_t cusolverHandle; //!< global handle to cusolverDn (defined in GpuUtil.cpp) 42 #endif 43 44 //! Base-class for launch configuration for gpu kernels 45 struct GpuLaunchConfig 46 { cudaFuncAttributes attr; //!< attributes of the function 47 48 //! Initialize the device and function properties GpuLaunchConfigGpuLaunchConfig49 template<typename GpuKernel> GpuLaunchConfig(GpuKernel* gpuKernel) 50 { cudaFuncGetAttributes(&attr, gpuKernel); 51 } 52 }; 53 54 55 //Get the logical index of the kernel (dir is x, y, or z) 56 #define kernelIndex(dir) (blockIdx.dir * blockDim.dir + threadIdx.dir) 57 58 //Get the logical 1D index, even if the grid is 2D (required for very large 1D kernels) 59 #define kernelIndex1D() ((blockIdx.y*gridDim.x+blockIdx.x) * blockDim.x + threadIdx.x) 60 61 62 //! 1D launch configuration 63 struct GpuLaunchConfig1D : public GpuLaunchConfig 64 { dim3 nPerBlock; //!< dimension of block 65 dim3 nBlocks; //!< dimension of grid (note nBlocks could be 3D for really large kernels) 66 67 //! Set up blocks and grid for a 1D operation over N data points GpuLaunchConfig1DGpuLaunchConfig1D68 template<typename GpuKernel> GpuLaunchConfig1D(GpuKernel* gpuKernel, int N) 69 : GpuLaunchConfig(gpuKernel), 70 nPerBlock(attr.maxThreadsPerBlock,1,1), 71 nBlocks(ceildiv(N, int(nPerBlock.x)),1,1) 72 { //If the grid is too big, make it 2D: 73 while(int(nBlocks.x) > cudaDevProps.maxGridSize[0]) 74 { nBlocks.x = ceildiv(int(nBlocks.x),2); 75 nBlocks.y *= 2; 76 } 77 } 78 }; 79 80 //! 3D launch configuration 81 struct GpuLaunchConfig3D : public GpuLaunchConfig 82 { dim3 nPerBlock; //!< dimension of block 83 dim3 nBlocks; //!< dimension of grid (note nBlocks could be 3D for really large kernels) 84 int zBlockMax; //!< Grids are 2D, so need to loop over last dim 85 86 //! Set up blocks and grid for a 1D operation over N data points GpuLaunchConfig3DGpuLaunchConfig3D87 template<typename GpuKernel> GpuLaunchConfig3D(GpuKernel* gpuKernel, vector3<int> S) 88 : GpuLaunchConfig(gpuKernel) 89 { // Try to minimize zBlockMax and maximize block size within constraint: 90 zBlockMax = ceildiv(S[0], std::min(attr.maxThreadsPerBlock, cudaDevProps.maxThreadsDim[2])); 91 nPerBlock.z = ceildiv(S[0], zBlockMax); 92 // For the chosen z configuration, maximize x block size within constraint 93 int maxBlockXY = attr.maxThreadsPerBlock/nPerBlock.z; 94 nBlocks.x = ceildiv(S[2], std::min(maxBlockXY,cudaDevProps.maxThreadsDim[0])); 95 nPerBlock.x = ceildiv(S[2], int(nBlocks.x)); 96 // For the chosen x and z configuration, maximize y block size within constraint 97 int maxBlockY = attr.maxThreadsPerBlock/(nPerBlock.z*nPerBlock.x); 98 nBlocks.y = ceildiv(S[1], std::min(maxBlockY,cudaDevProps.maxThreadsDim[1])); 99 nPerBlock.y = ceildiv(S[1], int(nBlocks.y)); 100 } 101 }; 102 103 //! 3D launch configuration for symmetry-reduced G-space loops (z dimension folded for real data sets) 104 struct GpuLaunchConfigHalf3D : public GpuLaunchConfig3D 105 { //!Just use the above after reducing the z-dimension to half GpuLaunchConfigHalf3DGpuLaunchConfigHalf3D106 template<typename GpuKernel> GpuLaunchConfigHalf3D(GpuKernel* gpuKernel, vector3<int> S) 107 : GpuLaunchConfig3D(gpuKernel, vector3<int>(S[0], S[1], S[2]/2+1)) 108 { 109 } 110 }; 111 112 //! Check for gpu errors and print a useful message (implemented in GpuUtils.cpp) 113 void gpuErrorCheck(); 114 115 //! @} 116 #endif // JDFTX_CORE_GPUKERNELUTILS_H 117