#pragma once

#include <cublas_v2.h>
#include <cudnn.h>
#include <cusolverDn.h>

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <utility>

#include <absl/types/optional.h>

#include "chainerx/array.h"
#include "chainerx/axes.h"
#include "chainerx/cuda/cublas.h"
#include "chainerx/cuda/cuda_backend.h"
#include "chainerx/cuda/cuda_conv.h"
#include "chainerx/cuda/cudnn.h"
#include "chainerx/cuda/cusolver.h"
#include "chainerx/cuda/memory_pool.h"
#include "chainerx/device.h"
#include "chainerx/dtype.h"
#include "chainerx/kernels/normalization.h"
#include "chainerx/kernels/pooling.h"
#include "chainerx/kernels/rnn.h"
#include "chainerx/routines/normalization.h"
#include "chainerx/routines/pooling.h"
#include "chainerx/scalar.h"
#include "chainerx/stack_vector.h"

namespace chainerx {
namespace cuda {

class CudaDevice;

namespace cuda_internal {

class CudaConvTest;  // for unit-tests

// Keeps any memory from being freed before CUDA asynchronous operations are finished.
// Operations in this class are thread safe.
class MemoryKeeper {
public:
    MemoryKeeper() = default;

    ~MemoryKeeper();

    // Non-copyable and non-movable: instances own pending CUDA events tied to
    // registered memory chunks (see queue_ below).
    MemoryKeeper(const MemoryKeeper&) = delete;
    MemoryKeeper(MemoryKeeper&&) = delete;
    MemoryKeeper& operator=(const MemoryKeeper&) = delete;
    MemoryKeeper& operator=(MemoryKeeper&&) = delete;

    // Registers a pointer to a memory chunk.
    // The memory is only freed after all preceding CUDA operations in the stream are finished.
    // TODO(niboshi): Currently only the default stream is supported.
    void Add(cudaStream_t stream, std::shared_ptr<void> memory);

    // Checks for recorded events and frees the associated memories.
    void Collect();

private:
    // Guards queue_ against concurrent Add()/Collect() calls.
    std::mutex mutex_{};
    // Pairs each registered memory chunk with the CUDA event that must complete
    // before the chunk may be released.
    std::queue<std::pair<cudaEvent_t, std::shared_ptr<void>>> queue_{};
    // Fast emptiness flag so callers can skip locking when nothing is pending.
    std::atomic<bool> is_empty_{true};
};

// Keeps handles and other device internals.
// These internals are exposed through `GetDeviceInternals` for CUDA internal usages.
class DeviceInternals {
public:
    // Creates the cuBLAS, cuDNN and cuSOLVER handles bound to the given device index.
    explicit DeviceInternals(int device_index)
        : cublas_handle_{device_index}, cudnn_handle_{device_index}, cusolverdn_handle_{device_index} {}

    ~DeviceInternals() = default;

    // Non-copyable and non-movable: the wrapped library handles are unique per device.
    DeviceInternals(const DeviceInternals&) = delete;
    DeviceInternals(DeviceInternals&&) = delete;
    DeviceInternals& operator=(const DeviceInternals&) = delete;
    DeviceInternals& operator=(DeviceInternals&&) = delete;

    cuda_internal::CublasHandle& cublas_handle() { return cublas_handle_; }

    cuda_internal::CudnnHandle& cudnn_handle() { return cudnn_handle_; }

    cuda_internal::CusolverDnHandle& cusolverdn_handle() { return cusolverdn_handle_; }

    cuda_internal::CudaConv& cuda_conv() { return cuda_conv_; }

private:
    cuda_internal::CublasHandle cublas_handle_;

    cuda_internal::CudnnHandle cudnn_handle_;

    cuda_internal::CusolverDnHandle cusolverdn_handle_;

    cuda_internal::CudaConv cuda_conv_{};
};

// Returns the internals (library handles etc.) of the given device.
// Defined alongside CudaDevice, which declares this function as a friend.
DeviceInternals& GetDeviceInternals(CudaDevice& device);

}  // namespace cuda_internal

// Backward-pass state for CUDA batch normalization, retained from the forward
// pass so that the gradient computation can reuse intermediate results.
struct CudaBatchNormGradState : public BatchNormGradState {
public:
    // All arrays are moved in; beta_dtype records the dtype of the beta parameter.
    CudaBatchNormGradState(Array x_cont, Array x_mean, Array x_inv_std, Dtype beta_dtype)
        : x_cont_{std::move(x_cont)}, x_mean_{std::move(x_mean)}, x_inv_std_{std::move(x_inv_std)}, beta_dtype_{beta_dtype} {}

    const Array& x_cont() const { return x_cont_; }
    const Array& x_mean() const { return x_mean_; }
    const Array& x_inv_std() const { return x_inv_std_; }
    Dtype beta_dtype() const { return beta_dtype_; }

private:
    Array x_cont_;
    Array x_mean_;
    Array x_inv_std_;
    Dtype beta_dtype_;
};

// Backward-pass state for cuDNN RNN ops: keeps the cuDNN descriptors together
// with the weight, reserve-space and workspace arrays needed by the backward call.
// NOTE(review): rnn_desc_/w_desc_ are raw cuDNN descriptors; ownership and
// destruction are presumably managed by whoever created them — not released here.
struct GenericRnnGradState : public RnnGradState {
    GenericRnnGradState(cudnnRNNDescriptor_t rnn_desc, cudnnFilterDescriptor_t w_desc, Array w, Array reserve, Array workspace)
        : rnn_desc_{rnn_desc}, w_desc_{w_desc}, w_{std::move(w)}, reserve_{std::move(reserve)}, workspace_{std::move(workspace)} {}
    cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_; }
    cudnnFilterDescriptor_t wDesc() { return w_desc_; }
    Array w() { return w_; }
    Array reserve() { return reserve_; }
    Array workspace() { return workspace_; }

private:
    cudnnRNNDescriptor_t rnn_desc_;
    cudnnFilterDescriptor_t w_desc_;
    Array w_;
    Array reserve_;
    Array workspace_;
};

// Pooling states are identical for most CUDA pooling ops so we define a common base class.
// Common backward-pass state for CUDA pooling ops: retains the forward input
// and output arrays, which cuDNN pooling backward requires.
class CudaPoolStateBase {
public:
    CudaPoolStateBase(Array x, Array out) : x_{std::move(x)}, out_{std::move(out)} {}

    const Array& x() const { return x_; }
    const Array& out() const { return out_; }

private:
    Array x_{};
    Array out_{};
};

class CudaMaxPoolGradState : public MaxPoolGradState, public CudaPoolStateBase {
    // Inherited constructors keep the accessibility they have in the base.
    using CudaPoolStateBase::CudaPoolStateBase;
};

class CudaMaxPoolGradGradState : public MaxPoolGradGradState, public CudaPoolStateBase {
    using CudaPoolStateBase::CudaPoolStateBase;
};

class CudaAveragePoolGradState : public AveragePoolGradState, public CudaPoolStateBase {
    using CudaPoolStateBase::CudaPoolStateBase;
};

// Device implementation for a single CUDA GPU. Owns the device and pinned
// memory pools and the per-device library handles (DeviceInternals).
// Construction is restricted to cuda_internal::CreateDevice (friend below).
class CudaDevice : public Device {
public:
    const std::shared_ptr<MemoryPool>& device_memory_pool() { return device_memory_pool_; }

    void Synchronize() override;

    // memory.cc

    std::shared_ptr<void> Allocate(size_t bytesize) override;

    std::shared_ptr<void> MakeDataFromForeignPointer(const std::shared_ptr<void>& data) override;

    void MemoryCopyFrom(void* dst, const void* src, size_t bytesize, Device& src_device) override;

    void MemoryCopyTo(void* dst, const void* src, size_t bytesize, Device& dst_device) override;

    std::shared_ptr<void> TransferDataFrom(
            Device& src_device, const std::shared_ptr<void>& src_ptr, size_t offset, size_t bytesize) override;

    std::shared_ptr<void> TransferDataTo(Device& dst_device, const std::shared_ptr<void>& src_ptr, size_t offset, size_t bytesize) override;

    std::shared_ptr<void> FromHostMemory(const std::shared_ptr<void>& src_ptr, size_t bytesize) override;

protected:
    // Protected: instances are created via cuda_internal::CreateDevice only.
    CudaDevice(CudaBackend& backend, int index)
        : Device{backend, index},
          device_memory_pool_{std::make_shared<MemoryPool>(index, std::make_unique<DeviceMemoryAllocator>())},
          pinned_memory_pool_{std::make_shared<MemoryPool>(index, std::make_unique<PinnedMemoryAllocator>())},
          device_internals_{index} {}

private:
    friend CudaDevice* cuda_internal::CreateDevice(CudaBackend& backend, int index);

    friend cuda_internal::DeviceInternals& cuda_internal::GetDeviceInternals(CudaDevice& device);

    friend class cuda_internal::CudaConvTest;  // for unit-tests

    // Allocates pinned memory.
    // The pinned memory is used internally by the CUDA device for asynchronous memory transfer, i.e. cudaMemcpyAsync.
    std::shared_ptr<void> AllocatePinnedMemory(size_t bytesize);

    // Asynchronous transfer from host to this device, w.r.t. host, using temporary pinned memory.
    // The current device must be set to this device, prior to calling this function.
    void MemoryCopyFromHostAsync(void* dst, const void* src, size_t bytesize);

    // Pool backing Allocate(); memory resides on this GPU.
    std::shared_ptr<MemoryPool> device_memory_pool_;

    // TODO(hvy): Consider checking if pinned memory is available by querying canMapHostMemory.
    std::shared_ptr<MemoryPool> pinned_memory_pool_;

    // Per-device cuBLAS/cuDNN/cuSOLVER handles; exposed via GetDeviceInternals.
    cuda_internal::DeviceInternals device_internals_;

    // Memory keeper.
    cuda_internal::MemoryKeeper memory_keeper_{};
};

}  // namespace cuda
}  // namespace chainerx