#pragma once

#include <cublas_v2.h>
#include <cudnn.h>
#include <cusolverDn.h>

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <utility>

#include <absl/types/optional.h>

#include "chainerx/array.h"
#include "chainerx/axes.h"
#include "chainerx/cuda/cublas.h"
#include "chainerx/cuda/cuda_backend.h"
#include "chainerx/cuda/cuda_conv.h"
#include "chainerx/cuda/cudnn.h"
#include "chainerx/cuda/cusolver.h"
#include "chainerx/cuda/memory_pool.h"
#include "chainerx/device.h"
#include "chainerx/dtype.h"
#include "chainerx/kernels/normalization.h"
#include "chainerx/kernels/pooling.h"
#include "chainerx/kernels/rnn.h"
#include "chainerx/routines/normalization.h"
#include "chainerx/routines/pooling.h"
#include "chainerx/scalar.h"
#include "chainerx/stack_vector.h"

namespace chainerx {
namespace cuda {

class CudaDevice;

namespace cuda_internal {

class CudaConvTest;  // for unit-tests

// Keeps any memory from being freed before CUDA asynchronous operations are finished.
// Operations in this class are thread safe.
// A usage sketch follows the class definition.
class MemoryKeeper {
public:
    MemoryKeeper() = default;

    ~MemoryKeeper();

    MemoryKeeper(const MemoryKeeper&) = delete;
    MemoryKeeper(MemoryKeeper&&) = delete;
    MemoryKeeper& operator=(const MemoryKeeper&) = delete;
    MemoryKeeper& operator=(MemoryKeeper&&) = delete;

    // Registers a pointer to a memory chunk.
    // The memory is only freed after all preceding CUDA operations in the stream are finished.
    // TODO(niboshi): Currently only the default stream is supported.
    void Add(cudaStream_t stream, std::shared_ptr<void> memory);

    // Checks for recorded events and frees the memory chunks whose events have completed.
    void Collect();

private:
    std::mutex mutex_{};
    std::queue<std::pair<cudaEvent_t, std::shared_ptr<void>>> queue_{};
    std::atomic<bool> is_empty_{true};
};
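
// Example: keeping a host buffer alive across an asynchronous copy. This is a
// minimal sketch, not code from this library; `keeper`, `dst`, `src` and
// `bytesize` are hypothetical names. Per the TODO above, the default stream
// (nullptr) is passed.
//
//     cuda_internal::MemoryKeeper keeper{};
//     std::shared_ptr<void> src = ...;  // e.g. pinned host memory
//     cudaMemcpyAsync(dst, src.get(), bytesize, cudaMemcpyHostToDevice, nullptr);
//     keeper.Add(nullptr, std::move(src));  // records an event; src stays alive
//     // ... later, e.g. before the next allocation ...
//     keeper.Collect();  // frees chunks whose events have completed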

// Keeps handles and other device internals.
// These internals are exposed through `GetDeviceInternals` for CUDA internal usages.
class DeviceInternals {
public:
    explicit DeviceInternals(int device_index)
        : cublas_handle_{device_index}, cudnn_handle_{device_index}, cusolverdn_handle_{device_index} {}

    ~DeviceInternals() = default;

    DeviceInternals(const DeviceInternals&) = delete;
    DeviceInternals(DeviceInternals&&) = delete;
    DeviceInternals& operator=(const DeviceInternals&) = delete;
    DeviceInternals& operator=(DeviceInternals&&) = delete;

    cuda_internal::CublasHandle& cublas_handle() { return cublas_handle_; }

    cuda_internal::CudnnHandle& cudnn_handle() { return cudnn_handle_; }

    cuda_internal::CusolverDnHandle& cusolverdn_handle() { return cusolverdn_handle_; }

    cuda_internal::CudaConv& cuda_conv() { return cuda_conv_; }

private:
    cuda_internal::CublasHandle cublas_handle_;

    cuda_internal::CudnnHandle cudnn_handle_;

    cuda_internal::CusolverDnHandle cusolverdn_handle_;

    cuda_internal::CudaConv cuda_conv_{};
};

DeviceInternals& GetDeviceInternals(CudaDevice& device);
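
// Example: how internal CUDA code can reach the per-device handles. A minimal
// sketch; `device` is assumed to be a `CudaDevice&` obtained elsewhere, and the
// `Call` helper is an assumption based on the handle wrappers declared in
// chainerx/cuda/cublas.h.
//
//     cuda_internal::DeviceInternals& internals = cuda_internal::GetDeviceInternals(device);
//     internals.cublas_handle().Call(cublasSscal, n, &alpha, x_ptr, incx);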

}  // namespace cuda_internal

struct CudaBatchNormGradState : public BatchNormGradState {
public:
    CudaBatchNormGradState(Array x_cont, Array x_mean, Array x_inv_std, Dtype beta_dtype)
        : x_cont_{std::move(x_cont)}, x_mean_{std::move(x_mean)}, x_inv_std_{std::move(x_inv_std)}, beta_dtype_{beta_dtype} {}

    const Array& x_cont() const { return x_cont_; }
    const Array& x_mean() const { return x_mean_; }
    const Array& x_inv_std() const { return x_inv_std_; }
    Dtype beta_dtype() const { return beta_dtype_; }

private:
    Array x_cont_;
    Array x_mean_;
    Array x_inv_std_;
    Dtype beta_dtype_;
};

struct GenericRnnGradState : public RnnGradState {
    GenericRnnGradState(cudnnRNNDescriptor_t rnn_desc, cudnnFilterDescriptor_t w_desc, Array w, Array reserve, Array workspace)
        : rnn_desc_{rnn_desc}, w_desc_{w_desc}, w_{std::move(w)}, reserve_{std::move(reserve)}, workspace_{std::move(workspace)} {}

    cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_; }
    cudnnFilterDescriptor_t wDesc() { return w_desc_; }
    Array w() { return w_; }
    Array reserve() { return reserve_; }
    Array workspace() { return workspace_; }

private:
    cudnnRNNDescriptor_t rnn_desc_;
    cudnnFilterDescriptor_t w_desc_;
    Array w_;
    Array reserve_;
    Array workspace_;
};

// Pooling states are identical for most CUDA pooling ops, so we define a common base class.
class CudaPoolStateBase {
public:
    CudaPoolStateBase(Array x, Array out) : x_{std::move(x)}, out_{std::move(out)} {}

    const Array& x() const { return x_; }
    const Array& out() const { return out_; }

private:
    Array x_{};
    Array out_{};
};

class CudaMaxPoolGradState : public MaxPoolGradState, public CudaPoolStateBase {
    using CudaPoolStateBase::CudaPoolStateBase;
};

class CudaMaxPoolGradGradState : public MaxPoolGradGradState, public CudaPoolStateBase {
    using CudaPoolStateBase::CudaPoolStateBase;
};

class CudaAveragePoolGradState : public AveragePoolGradState, public CudaPoolStateBase {
    using CudaPoolStateBase::CudaPoolStateBase;
};
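
// Example: how a CUDA backward kernel might recover its concrete state. A
// minimal sketch; `state` is assumed to be the `MaxPoolGradState` instance that
// the corresponding forward kernel returned.
//
//     CudaMaxPoolGradState& cuda_state = dynamic_cast<CudaMaxPoolGradState&>(state);
//     const Array& x = cuda_state.x();      // forward input captured by the base class
//     const Array& out = cuda_state.out();  // forward output captured by the base class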

class CudaDevice : public Device {
public:
    const std::shared_ptr<MemoryPool>& device_memory_pool() { return device_memory_pool_; }

    void Synchronize() override;

    // memory.cc

    std::shared_ptr<void> Allocate(size_t bytesize) override;

    std::shared_ptr<void> MakeDataFromForeignPointer(const std::shared_ptr<void>& data) override;

    void MemoryCopyFrom(void* dst, const void* src, size_t bytesize, Device& src_device) override;

    void MemoryCopyTo(void* dst, const void* src, size_t bytesize, Device& dst_device) override;

    std::shared_ptr<void> TransferDataFrom(
            Device& src_device, const std::shared_ptr<void>& src_ptr, size_t offset, size_t bytesize) override;

    std::shared_ptr<void> TransferDataTo(Device& dst_device, const std::shared_ptr<void>& src_ptr, size_t offset, size_t bytesize) override;

    std::shared_ptr<void> FromHostMemory(const std::shared_ptr<void>& src_ptr, size_t bytesize) override;

protected:
    CudaDevice(CudaBackend& backend, int index)
        : Device{backend, index},
          device_memory_pool_{std::make_shared<MemoryPool>(index, std::make_unique<DeviceMemoryAllocator>())},
          pinned_memory_pool_{std::make_shared<MemoryPool>(index, std::make_unique<PinnedMemoryAllocator>())},
          device_internals_{index} {}

private:
    friend CudaDevice* cuda_internal::CreateDevice(CudaBackend& backend, int index);

    friend cuda_internal::DeviceInternals& cuda_internal::GetDeviceInternals(CudaDevice& device);

    friend class cuda_internal::CudaConvTest;  // for unit-tests

    // Allocates pinned memory.
    // The pinned memory is used internally by the CUDA device for asynchronous memory transfer, i.e. cudaMemcpyAsync.
    std::shared_ptr<void> AllocatePinnedMemory(size_t bytesize);

    // Copies memory from the host to this device, staging through temporary pinned memory.
    // The copy is asynchronous with respect to the host.
    // The current device must be set to this device prior to calling this function.
    void MemoryCopyFromHostAsync(void* dst, const void* src, size_t bytesize);

    std::shared_ptr<MemoryPool> device_memory_pool_;

    // TODO(hvy): Consider checking if pinned memory is available by querying canMapHostMemory.
    std::shared_ptr<MemoryPool> pinned_memory_pool_;

    cuda_internal::DeviceInternals device_internals_;

    // Keeps memory chunks alive until asynchronous CUDA operations that may still use them have finished.
    cuda_internal::MemoryKeeper memory_keeper_{};
};
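
// Example: typical user-facing path to this device. A minimal sketch; the
// context lookup (`GetDefaultContext().GetDevice({"cuda", 0})`) is assumed from
// the chainerx public API rather than defined in this header.
//
//     chainerx::Device& device = chainerx::GetDefaultContext().GetDevice({"cuda", 0});
//     std::shared_ptr<void> mem = device.Allocate(512);  // served from the device memory pool
//     device.Synchronize();  // waits for all queued work on the device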

}  // namespace cuda
}  // namespace chainerx