1 // Copyright (C) 2015  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_GPU_DaTA_CPP_
4 #define DLIB_GPU_DaTA_CPP_
5 
6 // Only things that require CUDA are declared in this cpp file.  Everything else is in the
7 // gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
8 #ifdef DLIB_USE_CUDA
9 
10 #include "gpu_data.h"
11 #include <iostream>
12 #include "cuda_utils.h"
13 #include <cstring>
14 #include <cuda.h>
15 
16 namespace dlib
17 {
18 
19 // ----------------------------------------------------------------------------------------
20 
memcpy(gpu_data & dest,const gpu_data & src)21     void memcpy (
22         gpu_data& dest,
23         const gpu_data& src
24     )
25     {
26         DLIB_CASSERT(dest.size() == src.size());
27         if (src.size() == 0 || &dest == &src)
28             return;
29 
30         memcpy(dest,0, src, 0, src.size());
31     }
32 
    // Copy num floats from src (starting at src_offset) to dest (starting at
    // dest_offset).  The copy is performed on whichever side (host or device)
    // currently holds up to date data, so no transfer happens that wasn't
    // already necessary.
    void memcpy (
        gpu_data& dest,
        size_t dest_offset,
        const gpu_data& src,
        size_t src_offset,
        size_t num
    )
    {
        DLIB_CASSERT(dest_offset + num <= dest.size());
        DLIB_CASSERT(src_offset + num <= src.size());
        if (num == 0)
            return;

        // if there is aliasing (i.e. same object and the [offset, offset+num)
        // ranges overlap)
        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
        {
            // if they perfectly alias each other then there is nothing to do
            if (dest_offset == src_offset)
                return;
            else
                // Overlapping ranges within one buffer: do the move on the host
                // side with memmove(), which is defined for overlapping memory.
                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
        }
        else
        {
            // if we write to the entire thing then we can use device_write_only(),
            // which avoids transferring dest's current (about to be overwritten)
            // contents to the device first.
            if (dest_offset == 0 && num == dest.size())
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset,  num*sizeof(float), cudaMemcpyDeviceToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset,    num*sizeof(float), cudaMemcpyHostToDevice));
            }
            else
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else if (!dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset,   num*sizeof(float), cudaMemcpyDeviceToHost));
                else if (dest.device_ready() && !src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset,   num*sizeof(float), cudaMemcpyHostToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset,     num*sizeof(float), cudaMemcpyHostToHost));
            }
        }
    }
80 // ----------------------------------------------------------------------------------------
81 
synchronize_stream(cudaStream_t stream)82     void synchronize_stream(cudaStream_t stream)
83     {
84 #if !defined CUDA_VERSION
85 #error CUDA_VERSION not defined
86 #elif CUDA_VERSION >= 9020 && CUDA_VERSION < 11000
87         // We will stop using this alternative version with cuda V11, hopefully the bug in
88         // cudaStreamSynchronize is fixed by then.
89         //
90         // This should be pretty much the same as cudaStreamSynchronize, which for some
91         // reason makes training freeze in some cases.
92         // (see https://github.com/davisking/dlib/issues/1513)
93         while (true)
94         {
95             cudaError_t err = cudaStreamQuery(stream);
96             switch (err)
97             {
98             case cudaSuccess: return;      // now we are synchronized
99             case cudaErrorNotReady: break; // continue waiting
100             default: CHECK_CUDA(err);      // unexpected error: throw
101             }
102         }
103 #else // CUDA_VERSION
104         CHECK_CUDA(cudaStreamSynchronize(stream));
105 #endif // CUDA_VERSION
106     }
107 
108     void gpu_data::
wait_for_transfer_to_finish() const109     wait_for_transfer_to_finish() const
110     {
111         if (have_active_transfer)
112         {
113             synchronize_stream((cudaStream_t)cuda_stream.get());
114             have_active_transfer = false;
115             // Check for errors.  These calls to cudaGetLastError() are what help us find
116             // out if our kernel launches have been failing.
117             CHECK_CUDA(cudaGetLastError());
118         }
119     }
120 
    // Ensure the device side buffer holds the current data, blocking until the
    // transfer (if one was needed) has finished.
    void gpu_data::
    copy_to_device() const
    {
        // We want transfers to the device to always be concurrent with any device
        // computation.  So we use our non-default stream to do the transfer.
        async_copy_to_device();
        wait_for_transfer_to_finish();
    }
129 
130     void gpu_data::
copy_to_host() const131     copy_to_host() const
132     {
133         if (!host_current)
134         {
135             wait_for_transfer_to_finish();
136             CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
137             host_current = true;
138             // At this point we know our RAM block isn't in use because cudaMemcpy()
139             // implicitly syncs with the device.
140             device_in_use = false;
141             // Check for errors.  These calls to cudaGetLastError() are what help us find
142             // out if our kernel launches have been failing.
143             CHECK_CUDA(cudaGetLastError());
144         }
145     }
146 
147     void gpu_data::
async_copy_to_device() const148     async_copy_to_device() const
149     {
150         if (!device_current)
151         {
152             if (device_in_use)
153             {
154                 // Wait for any possible CUDA kernels that might be using our memory block to
155                 // complete before we overwrite the memory.
156                 synchronize_stream(0);
157                 device_in_use = false;
158             }
159             CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
160             have_active_transfer = true;
161             device_current = true;
162         }
163     }
164 
165     void gpu_data::
set_size(size_t new_size)166     set_size(
167         size_t new_size
168     )
169     {
170         if (new_size == 0)
171         {
172             if (device_in_use)
173             {
174                 // Wait for any possible CUDA kernels that might be using our memory block to
175                 // complete before we free the memory.
176                 synchronize_stream(0);
177                 device_in_use = false;
178             }
179             wait_for_transfer_to_finish();
180             data_size = 0;
181             host_current = true;
182             device_current = true;
183             device_in_use = false;
184             data_host.reset();
185             data_device.reset();
186         }
187         else if (new_size != data_size)
188         {
189             if (device_in_use)
190             {
191                 // Wait for any possible CUDA kernels that might be using our memory block to
192                 // complete before we free the memory.
193                 synchronize_stream(0);
194                 device_in_use = false;
195             }
196             wait_for_transfer_to_finish();
197             data_size = new_size;
198             host_current = true;
199             device_current = true;
200             device_in_use = false;
201 
202             try
203             {
204                 CHECK_CUDA(cudaGetDevice(&the_device_id));
205 
206                 // free memory blocks before we allocate new ones.
207                 data_host.reset();
208                 data_device.reset();
209 
210                 void* data;
211                 CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
212                 // Note that we don't throw exceptions since the free calls are invariably
213                 // called in destructors.  They also shouldn't fail anyway unless someone
214                 // is resetting the GPU card in the middle of their program.
215                 data_host.reset((float*)data, [](float* ptr){
216                     auto err = cudaFreeHost(ptr);
217                     if(err!=cudaSuccess)
218                         std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
219                 });
220 
221                 CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
222                 data_device.reset((float*)data, [](float* ptr){
223                     auto err = cudaFree(ptr);
224                     if(err!=cudaSuccess)
225                         std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
226                 });
227 
228                 if (!cuda_stream)
229                 {
230                     cudaStream_t cstream;
231                     CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
232                     cuda_stream.reset(cstream, [](void* ptr){
233                         auto err = cudaStreamDestroy((cudaStream_t)ptr);
234                         if(err!=cudaSuccess)
235                             std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
236                     });
237                 }
238 
239             }
240             catch(...)
241             {
242                 set_size(0);
243                 throw;
244             }
245         }
246     }
247 
248 // ----------------------------------------------------------------------------------------
249 }
250 
251 #endif // DLIB_USE_CUDA
252 
253 #endif // DLIB_GPU_DaTA_CPP_
254 
255