// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_CPP_
#define DLIB_GPU_DaTA_CPP_

// Only things that require CUDA are declared in this cpp file.  Everything else is in the
// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
#ifdef DLIB_USE_CUDA

#include "gpu_data.h"
#include <iostream>
#include "cuda_utils.h"
#include <cstring>
#include <cuda.h>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    void memcpy (
        gpu_data& dest,
        const gpu_data& src
    )
    {
        DLIB_CASSERT(dest.size() == src.size());
        if (src.size() == 0 || &dest == &src)
            return;

        memcpy(dest, 0, src, 0, src.size());
    }

    void memcpy (
        gpu_data& dest,
        size_t dest_offset,
        const gpu_data& src,
        size_t src_offset,
        size_t num
    )
    {
        DLIB_CASSERT(dest_offset + num <= dest.size());
        DLIB_CASSERT(src_offset + num <= src.size());
        if (num == 0)
            return;

        // if there is aliasing
        if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
        {
            // if they perfectly alias each other then there is nothing to do
            if (dest_offset == src_offset)
                return;
            else
                std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
        }
        else
        {
            // if we write to the entire thing then we can use device_write_only()
            if (dest_offset == 0 && num == dest.size())
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
            }
            else
            {
                // copy the memory efficiently based on which copy is current in each object.
                if (dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
                else if (!dest.device_ready() && src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
                else if (dest.device_ready() && !src.device_ready())
                    CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
                else
                    CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void synchronize_stream(cudaStream_t stream)
    {
#if !defined CUDA_VERSION
#error CUDA_VERSION not defined
#elif CUDA_VERSION >= 9020 && CUDA_VERSION < 11000
        // We will stop using this alternative version with cuda V11, hopefully the bug in
        // cudaStreamSynchronize is fixed by then.
        //
        // This should be pretty much the same as cudaStreamSynchronize, which for some
        // reason makes training freeze in some cases.
        // (see https://github.com/davisking/dlib/issues/1513)
        while (true)
        {
            cudaError_t err = cudaStreamQuery(stream);
            switch (err)
            {
            case cudaSuccess: return;      // now we are synchronized
            case cudaErrorNotReady: break; // continue waiting
            default: CHECK_CUDA(err);      // unexpected error: throw
            }
        }
#else // CUDA_VERSION
        CHECK_CUDA(cudaStreamSynchronize(stream));
#endif // CUDA_VERSION
    }

    void gpu_data::
    wait_for_transfer_to_finish() const
    {
        if (have_active_transfer)
        {
            synchronize_stream((cudaStream_t)cuda_stream.get());
            have_active_transfer = false;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }

    void gpu_data::
    copy_to_device() const
    {
        // We want transfers to the device to always be concurrent with any device
        // computation.  So we use our non-default stream to do the transfer.
        async_copy_to_device();
        wait_for_transfer_to_finish();
    }

    void gpu_data::
    copy_to_host() const
    {
        if (!host_current)
        {
            wait_for_transfer_to_finish();
            CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
            host_current = true;
            // At this point we know our RAM block isn't in use because cudaMemcpy()
            // implicitly syncs with the device.
            device_in_use = false;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }

    void gpu_data::
    async_copy_to_device() const
    {
        if (!device_current)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we overwrite the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
            have_active_transfer = true;
            device_current = true;
        }
    }

    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if (err != cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if (err != cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if (err != cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }

// ----------------------------------------------------------------------------------------
}

#endif // DLIB_USE_CUDA

#endif // DLIB_GPU_DaTA_CPP_
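/*
    Illustrative usage sketch, for exposition only.  It shows how the two memcpy()
    overloads defined above might be called on gpu_data objects; the overloads and
    set_size() come from this file and its companion header gpu_data.h (the header's
    exact include path depends on the dlib version being used).

        dlib::gpu_data a, b;
        a.set_size(100);
        b.set_size(100);

        // Whole-buffer copy: the two sizes must match (enforced by DLIB_CASSERT).
        dlib::memcpy(b, a);

        // Ranged copy: move 10 floats from a[20..30) into b[5..15).  As implemented
        // above, the transfer direction (host/device) is chosen based on which side
        // of each object currently holds the up-to-date data.
        dlib::memcpy(b, 5, a, 20, 10);
*/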