1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #include <Kokkos_Macros.hpp>
46 #ifdef KOKKOS_ENABLE_CUDA
47
48 #include <Kokkos_Core.hpp>
49 #include <Kokkos_Cuda.hpp>
50 #include <Kokkos_CudaSpace.hpp>
51
52 #include <cstdlib>
53 #include <iostream>
54 #include <sstream>
55 #include <stdexcept>
56 #include <algorithm>
57 #include <atomic>
58
59 //#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
60 #include <impl/Kokkos_Error.hpp>
61 #include <impl/Kokkos_MemorySpace.hpp>
62
63 #include <impl/Kokkos_Tools.hpp>
64
65 /*--------------------------------------------------------------------------*/
66 /*--------------------------------------------------------------------------*/
67
cuda_get_deep_copy_stream()68 cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
69 static cudaStream_t s = nullptr;
70 if (s == nullptr) {
71 cudaStreamCreate(&s);
72 }
73 return s;
74 }
75
cuda_get_deep_copy_space(bool initialize)76 const std::unique_ptr<Kokkos::Cuda> &Kokkos::Impl::cuda_get_deep_copy_space(
77 bool initialize) {
78 static std::unique_ptr<Cuda> space = nullptr;
79 if (!space && initialize)
80 space = std::make_unique<Cuda>(Kokkos::Impl::cuda_get_deep_copy_stream());
81 return space;
82 }
83
84 namespace Kokkos {
85 namespace Impl {
86
namespace {

// Running count of live CudaUVMSpace allocations: incremented in
// CudaUVMSpace::impl_allocate, decremented in impl_deallocate, and exposed
// through CudaUVMSpace::number_of_allocations().  Atomic so concurrent host
// threads may allocate/deallocate safely.
static std::atomic<int> num_uvm_allocations(0);

}  // namespace
92
DeepCopy(void * dst,const void * src,size_t n)93 DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
94 size_t n) {
95 CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
96 }
97
DeepCopy(void * dst,const void * src,size_t n)98 DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
99 size_t n) {
100 CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
101 }
102
DeepCopy(void * dst,const void * src,size_t n)103 DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
104 size_t n) {
105 CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
106 }
107
DeepCopy(const Cuda & instance,void * dst,const void * src,size_t n)108 DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
109 const void *src, size_t n) {
110 CUDA_SAFE_CALL(
111 cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
112 }
113
DeepCopy(const Cuda & instance,void * dst,const void * src,size_t n)114 DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
115 const void *src, size_t n) {
116 CUDA_SAFE_CALL(
117 cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
118 }
119
DeepCopy(const Cuda & instance,void * dst,const void * src,size_t n)120 DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
121 const void *src, size_t n) {
122 CUDA_SAFE_CALL(
123 cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
124 }
125
DeepCopyAsyncCuda(void * dst,const void * src,size_t n)126 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
127 cudaStream_t s = cuda_get_deep_copy_stream();
128 CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
129 cudaStreamSynchronize(s);
130 }
131
132 } // namespace Impl
133 } // namespace Kokkos
134
135 /*--------------------------------------------------------------------------*/
136 /*--------------------------------------------------------------------------*/
137
138 namespace Kokkos {
139
access_error()140 KOKKOS_DEPRECATED void CudaSpace::access_error() {
141 const std::string msg(
142 "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
143 "non-Cuda space");
144 Kokkos::Impl::throw_runtime_exception(msg);
145 }
146
access_error(const void * const)147 KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
148 const std::string msg(
149 "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
150 "non-Cuda space");
151 Kokkos::Impl::throw_runtime_exception(msg);
152 }
153
154 /*--------------------------------------------------------------------------*/
155
available()156 bool CudaUVMSpace::available() {
157 #if defined(CUDA_VERSION) && !defined(__APPLE__)
158 enum : bool { UVM_available = true };
159 #else
160 enum : bool { UVM_available = false };
161 #endif
162 return UVM_available;
163 }
164
165 /*--------------------------------------------------------------------------*/
166
number_of_allocations()167 int CudaUVMSpace::number_of_allocations() {
168 return Kokkos::Impl::num_uvm_allocations.load();
169 }
170 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
171 // The purpose of the following variable is to allow a state-based choice
172 // for pinning UVM allocations to the CPU. For now this is considered
173 // an experimental debugging capability - with the potential to work around
174 // some CUDA issues.
175 bool CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = false;
176
cuda_pin_uvm_to_host()177 bool CudaUVMSpace::cuda_pin_uvm_to_host() {
178 return CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v;
179 }
cuda_set_pin_uvm_to_host(bool val)180 void CudaUVMSpace::cuda_set_pin_uvm_to_host(bool val) {
181 CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = val;
182 }
183 #endif
184 } // namespace Kokkos
185
186 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
kokkos_impl_cuda_pin_uvm_to_host()187 bool kokkos_impl_cuda_pin_uvm_to_host() {
188 return Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host();
189 }
190
kokkos_impl_cuda_set_pin_uvm_to_host(bool val)191 void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) {
192 Kokkos::CudaUVMSpace::cuda_set_pin_uvm_to_host(val);
193 }
194 #endif
195
196 /*--------------------------------------------------------------------------*/
197 /*--------------------------------------------------------------------------*/
198
199 namespace Kokkos {
200
// Default-construct each memory space against the device of the default
// Cuda execution-space instance.
CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

// Host-pinned memory is not bound to a particular device; nothing to store.
CudaHostPinnedSpace::CudaHostPinnedSpace() {}
206
207 //==============================================================================
208 // <editor-fold desc="allocate()"> {{{1
209
allocate(const size_t arg_alloc_size) const210 void *CudaSpace::allocate(const size_t arg_alloc_size) const {
211 return allocate("[unlabeled]", arg_alloc_size);
212 }
213
allocate(const char * arg_label,const size_t arg_alloc_size,const size_t arg_logical_size) const214 void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
215 const size_t arg_logical_size) const {
216 return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
217 }
// Allocate `arg_alloc_size` bytes of device memory via cudaMalloc.
// `arg_logical_size`, when nonzero, is the size reported to profiling tools
// instead of the actual allocation size.  Throws
// Experimental::CudaRawMemoryAllocationFailure on failure.
void *CudaSpace::impl_allocate(
    const char *arg_label, const size_t arg_alloc_size,
    const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  void *ptr = nullptr;

  auto error_code = cudaMalloc(&ptr, arg_alloc_size);
  if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
    cudaGetLastError();  // This is the only way to clear the last error, which
                         // we should do here since we're turning it into an
                         // exception here
    throw Experimental::CudaRawMemoryAllocationFailure(
        arg_alloc_size, error_code,
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaMalloc);
  }

  // Report the allocation to any loaded profiling/tools library.
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }
  return ptr;
}
242
allocate(const size_t arg_alloc_size) const243 void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
244 return allocate("[unlabeled]", arg_alloc_size);
245 }
allocate(const char * arg_label,const size_t arg_alloc_size,const size_t arg_logical_size) const246 void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
247 const size_t arg_logical_size) const {
248 return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
249 }
impl_allocate(const char * arg_label,const size_t arg_alloc_size,const size_t arg_logical_size,const Kokkos::Tools::SpaceHandle arg_handle) const250 void *CudaUVMSpace::impl_allocate(
251 const char *arg_label, const size_t arg_alloc_size,
252 const size_t arg_logical_size,
253 const Kokkos::Tools::SpaceHandle arg_handle) const {
254 void *ptr = nullptr;
255
256 Cuda::impl_static_fence();
257 if (arg_alloc_size > 0) {
258 Kokkos::Impl::num_uvm_allocations++;
259
260 auto error_code =
261 cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
262
263 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
264 if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
265 cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
266 cudaCpuDeviceId);
267 #endif
268
269 if (error_code != cudaSuccess) { // TODO tag as unlikely branch
270 cudaGetLastError(); // This is the only way to clear the last error,
271 // which we should do here since we're turning it
272 // into an exception here
273 throw Experimental::CudaRawMemoryAllocationFailure(
274 arg_alloc_size, error_code,
275 Experimental::RawMemoryAllocationFailure::AllocationMechanism::
276 CudaMallocManaged);
277 }
278 }
279 Cuda::impl_static_fence();
280 if (Kokkos::Profiling::profileLibraryLoaded()) {
281 const size_t reported_size =
282 (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
283 Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
284 }
285 return ptr;
286 }
allocate(const size_t arg_alloc_size) const287 void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
288 return allocate("[unlabeled]", arg_alloc_size);
289 }
allocate(const char * arg_label,const size_t arg_alloc_size,const size_t arg_logical_size) const290 void *CudaHostPinnedSpace::allocate(const char *arg_label,
291 const size_t arg_alloc_size,
292 const size_t arg_logical_size) const {
293 return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
294 }
// Allocate `arg_alloc_size` bytes of page-locked (pinned) host memory via
// cudaHostAlloc.  `arg_logical_size`, when nonzero, is the size reported to
// profiling tools.  Throws Experimental::CudaRawMemoryAllocationFailure on
// failure.
void *CudaHostPinnedSpace::impl_allocate(
    const char *arg_label, const size_t arg_alloc_size,
    const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  void *ptr = nullptr;

  auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
  if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
    cudaGetLastError();  // This is the only way to clear the last error, which
                         // we should do here since we're turning it into an
                         // exception here
    throw Experimental::CudaRawMemoryAllocationFailure(
        arg_alloc_size, error_code,
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaHostAlloc);
  }
  // Report the allocation to any loaded profiling/tools library.
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }
  return ptr;
}
318
319 // </editor-fold> end allocate() }}}1
320 //==============================================================================
deallocate(void * const arg_alloc_ptr,const size_t arg_alloc_size) const321 void CudaSpace::deallocate(void *const arg_alloc_ptr,
322 const size_t arg_alloc_size) const {
323 deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
324 }
deallocate(const char * arg_label,void * const arg_alloc_ptr,const size_t arg_alloc_size,const size_t arg_logical_size) const325 void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
326 const size_t arg_alloc_size,
327 const size_t arg_logical_size) const {
328 impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
329 }
// Release device memory previously obtained from impl_allocate, reporting
// the deallocation to any loaded profiling library first.
void CudaSpace::impl_deallocate(
    const char *arg_label, void *const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }

  // Deliberately swallow any exception thrown by CUDA_SAFE_CALL: deallocate
  // can run during program teardown (e.g. from static destructors) when the
  // CUDA context may already be gone, and throwing there would terminate.
  try {
    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
  } catch (...) {
  }
}
deallocate(void * const arg_alloc_ptr,const size_t arg_alloc_size) const346 void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
347 const size_t arg_alloc_size) const {
348 deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
349 }
350
deallocate(const char * arg_label,void * const arg_alloc_ptr,const size_t arg_alloc_size,const size_t arg_logical_size) const351 void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
352 const size_t arg_alloc_size
353
354 ,
355 const size_t arg_logical_size) const {
356 impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
357 }
// Release a UVM allocation.  Fences before the free so no in-flight kernel
// can still be touching the managed memory, and fences again afterwards,
// mirroring the fences in impl_allocate.
void CudaUVMSpace::impl_deallocate(
    const char *arg_label, void *const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  Cuda::impl_static_fence();
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  // Deliberately swallow any exception from CUDA_SAFE_CALL: deallocate can
  // run during program teardown when the CUDA context may already be gone.
  // The live-allocation counter is only decremented for non-null pointers,
  // matching the increment in impl_allocate (which skips size-0 requests).
  try {
    if (arg_alloc_ptr != nullptr) {
      Kokkos::Impl::num_uvm_allocations--;
      CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
    }
  } catch (...) {
  }
  Cuda::impl_static_fence();
}
381
deallocate(void * const arg_alloc_ptr,const size_t arg_alloc_size) const382 void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
383 const size_t arg_alloc_size) const {
384 deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
385 }
deallocate(const char * arg_label,void * const arg_alloc_ptr,const size_t arg_alloc_size,const size_t arg_logical_size) const386 void CudaHostPinnedSpace::deallocate(const char *arg_label,
387 void *const arg_alloc_ptr,
388 const size_t arg_alloc_size,
389 const size_t arg_logical_size) const {
390 impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
391 }
392
// Release pinned host memory previously obtained from impl_allocate,
// reporting the deallocation to any loaded profiling library first.
void CudaHostPinnedSpace::impl_deallocate(
    const char *arg_label, void *const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  // Deliberately swallow any exception from CUDA_SAFE_CALL: deallocate can
  // run during program teardown when the CUDA context may already be gone.
  try {
    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
  } catch (...) {
  }
}
408
409 } // namespace Kokkos
410
411 //----------------------------------------------------------------------------
412 //----------------------------------------------------------------------------
413
414 namespace Kokkos {
415 namespace Impl {
416
#ifdef KOKKOS_ENABLE_DEBUG
// Debug-only root records: each anchors the linked list of live
// SharedAllocationRecords for its memory space (passed to the base-class
// constructor below when KOKKOS_ENABLE_DEBUG is defined).
SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record;

SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record;

SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::s_root_record;
#endif
427
// Create a 1D linear-memory texture object aliasing the allocation
// [alloc_ptr, alloc_ptr + alloc_size).  `sizeof_alias` selects the channel
// element type: 4 -> int, 8 -> int2, otherwise (assumed 16) -> int4.
// Prints to stderr and throws when alloc_ptr is null or the allocation
// exceeds the 2^27-element 1D linear texture bound.
// NOTE(review): the returned object is not destroyed here with
// cudaDestroyTextureObject; presumably the owning record releases it --
// confirm elsewhere in the class.
::cudaTextureObject_t
SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
    const unsigned sizeof_alias, void *const alloc_ptr,
    size_t const alloc_size) {
  // Hardware limit on the number of elements in a 1D linear texture.
  enum { TEXTURE_BOUND_1D = 1u << 27 };

  if ((alloc_ptr == nullptr) ||
      (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
    std::ostringstream msg;
    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
        << " alloc_ptr(" << alloc_ptr << ")"
        << " alloc_size(" << alloc_size << ")"
        << " max_size(" << (sizeof_alias * TEXTURE_BOUND_1D) << ")";
    std::cerr << msg.str() << std::endl;
    std::cerr.flush();
    Kokkos::Impl::throw_runtime_exception(msg.str());
  }

  ::cudaTextureObject_t tex_obj;

  struct cudaResourceDesc resDesc;
  struct cudaTextureDesc texDesc;

  // Zero-initialize both descriptors: every field not set explicitly below
  // must be 0 for cudaCreateTextureObject.
  memset(&resDesc, 0, sizeof(resDesc));
  memset(&texDesc, 0, sizeof(texDesc));

  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.desc =
      (sizeof_alias == 4
           ? cudaCreateChannelDesc<int>()
           : (sizeof_alias == 8
                  ? cudaCreateChannelDesc< ::int2>()
                  :
                  /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4>()));
  resDesc.res.linear.sizeInBytes = alloc_size;
  resDesc.res.linear.devPtr = alloc_ptr;

  CUDA_SAFE_CALL(
      cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));

  return tex_obj;
}
470
471 //==============================================================================
472 // <editor-fold desc="SharedAllocationRecord destructors"> {{{1
473
// Destructor: recover the allocation label (only when a profiling library
// needs it), then release the allocation including its prepended header.
SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() {
  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    // CudaSpace headers live in device memory; copy back to the host to
    // read the label.  NOTE(review): the <dst,src> template arguments look
    // swapped relative to the actual direction, but the DeepCopy above uses
    // cudaMemcpyDefault, which infers direction from the pointers.
    SharedAllocationHeader header;
    Kokkos::Impl::DeepCopy<Kokkos::CudaSpace, HostSpace>(
        &header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader));
    label = header.label();
  }
  auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
  // The logical (user-visible) size excludes the header.
  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
                     alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
}
486
// Destructor: UVM headers are host-accessible, so the label can be read
// directly; then release the allocation including its prepended header.
SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() {
  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    label = RecordBase::m_alloc_ptr->m_label;
  }
  // The logical (user-visible) size excludes the header.
  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
                     SharedAllocationRecord<void, void>::m_alloc_size,
                     (SharedAllocationRecord<void, void>::m_alloc_size -
                      sizeof(SharedAllocationHeader)));
}
497
// Destructor: pinned-host headers are directly accessible; release the
// allocation including its prepended header.
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                       void>::~SharedAllocationRecord() {
  m_space.deallocate(RecordBase::m_alloc_ptr->m_label,
                     SharedAllocationRecord<void, void>::m_alloc_ptr,
                     SharedAllocationRecord<void, void>::m_alloc_size,
                     (SharedAllocationRecord<void, void>::m_alloc_size -
                      sizeof(SharedAllocationHeader)));
}
506
507 // </editor-fold> end SharedAllocationRecord destructors }}}1
508 //==============================================================================
509
510 //==============================================================================
511 // <editor-fold desc="SharedAllocationRecord constructors"> {{{1
512
// Construct a record that owns [ SharedAllocationHeader | user memory ] in
// CudaSpace.  The header (holding the label) must be deep-copied into device
// memory because CudaSpace is not host-accessible.
SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
    const Kokkos::CudaSpace &arg_space, const std::string &arg_label,
    const size_t arg_alloc_size,
    const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {

  // Fill a host-side copy of the header, then push it to the device.
  SharedAllocationHeader header;

  this->base_t::_fill_host_accessible_header_info(header, arg_label);

  // Copy to device memory
  Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(RecordBase::m_alloc_ptr, &header,
                                               sizeof(SharedAllocationHeader));
}
537
// Construct a record that owns [ SharedAllocationHeader | user memory ] in
// UVM.  The header is host-accessible, so it is filled in place -- no deep
// copy is needed (contrast with the CudaSpace constructor above).
SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
    const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label,
    const size_t arg_alloc_size,
    const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {
  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                  arg_label);
}
556
// Construct a record that owns [ SharedAllocationHeader | user memory ] in
// pinned host memory.  The header is directly accessible and filled in place.
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
    SharedAllocationRecord(
        const Kokkos::CudaHostPinnedSpace &arg_space,
        const std::string &arg_label, const size_t arg_alloc_size,
        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                                  void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_space(arg_space) {
  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                  arg_label);
}
576
577 // </editor-fold> end SharedAllocationRecord constructors }}}1
578 //==============================================================================
579
// Prefetch a managed allocation toward the device of `space`, enqueued on
// that instance's stream.  Silently no-ops for null/empty ranges, for memory
// that is not managed, for host-direction requests, and on devices without
// concurrentManagedAccess.
void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                           bool to_device) {
  if ((ptr == nullptr) || (bytes == 0)) return;
  cudaPointerAttributes attr;
  CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
  // I measured this and it turns out prefetching towards the host slows
  // DualView syncs down. Probably because the latency is not too bad in the
  // first place for the pull down. If we want to change that provide
  // cudaCpuDeviceId as the device if to_device is false
#if CUDA_VERSION < 10000
  // CUDA < 10 used the isManaged flag; newer versions report a memory type.
  bool is_managed = attr.isManaged;
#else
  bool is_managed = attr.type == cudaMemoryTypeManaged;
#endif
  if (to_device && is_managed &&
      space.cuda_device_prop().concurrentManagedAccess) {
    CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
                                        space.cuda_stream()));
  }
}
600
601 } // namespace Impl
602 } // namespace Kokkos
603
604 //==============================================================================
605 // <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
606
607 #include <impl/Kokkos_SharedAlloc_timpl.hpp>
608
609 namespace Kokkos {
610 namespace Impl {
611
612 // To avoid additional compilation cost for something that's (mostly?) not
613 // performance sensitive, we explicity instantiate these CRTP base classes here,
614 // where we have access to the associated *_timpl.hpp header files.
615 template class SharedAllocationRecordCommon<Kokkos::CudaSpace>;
616 template class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
617 template class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
618 template class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
619
620 } // end namespace Impl
621 } // end namespace Kokkos
622
623 // </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
624 //==============================================================================
625
626 #else
// Dummy symbol so this translation unit is never empty when CUDA is disabled
// (some toolchains warn or error on empty object files).
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
628 #endif // KOKKOS_ENABLE_CUDA
629