1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #include <Kokkos_Macros.hpp>
46 #ifdef KOKKOS_ENABLE_CUDA
47 
48 #include <Kokkos_Core.hpp>
49 #include <Kokkos_Cuda.hpp>
50 #include <Kokkos_CudaSpace.hpp>
51 
52 #include <cstdlib>
53 #include <iostream>
54 #include <sstream>
55 #include <stdexcept>
56 #include <algorithm>
57 #include <atomic>
58 
59 //#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
60 #include <impl/Kokkos_Error.hpp>
61 #include <impl/Kokkos_MemorySpace.hpp>
62 
63 #include <impl/Kokkos_Tools.hpp>
64 
65 /*--------------------------------------------------------------------------*/
66 /*--------------------------------------------------------------------------*/
67 
cuda_get_deep_copy_stream()68 cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
69   static cudaStream_t s = nullptr;
70   if (s == nullptr) {
71     cudaStreamCreate(&s);
72   }
73   return s;
74 }
75 
cuda_get_deep_copy_space(bool initialize)76 const std::unique_ptr<Kokkos::Cuda> &Kokkos::Impl::cuda_get_deep_copy_space(
77     bool initialize) {
78   static std::unique_ptr<Cuda> space = nullptr;
79   if (!space && initialize)
80     space = std::make_unique<Cuda>(Kokkos::Impl::cuda_get_deep_copy_stream());
81   return space;
82 }
83 
84 namespace Kokkos {
85 namespace Impl {
86 
namespace {

// Count of live CudaUVMSpace allocations: incremented in
// CudaUVMSpace::impl_allocate, decremented in impl_deallocate, and
// reported by CudaUVMSpace::number_of_allocations().
static std::atomic<int> num_uvm_allocations(0);

}  // namespace
92 
// Blocking device-to-device copy; cudaMemcpyDefault lets the driver infer
// the transfer direction from the pointer values.
DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}
97 
// Blocking device-to-host copy; direction is inferred via cudaMemcpyDefault.
DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}
102 
// Blocking host-to-device copy; direction is inferred via cudaMemcpyDefault.
DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}
107 
// Asynchronous device-to-device copy enqueued on the given execution-space
// instance's stream; the caller is responsible for any needed fence.
DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}
113 
// Asynchronous device-to-host copy on the instance's stream. NOTE(review):
// cudaMemcpyAsync is only truly asynchronous when the host side is pinned
// memory - with pageable memory the call may block.
DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}
119 
// Asynchronous host-to-device copy enqueued on the given instance's stream.
DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}
125 
DeepCopyAsyncCuda(void * dst,const void * src,size_t n)126 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
127   cudaStream_t s = cuda_get_deep_copy_stream();
128   CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
129   cudaStreamSynchronize(s);
130 }
131 
132 }  // namespace Impl
133 }  // namespace Kokkos
134 
135 /*--------------------------------------------------------------------------*/
136 /*--------------------------------------------------------------------------*/
137 
138 namespace Kokkos {
139 
// Deprecated: raise a runtime exception reporting an attempted execution of
// a Cuda-only function from a non-Cuda memory/execution space.
KOKKOS_DEPRECATED void CudaSpace::access_error() {
  const std::string msg(
      "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
      "non-Cuda space");
  Kokkos::Impl::throw_runtime_exception(msg);
}
146 
// Deprecated: pointer-taking overload of access_error(); the pointer is
// ignored and the same diagnostic is thrown.
KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
  const std::string msg(
      "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
      "non-Cuda space");
  Kokkos::Impl::throw_runtime_exception(msg);
}
153 
154 /*--------------------------------------------------------------------------*/
155 
available()156 bool CudaUVMSpace::available() {
157 #if defined(CUDA_VERSION) && !defined(__APPLE__)
158   enum : bool { UVM_available = true };
159 #else
160   enum : bool { UVM_available = false };
161 #endif
162   return UVM_available;
163 }
164 
165 /*--------------------------------------------------------------------------*/
166 
// Number of currently-live UVM allocations, as tracked by the atomic
// counter maintained in impl_allocate / impl_deallocate.
int CudaUVMSpace::number_of_allocations() {
  return Kokkos::Impl::num_uvm_allocations.load();
}
#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
// The purpose of the following variable is to allow a state-based choice
// for pinning UVM allocations to the CPU. For now this is considered
// an experimental debugging capability - with the potential to work around
// some CUDA issues.
bool CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = false;

// Query whether subsequent UVM allocations will be advised to prefer CPU
// residency (consulted by CudaUVMSpace::impl_allocate).
bool CudaUVMSpace::cuda_pin_uvm_to_host() {
  return CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v;
}
// Toggle the CPU-pinning advice for subsequent UVM allocations.
// NOTE(review): plain bool, no synchronization - assumed to be set during
// single-threaded setup.
void CudaUVMSpace::cuda_set_pin_uvm_to_host(bool val) {
  CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = val;
}
#endif
184 }  // namespace Kokkos
185 
#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
// Free-function shims exposing the experimental pin-UVM-to-host switch
// outside the Kokkos namespace.
bool kokkos_impl_cuda_pin_uvm_to_host() {
  return Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host();
}

void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) {
  Kokkos::CudaUVMSpace::cuda_set_pin_uvm_to_host(val);
}
#endif
195 
196 /*--------------------------------------------------------------------------*/
197 /*--------------------------------------------------------------------------*/
198 
199 namespace Kokkos {
200 
// Default-construct each memory space bound to the default Cuda execution
// space's device. CudaHostPinnedSpace carries no device id.
CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaHostPinnedSpace::CudaHostPinnedSpace() {}
206 
207 //==============================================================================
208 // <editor-fold desc="allocate()"> {{{1
209 
// Unlabeled allocation entry point; forwards with a placeholder label.
void *CudaSpace::allocate(const size_t arg_alloc_size) const {
  return allocate("[unlabeled]", arg_alloc_size);
}
213 
// Labeled allocation entry point; delegates to impl_allocate, which also
// notifies the profiling tools.
void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
                          const size_t arg_logical_size) const {
  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
}
// Allocate arg_alloc_size bytes of device-global memory with cudaMalloc and
// report the allocation to the profiling tools when a tool is loaded.
// Throws CudaRawMemoryAllocationFailure on failure.
void *CudaSpace::impl_allocate(
    const char *arg_label, const size_t arg_alloc_size,
    const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  void *ptr = nullptr;

  auto error_code = cudaMalloc(&ptr, arg_alloc_size);
  if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
    cudaGetLastError();  // This is the only way to clear the last error, which
                         // we should do here since we're turning it into an
                         // exception here
    throw Experimental::CudaRawMemoryAllocationFailure(
        arg_alloc_size, error_code,
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaMalloc);
  }

  if (Kokkos::Profiling::profileLibraryLoaded()) {
    // Report the logical (user-requested) size to tools when available,
    // otherwise the raw allocation size.
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }
  return ptr;
}
242 
// Unlabeled UVM allocation entry point; forwards with a placeholder label.
void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
  return allocate("[unlabeled]", arg_alloc_size);
}
// Labeled UVM allocation entry point; delegates to impl_allocate.
void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
                             const size_t arg_logical_size) const {
  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
}
impl_allocate(const char * arg_label,const size_t arg_alloc_size,const size_t arg_logical_size,const Kokkos::Tools::SpaceHandle arg_handle) const250 void *CudaUVMSpace::impl_allocate(
251     const char *arg_label, const size_t arg_alloc_size,
252     const size_t arg_logical_size,
253     const Kokkos::Tools::SpaceHandle arg_handle) const {
254   void *ptr = nullptr;
255 
256   Cuda::impl_static_fence();
257   if (arg_alloc_size > 0) {
258     Kokkos::Impl::num_uvm_allocations++;
259 
260     auto error_code =
261         cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
262 
263 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
264     if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
265       cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
266                     cudaCpuDeviceId);
267 #endif
268 
269     if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
270       cudaGetLastError();  // This is the only way to clear the last error,
271                            // which we should do here since we're turning it
272                            // into an exception here
273       throw Experimental::CudaRawMemoryAllocationFailure(
274           arg_alloc_size, error_code,
275           Experimental::RawMemoryAllocationFailure::AllocationMechanism::
276               CudaMallocManaged);
277     }
278   }
279   Cuda::impl_static_fence();
280   if (Kokkos::Profiling::profileLibraryLoaded()) {
281     const size_t reported_size =
282         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
283     Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
284   }
285   return ptr;
286 }
// Unlabeled pinned-host allocation entry point; forwards with a placeholder
// label.
void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
  return allocate("[unlabeled]", arg_alloc_size);
}
// Labeled pinned-host allocation entry point; delegates to impl_allocate.
void *CudaHostPinnedSpace::allocate(const char *arg_label,
                                    const size_t arg_alloc_size,
                                    const size_t arg_logical_size) const {
  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
}
// Allocate arg_alloc_size bytes of page-locked host memory with
// cudaHostAlloc and report to the profiling tools when a tool is loaded.
// Throws CudaRawMemoryAllocationFailure on failure.
void *CudaHostPinnedSpace::impl_allocate(
    const char *arg_label, const size_t arg_alloc_size,
    const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  void *ptr = nullptr;

  auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
  if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
    cudaGetLastError();  // This is the only way to clear the last error, which
                         // we should do here since we're turning it into an
                         // exception here
    throw Experimental::CudaRawMemoryAllocationFailure(
        arg_alloc_size, error_code,
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaHostAlloc);
  }
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    // Report the logical (user-requested) size to tools when available.
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }
  return ptr;
}
318 
319 // </editor-fold> end allocate() }}}1
320 //==============================================================================
// Unlabeled deallocation entry point; forwards with a placeholder label.
void CudaSpace::deallocate(void *const arg_alloc_ptr,
                           const size_t arg_alloc_size) const {
  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
}
// Labeled deallocation entry point; delegates to impl_deallocate.
void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
                           const size_t arg_alloc_size,
                           const size_t arg_logical_size) const {
  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
}
// Report the deallocation to profiling tools (if loaded) and release device
// memory. The cudaFree failure is deliberately swallowed: deallocation may
// run during static destruction / error unwinding where throwing is unsafe.
void CudaSpace::impl_deallocate(
    const char *arg_label, void *const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }

  try {
    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
  } catch (...) {
  }
}
// Unlabeled UVM deallocation entry point; forwards with a placeholder label.
void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
                              const size_t arg_alloc_size) const {
  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
}
350 
deallocate(const char * arg_label,void * const arg_alloc_ptr,const size_t arg_alloc_size,const size_t arg_logical_size) const351 void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
352                               const size_t arg_alloc_size
353 
354                               ,
355                               const size_t arg_logical_size) const {
356   impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
357 }
impl_deallocate(const char * arg_label,void * const arg_alloc_ptr,const size_t arg_alloc_size,const size_t arg_logical_size,const Kokkos::Tools::SpaceHandle arg_handle) const358 void CudaUVMSpace::impl_deallocate(
359     const char *arg_label, void *const arg_alloc_ptr,
360     const size_t arg_alloc_size
361 
362     ,
363     const size_t arg_logical_size,
364     const Kokkos::Tools::SpaceHandle arg_handle) const {
365   Cuda::impl_static_fence();
366   if (Kokkos::Profiling::profileLibraryLoaded()) {
367     const size_t reported_size =
368         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
369     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
370                                       reported_size);
371   }
372   try {
373     if (arg_alloc_ptr != nullptr) {
374       Kokkos::Impl::num_uvm_allocations--;
375       CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
376     }
377   } catch (...) {
378   }
379   Cuda::impl_static_fence();
380 }
381 
// Unlabeled pinned-host deallocation entry point; forwards with a
// placeholder label.
void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
                                     const size_t arg_alloc_size) const {
  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
}
// Labeled pinned-host deallocation entry point; delegates to
// impl_deallocate.
void CudaHostPinnedSpace::deallocate(const char *arg_label,
                                     void *const arg_alloc_ptr,
                                     const size_t arg_alloc_size,
                                     const size_t arg_logical_size) const {
  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
}
392 
// Report the deallocation to profiling tools (if loaded) and release the
// page-locked host memory. A cudaFreeHost failure is deliberately swallowed
// since this can run during shutdown where throwing is unsafe.
void CudaHostPinnedSpace::impl_deallocate(
    const char *arg_label, void *const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  try {
    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
  } catch (...) {
  }
}
408 
409 }  // namespace Kokkos
410 
411 //----------------------------------------------------------------------------
412 //----------------------------------------------------------------------------
413 
414 namespace Kokkos {
415 namespace Impl {
416 
#ifdef KOKKOS_ENABLE_DEBUG
// Debug-mode root nodes for the per-space doubly-linked lists of live
// SharedAllocationRecords (used for leak/consistency checking).
SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record;

SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record;

SharedAllocationRecord<void, void>
    SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::s_root_record;
#endif
427 
// Create a CUDA texture object aliasing [alloc_ptr, alloc_ptr + alloc_size)
// as a 1D linear-memory texture of int / int2 / int4 elements, selected by
// sizeof_alias (4, 8, or 16 bytes). Errors out (log + exception) when the
// pointer is null or the allocation exceeds the supported 1D extent.
::cudaTextureObject_t
SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
    const unsigned sizeof_alias, void *const alloc_ptr,
    size_t const alloc_size) {
  // 2^27 elements: the maximum 1D linear-texture extent assumed here.
  enum { TEXTURE_BOUND_1D = 1u << 27 };

  if ((alloc_ptr == nullptr) ||
      (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
    std::ostringstream msg;
    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
        << " alloc_ptr(" << alloc_ptr << ")"
        << " alloc_size(" << alloc_size << ")"
        << " max_size(" << (sizeof_alias * TEXTURE_BOUND_1D) << ")";
    std::cerr << msg.str() << std::endl;
    std::cerr.flush();
    Kokkos::Impl::throw_runtime_exception(msg.str());
  }

  ::cudaTextureObject_t tex_obj;

  struct cudaResourceDesc resDesc;
  struct cudaTextureDesc texDesc;

  // Descriptors must be zero-initialized before selective field assignment.
  memset(&resDesc, 0, sizeof(resDesc));
  memset(&texDesc, 0, sizeof(texDesc));

  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.desc =
      (sizeof_alias == 4
           ? cudaCreateChannelDesc<int>()
           : (sizeof_alias == 8
                  ? cudaCreateChannelDesc< ::int2>()
                  :
                  /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4>()));
  resDesc.res.linear.sizeInBytes = alloc_size;
  resDesc.res.linear.devPtr      = alloc_ptr;

  CUDA_SAFE_CALL(
      cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));

  return tex_obj;
}
470 
471 //==============================================================================
472 // <editor-fold desc="SharedAllocationRecord destructors"> {{{1
473 
// Destroy a CudaSpace record: recover the label for tools (the header lives
// in device memory and must be copied back to the host first), then free
// both header and user memory in one deallocation.
// NOTE(review): the DeepCopy template argument order <CudaSpace, HostSpace>
// suggests a device destination while dst is actually host memory; harmless
// here because every DeepCopy overload above uses cudaMemcpyDefault, which
// infers direction from the pointers - but confirm intent.
SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() {
  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    SharedAllocationHeader header;
    Kokkos::Impl::DeepCopy<Kokkos::CudaSpace, HostSpace>(
        &header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader));
    label = header.label();
  }
  auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
                     alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
}
486 
// Destroy a CudaUVMSpace record. UVM headers are host-accessible, so the
// label can be read directly without a device-to-host copy. The logical
// size passed to deallocate excludes the header.
SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() {
  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    label = RecordBase::m_alloc_ptr->m_label;
  }
  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
                     SharedAllocationRecord<void, void>::m_alloc_size,
                     (SharedAllocationRecord<void, void>::m_alloc_size -
                      sizeof(SharedAllocationHeader)));
}
497 
// Destroy a CudaHostPinnedSpace record. The header is host memory, so the
// label is read directly; the logical size excludes the header.
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                       void>::~SharedAllocationRecord() {
  m_space.deallocate(RecordBase::m_alloc_ptr->m_label,
                     SharedAllocationRecord<void, void>::m_alloc_ptr,
                     SharedAllocationRecord<void, void>::m_alloc_size,
                     (SharedAllocationRecord<void, void>::m_alloc_size -
                      sizeof(SharedAllocationHeader)));
}
506 
507 // </editor-fold> end SharedAllocationRecord destructors }}}1
508 //==============================================================================
509 
510 //==============================================================================
511 // <editor-fold desc="SharedAllocationRecord constructors"> {{{1
512 
// Construct a CudaSpace record: allocate header + user memory on the
// device, then fill a header on the host and deep-copy it into the
// device-resident header slot (device headers are not host-writable).
SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
    const Kokkos::CudaSpace &arg_space, const std::string &arg_label,
    const size_t arg_alloc_size,
    const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {

  SharedAllocationHeader header;

  this->base_t::_fill_host_accessible_header_info(header, arg_label);

  // Copy to device memory
  Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(RecordBase::m_alloc_ptr, &header,
                                               sizeof(SharedAllocationHeader));
}
537 
// Construct a CudaUVMSpace record. UVM headers are host-accessible, so the
// header is filled in place - no device copy needed.
SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
    const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label,
    const size_t arg_alloc_size,
    const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {
  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                  arg_label);
}
556 
// Construct a CudaHostPinnedSpace record. Pinned memory is host memory, so
// the header is filled in place.
SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
    SharedAllocationRecord(
        const Kokkos::CudaHostPinnedSpace &arg_space,
        const std::string &arg_label, const size_t arg_alloc_size,
        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
    // Pass through allocated [ SharedAllocationHeader , user_memory ]
    // Pass through deallocation function
    : base_t(
#ifdef KOKKOS_ENABLE_DEBUG
          &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                                  void>::s_root_record,
#endif
          Impl::checked_allocation_with_header(arg_space, arg_label,
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_space(arg_space) {
  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
                                                  arg_label);
}
576 
577 // </editor-fold> end SharedAllocationRecord constructors }}}1
578 //==============================================================================
579 
// Asynchronously prefetch a managed allocation toward the device on the
// given execution space's stream. No-op for null/empty ranges, non-managed
// memory, or devices without concurrent managed access; host-directed
// prefetches are intentionally not issued (see comment below).
void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                           bool to_device) {
  if ((ptr == nullptr) || (bytes == 0)) return;
  cudaPointerAttributes attr;
  CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
  // I measured this and it turns out prefetching towards the host slows
  // DualView syncs down. Probably because the latency is not too bad in the
  // first place for the pull down. If we want to change that provide
  // cudaCpuDeviceId as the device if to_device is false
#if CUDA_VERSION < 10000
  bool is_managed = attr.isManaged;
#else
  // CUDA 10+ replaced the isManaged field with a memory-type enum.
  bool is_managed = attr.type == cudaMemoryTypeManaged;
#endif
  if (to_device && is_managed &&
      space.cuda_device_prop().concurrentManagedAccess) {
    CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
                                        space.cuda_stream()));
  }
}
600 
601 }  // namespace Impl
602 }  // namespace Kokkos
603 
604 //==============================================================================
605 // <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
606 
607 #include <impl/Kokkos_SharedAlloc_timpl.hpp>
608 
609 namespace Kokkos {
610 namespace Impl {
611 
// To avoid additional compilation cost for something that's (mostly?) not
// performance sensitive, we explicitly instantiate these CRTP base classes
// here, where we have access to the associated *_timpl.hpp header files.
615 template class SharedAllocationRecordCommon<Kokkos::CudaSpace>;
616 template class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
617 template class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
618 template class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
619 
620 }  // end namespace Impl
621 }  // end namespace Kokkos
622 
623 // </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
624 //==============================================================================
625 
626 #else
KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR()627 void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
628 #endif  // KOKKOS_ENABLE_CUDA
629