1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "arrow/gpu/cuda_context.h"
19 
20 #include <atomic>
21 #include <cstdint>
22 #include <memory>
23 #include <mutex>
24 #include <sstream>
25 #include <string>
26 #include <utility>
27 #include <vector>
28 
29 #include <cuda.h>
30 
31 #include "arrow/gpu/cuda_internal.h"
32 #include "arrow/gpu/cuda_memory.h"
33 #include "arrow/util/checked_cast.h"
34 
35 namespace arrow {
36 
37 using internal::checked_cast;
38 using internal::checked_pointer_cast;
39 
40 namespace cuda {
41 
42 using internal::ContextSaver;
43 
44 namespace {
45 
46 struct DeviceProperties {
47   int device_number_;
48   CUdevice handle_;
49   int64_t total_memory_;
50   std::string name_;
51 
Initarrow::cuda::__anonc8e1da650111::DeviceProperties52   Status Init(int device_number) {
53     device_number_ = device_number;
54     CU_RETURN_NOT_OK("cuDeviceGet", cuDeviceGet(&handle_, device_number));
55     size_t total_memory = 0;
56     CU_RETURN_NOT_OK("cuDeviceTotalMem", cuDeviceTotalMem(&total_memory, handle_));
57     total_memory_ = total_memory;
58 
59     char buf[200];
60     CU_RETURN_NOT_OK("cuDeviceGetName", cuDeviceGetName(buf, sizeof(buf), device_number));
61     name_.assign(buf);
62     return Status::OK();
63   }
64 };
65 
// Type name returned by CudaDevice::type_name(). IsCudaDevice() recognizes
// CUDA devices by comparing against the address of this array.
constexpr char kCudaDeviceTypeName[] = "arrow::cuda::CudaDevice";
67 
68 }  // namespace
69 
// Private state of a CudaDevice: the properties queried from the driver
// for that device.
struct CudaDevice::Impl {
  DeviceProperties props;
};
73 
74 // ----------------------------------------------------------------------
75 // CudaContext implementation
76 
77 class CudaContext::Impl {
78  public:
Impl()79   Impl() : bytes_allocated_(0) {}
80 
Init(const std::shared_ptr<CudaDevice> & device)81   Status Init(const std::shared_ptr<CudaDevice>& device) {
82     mm_ = checked_pointer_cast<CudaMemoryManager>(device->default_memory_manager());
83     props_ = &device->impl_->props;
84     own_context_ = true;
85     CU_RETURN_NOT_OK("cuDevicePrimaryCtxRetain",
86                      cuDevicePrimaryCtxRetain(&context_, props_->handle_));
87     is_open_ = true;
88     return Status::OK();
89   }
90 
InitShared(const std::shared_ptr<CudaDevice> & device,CUcontext ctx)91   Status InitShared(const std::shared_ptr<CudaDevice>& device, CUcontext ctx) {
92     mm_ = checked_pointer_cast<CudaMemoryManager>(device->default_memory_manager());
93     props_ = &device->impl_->props;
94     own_context_ = false;
95     context_ = ctx;
96     is_open_ = true;
97     return Status::OK();
98   }
99 
Close()100   Status Close() {
101     if (is_open_ && own_context_) {
102       CU_RETURN_NOT_OK("cuDevicePrimaryCtxRelease",
103                        cuDevicePrimaryCtxRelease(props_->handle_));
104     }
105     is_open_ = false;
106     return Status::OK();
107   }
108 
bytes_allocated() const109   int64_t bytes_allocated() const { return bytes_allocated_.load(); }
110 
Allocate(int64_t nbytes,uint8_t ** out)111   Status Allocate(int64_t nbytes, uint8_t** out) {
112     if (nbytes > 0) {
113       ContextSaver set_temporary(context_);
114       CUdeviceptr data;
115       CU_RETURN_NOT_OK("cuMemAlloc", cuMemAlloc(&data, static_cast<size_t>(nbytes)));
116       bytes_allocated_ += nbytes;
117       *out = reinterpret_cast<uint8_t*>(data);
118     } else {
119       *out = nullptr;
120     }
121     return Status::OK();
122   }
123 
CopyHostToDevice(uintptr_t dst,const void * src,int64_t nbytes)124   Status CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes) {
125     ContextSaver set_temporary(context_);
126     CU_RETURN_NOT_OK("cuMemcpyHtoD", cuMemcpyHtoD(dst, src, static_cast<size_t>(nbytes)));
127     return Status::OK();
128   }
129 
CopyDeviceToHost(void * dst,uintptr_t src,int64_t nbytes)130   Status CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes) {
131     ContextSaver set_temporary(context_);
132     CU_RETURN_NOT_OK("cuMemcpyDtoH", cuMemcpyDtoH(dst, src, static_cast<size_t>(nbytes)));
133     return Status::OK();
134   }
135 
CopyDeviceToDevice(uintptr_t dst,uintptr_t src,int64_t nbytes)136   Status CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes) {
137     ContextSaver set_temporary(context_);
138     CU_RETURN_NOT_OK("cuMemcpyDtoD", cuMemcpyDtoD(dst, src, static_cast<size_t>(nbytes)));
139     return Status::OK();
140   }
141 
CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext> & dst_ctx,uintptr_t dst,uintptr_t src,int64_t nbytes)142   Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
143                                    uintptr_t dst, uintptr_t src, int64_t nbytes) {
144     ContextSaver set_temporary(context_);
145     CU_RETURN_NOT_OK("cuMemcpyPeer",
146                      cuMemcpyPeer(dst, reinterpret_cast<CUcontext>(dst_ctx->handle()),
147                                   src, context_, static_cast<size_t>(nbytes)));
148     return Status::OK();
149   }
150 
Synchronize(void)151   Status Synchronize(void) {
152     ContextSaver set_temporary(context_);
153     CU_RETURN_NOT_OK("cuCtxSynchronize", cuCtxSynchronize());
154     return Status::OK();
155   }
156 
Free(void * device_ptr,int64_t nbytes)157   Status Free(void* device_ptr, int64_t nbytes) {
158     CU_RETURN_NOT_OK("cuMemFree", cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
159     bytes_allocated_ -= nbytes;
160     return Status::OK();
161   }
162 
ExportIpcBuffer(void * data,int64_t size)163   Result<std::shared_ptr<CudaIpcMemHandle>> ExportIpcBuffer(void* data, int64_t size) {
164     CUipcMemHandle cu_handle;
165     if (size > 0) {
166       ContextSaver set_temporary(context_);
167       CU_RETURN_NOT_OK(
168           "cuIpcGetMemHandle",
169           cuIpcGetMemHandle(&cu_handle, reinterpret_cast<CUdeviceptr>(data)));
170     }
171     return std::shared_ptr<CudaIpcMemHandle>(new CudaIpcMemHandle(size, &cu_handle));
172   }
173 
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle,uint8_t ** out)174   Status OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, uint8_t** out) {
175     int64_t size = ipc_handle.memory_size();
176     if (size > 0) {
177       auto handle = reinterpret_cast<const CUipcMemHandle*>(ipc_handle.handle());
178       CUdeviceptr data;
179       CU_RETURN_NOT_OK(
180           "cuIpcOpenMemHandle",
181           cuIpcOpenMemHandle(&data, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS));
182       *out = reinterpret_cast<uint8_t*>(data);
183     } else {
184       *out = nullptr;
185     }
186     return Status::OK();
187   }
188 
device() const189   std::shared_ptr<CudaDevice> device() const {
190     return checked_pointer_cast<CudaDevice>(mm_->device());
191   }
192 
memory_manager() const193   const std::shared_ptr<CudaMemoryManager>& memory_manager() const { return mm_; }
194 
context_handle() const195   void* context_handle() const { return reinterpret_cast<void*>(context_); }
196 
197  private:
198   std::shared_ptr<CudaMemoryManager> mm_;
199   const DeviceProperties* props_;
200   CUcontext context_;
201   bool is_open_;
202 
203   // So that we can utilize a CUcontext that was created outside this library
204   bool own_context_;
205 
206   std::atomic<int64_t> bytes_allocated_;
207 };
208 
209 // ----------------------------------------------------------------------
210 // CudaDevice implementation
211 
// Construct a device whose private state is moved out of `impl`.
CudaDevice::CudaDevice(Impl impl) : impl_(new Impl(std::move(impl))) {}
213 
// Return the type-name string shared by all CUDA devices
// (kCudaDeviceTypeName).
const char* CudaDevice::type_name() const { return kCudaDeviceTypeName; }
215 
ToString() const216 std::string CudaDevice::ToString() const {
217   std::stringstream ss;
218   ss << "CudaDevice(device_number=" << device_number() << ", name=\"" << device_name()
219      << "\")";
220   return ss.str();
221 }
222 
Equals(const Device & other) const223 bool CudaDevice::Equals(const Device& other) const {
224   if (!IsCudaDevice(other)) {
225     return false;
226   }
227   return checked_cast<const CudaDevice&>(other).device_number() == device_number();
228 }
229 
// Return the device number recorded in this device's properties.
int CudaDevice::device_number() const { return impl_->props.device_number_; }
231 
// Return a copy of the device name recorded in this device's properties.
std::string CudaDevice::device_name() const { return impl_->props.name_; }
233 
// Return the total memory size recorded in this device's properties.
int64_t CudaDevice::total_memory() const { return impl_->props.total_memory_; }
235 
// Return the driver device handle recorded in this device's properties,
// as an int.
int CudaDevice::handle() const { return impl_->props.handle_; }
237 
Make(int device_number)238 Result<std::shared_ptr<CudaDevice>> CudaDevice::Make(int device_number) {
239   ARROW_ASSIGN_OR_RAISE(auto manager, CudaDeviceManager::Instance());
240   return manager->GetDevice(device_number);
241 }
242 
default_memory_manager()243 std::shared_ptr<MemoryManager> CudaDevice::default_memory_manager() {
244   return CudaMemoryManager::Make(shared_from_this());
245 }
246 
GetContext()247 Result<std::shared_ptr<CudaContext>> CudaDevice::GetContext() {
248   // XXX should we cache a default context in CudaDevice instance?
249   auto context = std::shared_ptr<CudaContext>(new CudaContext());
250   auto self = checked_pointer_cast<CudaDevice>(shared_from_this());
251   RETURN_NOT_OK(context->impl_->Init(self));
252   return context;
253 }
254 
GetSharedContext(void * handle)255 Result<std::shared_ptr<CudaContext>> CudaDevice::GetSharedContext(void* handle) {
256   auto context = std::shared_ptr<CudaContext>(new CudaContext());
257   auto self = checked_pointer_cast<CudaDevice>(shared_from_this());
258   RETURN_NOT_OK(context->impl_->InitShared(self, reinterpret_cast<CUcontext>(handle)));
259   return context;
260 }
261 
AllocateHostBuffer(int64_t size)262 Result<std::shared_ptr<CudaHostBuffer>> CudaDevice::AllocateHostBuffer(int64_t size) {
263   ARROW_ASSIGN_OR_RAISE(auto context, GetContext());
264   ContextSaver set_temporary(*context);
265   void* ptr;
266   CU_RETURN_NOT_OK("cuMemHostAlloc", cuMemHostAlloc(&ptr, static_cast<size_t>(size),
267                                                     CU_MEMHOSTALLOC_PORTABLE));
268   return std::make_shared<CudaHostBuffer>(reinterpret_cast<uint8_t*>(ptr), size);
269 }
270 
IsCudaDevice(const Device & device)271 bool IsCudaDevice(const Device& device) {
272   return device.type_name() == kCudaDeviceTypeName;
273 }
274 
AsCudaDevice(const std::shared_ptr<Device> & device)275 Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device) {
276   if (IsCudaDevice(*device)) {
277     return checked_pointer_cast<CudaDevice>(device);
278   } else {
279     return Status::TypeError("Device is not a Cuda device: ", device->ToString());
280   }
281 }
282 
283 // ----------------------------------------------------------------------
284 // CudaMemoryManager implementation
285 
Make(const std::shared_ptr<Device> & device)286 std::shared_ptr<CudaMemoryManager> CudaMemoryManager::Make(
287     const std::shared_ptr<Device>& device) {
288   return std::shared_ptr<CudaMemoryManager>(new CudaMemoryManager(device));
289 }
290 
// Return this manager's device, downcast to CudaDevice.
std::shared_ptr<CudaDevice> CudaMemoryManager::cuda_device() const {
  return checked_pointer_cast<CudaDevice>(device_);
}
294 
GetBufferReader(std::shared_ptr<Buffer> buf)295 Result<std::shared_ptr<io::RandomAccessFile>> CudaMemoryManager::GetBufferReader(
296     std::shared_ptr<Buffer> buf) {
297   if (*buf->device() != *device_) {
298     return Status::Invalid(
299         "CudaMemoryManager::GetBufferReader called on foreign buffer "
300         "for device ",
301         buf->device()->ToString());
302   }
303   return std::make_shared<CudaBufferReader>(checked_pointer_cast<CudaBuffer>(buf));
304 }
305 
GetBufferWriter(std::shared_ptr<Buffer> buf)306 Result<std::shared_ptr<io::OutputStream>> CudaMemoryManager::GetBufferWriter(
307     std::shared_ptr<Buffer> buf) {
308   if (*buf->device() != *device_) {
309     return Status::Invalid(
310         "CudaMemoryManager::GetBufferReader called on foreign buffer "
311         "for device ",
312         buf->device()->ToString());
313   }
314   ARROW_ASSIGN_OR_RAISE(auto cuda_buf, CudaBuffer::FromBuffer(buf));
315   auto writer = std::make_shared<CudaBufferWriter>(cuda_buf);
316   // Use 8MB buffering, which yields generally good performance
317   RETURN_NOT_OK(writer->SetBufferSize(1 << 23));
318   return writer;
319 }
320 
AllocateBuffer(int64_t size)321 Result<std::shared_ptr<Buffer>> CudaMemoryManager::AllocateBuffer(int64_t size) {
322   ARROW_ASSIGN_OR_RAISE(auto context, cuda_device()->GetContext());
323   std::shared_ptr<CudaBuffer> dest;
324   return context->Allocate(size);
325 }
326 
CopyBufferTo(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & to)327 Result<std::shared_ptr<Buffer>> CudaMemoryManager::CopyBufferTo(
328     const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
329   if (to->is_cpu()) {
330     // Device-to-CPU copy
331     std::shared_ptr<Buffer> dest;
332     ARROW_ASSIGN_OR_RAISE(auto from_context, cuda_device()->GetContext());
333     ARROW_ASSIGN_OR_RAISE(dest, to->AllocateBuffer(buf->size()));
334     RETURN_NOT_OK(from_context->CopyDeviceToHost(dest->mutable_data(), buf->address(),
335                                                  buf->size()));
336     return dest;
337   }
338   return nullptr;
339 }
340 
CopyBufferFrom(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & from)341 Result<std::shared_ptr<Buffer>> CudaMemoryManager::CopyBufferFrom(
342     const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
343   if (from->is_cpu()) {
344     // CPU-to-device copy
345     ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
346     ARROW_ASSIGN_OR_RAISE(auto dest, to_context->Allocate(buf->size()));
347     RETURN_NOT_OK(
348         to_context->CopyHostToDevice(dest->address(), buf->data(), buf->size()));
349     return dest;
350   }
351   if (IsCudaMemoryManager(*from)) {
352     // Device-to-device copy
353     ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
354     ARROW_ASSIGN_OR_RAISE(
355         auto from_context,
356         checked_cast<const CudaMemoryManager&>(*from).cuda_device()->GetContext());
357     ARROW_ASSIGN_OR_RAISE(auto dest, to_context->Allocate(buf->size()));
358     if (to_context->handle() == from_context->handle()) {
359       // Same context
360       RETURN_NOT_OK(
361           to_context->CopyDeviceToDevice(dest->address(), buf->address(), buf->size()));
362     } else {
363       // Other context
364       RETURN_NOT_OK(from_context->CopyDeviceToAnotherDevice(to_context, dest->address(),
365                                                             buf->address(), buf->size()));
366     }
367     return dest;
368   }
369   return nullptr;
370 }
371 
ViewBufferTo(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & to)372 Result<std::shared_ptr<Buffer>> CudaMemoryManager::ViewBufferTo(
373     const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
374   if (to->is_cpu()) {
375     // Device-on-CPU view
376     ARROW_ASSIGN_OR_RAISE(auto address, GetHostAddress(buf->address()));
377     return std::make_shared<Buffer>(address, buf->size(), to, buf);
378   }
379   return nullptr;
380 }
381 
ViewBufferFrom(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & from)382 Result<std::shared_ptr<Buffer>> CudaMemoryManager::ViewBufferFrom(
383     const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
384   if (from->is_cpu()) {
385     // CPU-on-device view
386     ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
387     ARROW_ASSIGN_OR_RAISE(auto address, GetDeviceAddress(buf->data(), to_context));
388     return std::make_shared<Buffer>(address, buf->size(), shared_from_this(), buf);
389   }
390   return nullptr;
391 }
392 
IsCudaMemoryManager(const MemoryManager & mm)393 bool IsCudaMemoryManager(const MemoryManager& mm) { return IsCudaDevice(*mm.device()); }
394 
AsCudaMemoryManager(const std::shared_ptr<MemoryManager> & mm)395 Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
396     const std::shared_ptr<MemoryManager>& mm) {
397   if (IsCudaMemoryManager(*mm)) {
398     return checked_pointer_cast<CudaMemoryManager>(mm);
399   } else {
400     return Status::TypeError("Device is not a Cuda device: ", mm->device()->ToString());
401   }
402 }
403 
404 // ----------------------------------------------------------------------
405 // CudaDeviceManager implementation
406 
407 class CudaDeviceManager::Impl {
408  public:
Impl()409   Impl() : host_bytes_allocated_(0) {}
410 
Init()411   Status Init() {
412     CU_RETURN_NOT_OK("cuInit", cuInit(0));
413     CU_RETURN_NOT_OK("cuDeviceGetCount", cuDeviceGetCount(&num_devices_));
414 
415     devices_.resize(num_devices_);
416     for (int i = 0; i < num_devices_; ++i) {
417       ARROW_ASSIGN_OR_RAISE(devices_[i], MakeDevice(i));
418     }
419     return Status::OK();
420   }
421 
AllocateHost(int device_number,int64_t nbytes,uint8_t ** out)422   Status AllocateHost(int device_number, int64_t nbytes, uint8_t** out) {
423     RETURN_NOT_OK(CheckDeviceNum(device_number));
424     ARROW_ASSIGN_OR_RAISE(auto ctx, GetContext(device_number));
425     ContextSaver set_temporary((CUcontext)(ctx.get()->handle()));
426     CU_RETURN_NOT_OK("cuMemHostAlloc", cuMemHostAlloc(reinterpret_cast<void**>(out),
427                                                       static_cast<size_t>(nbytes),
428                                                       CU_MEMHOSTALLOC_PORTABLE));
429     host_bytes_allocated_ += nbytes;
430     return Status::OK();
431   }
432 
FreeHost(void * data,int64_t nbytes)433   Status FreeHost(void* data, int64_t nbytes) {
434     CU_RETURN_NOT_OK("cuMemFreeHost", cuMemFreeHost(data));
435     host_bytes_allocated_ -= nbytes;
436     return Status::OK();
437   }
438 
GetContext(int device_number)439   Result<std::shared_ptr<CudaContext>> GetContext(int device_number) {
440     RETURN_NOT_OK(CheckDeviceNum(device_number));
441     return devices_[device_number]->GetContext();
442   }
443 
GetSharedContext(int device_number,void * handle)444   Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void* handle) {
445     RETURN_NOT_OK(CheckDeviceNum(device_number));
446     return devices_[device_number]->GetSharedContext(handle);
447   }
448 
GetDevice(int device_number)449   Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number) {
450     RETURN_NOT_OK(CheckDeviceNum(device_number));
451     return devices_[device_number];
452   }
453 
num_devices() const454   int num_devices() const { return num_devices_; }
455 
CheckDeviceNum(int device_number) const456   Status CheckDeviceNum(int device_number) const {
457     if (device_number < 0 || device_number >= num_devices_) {
458       return Status::Invalid("Invalid Cuda device number ", device_number,
459                              " (should be between 0 and ", num_devices_ - 1,
460                              ", inclusive)");
461     }
462     return Status::OK();
463   }
464 
465  protected:
MakeDevice(int device_number)466   Result<std::shared_ptr<CudaDevice>> MakeDevice(int device_number) {
467     DeviceProperties props;
468     RETURN_NOT_OK(props.Init(device_number));
469     return std::shared_ptr<CudaDevice>(new CudaDevice({std::move(props)}));
470   }
471 
472  private:
473   int num_devices_;
474   std::vector<std::shared_ptr<CudaDevice>> devices_;
475 
476   int64_t host_bytes_allocated_;
477 };
478 
CudaDeviceManager()479 CudaDeviceManager::CudaDeviceManager() { impl_.reset(new Impl()); }
480 
481 std::unique_ptr<CudaDeviceManager> CudaDeviceManager::instance_ = nullptr;
482 
Instance()483 Result<CudaDeviceManager*> CudaDeviceManager::Instance() {
484   static std::mutex mutex;
485   static std::atomic<bool> init_end(false);
486 
487   if (!init_end) {
488     std::lock_guard<std::mutex> lock(mutex);
489     if (!init_end) {
490       instance_.reset(new CudaDeviceManager());
491       RETURN_NOT_OK(instance_->impl_->Init());
492       init_end = true;
493     }
494   }
495   return instance_.get();
496 }
497 
// Status-returning variant of Instance(): forwards its result into
// `*manager` through Result::Value().
Status CudaDeviceManager::GetInstance(CudaDeviceManager** manager) {
  return Instance().Value(manager);
}
501 
// Return the device with the given number; forwards to Impl::GetDevice(),
// which validates `device_number`.
Result<std::shared_ptr<CudaDevice>> CudaDeviceManager::GetDevice(int device_number) {
  return impl_->GetDevice(device_number);
}
505 
// Get a context for the given device; forwards to Impl::GetContext(), which
// validates `device_number`.
Result<std::shared_ptr<CudaContext>> CudaDeviceManager::GetContext(int device_number) {
  return impl_->GetContext(device_number);
}
509 
// Status-returning variant of GetContext(): forwards the result of
// Impl::GetContext() into `*out` through Result::Value().
Status CudaDeviceManager::GetContext(int device_number,
                                     std::shared_ptr<CudaContext>* out) {
  return impl_->GetContext(device_number).Value(out);
}
514 
// Wrap the externally created context `ctx` (a type-erased CUcontext) for
// the given device; forwards to Impl::GetSharedContext().
Result<std::shared_ptr<CudaContext>> CudaDeviceManager::GetSharedContext(
    int device_number, void* ctx) {
  return impl_->GetSharedContext(device_number, ctx);
}
519 
// Status-returning variant of GetSharedContext(): forwards the result of
// Impl::GetSharedContext() into `*out` through Result::Value().
Status CudaDeviceManager::GetSharedContext(int device_number, void* ctx,
                                           std::shared_ptr<CudaContext>* out) {
  return impl_->GetSharedContext(device_number, ctx).Value(out);
}
524 
AllocateHost(int device_number,int64_t nbytes)525 Result<std::shared_ptr<CudaHostBuffer>> CudaDeviceManager::AllocateHost(int device_number,
526                                                                         int64_t nbytes) {
527   uint8_t* data = nullptr;
528   RETURN_NOT_OK(impl_->AllocateHost(device_number, nbytes, &data));
529   return std::make_shared<CudaHostBuffer>(data, nbytes);
530 }
531 
// Status-returning variant of AllocateHost(): forwards the Result-returning
// overload into `*out` through Result::Value().
Status CudaDeviceManager::AllocateHost(int device_number, int64_t nbytes,
                                       std::shared_ptr<CudaHostBuffer>* out) {
  return AllocateHost(device_number, nbytes).Value(out);
}
536 
// Release host memory at `data`; forwards to Impl::FreeHost(), which also
// subtracts `nbytes` from the host allocation counter.
Status CudaDeviceManager::FreeHost(void* data, int64_t nbytes) {
  return impl_->FreeHost(data, nbytes);
}
540 
// Return the device count held by the Impl.
int CudaDeviceManager::num_devices() const { return impl_->num_devices(); }
542 
543 // ----------------------------------------------------------------------
544 // CudaContext public API
545 
CudaContext()546 CudaContext::CudaContext() { impl_.reset(new Impl()); }
547 
~CudaContext()548 CudaContext::~CudaContext() {}
549 
Allocate(int64_t nbytes)550 Result<std::shared_ptr<CudaBuffer>> CudaContext::Allocate(int64_t nbytes) {
551   uint8_t* data = nullptr;
552   RETURN_NOT_OK(impl_->Allocate(nbytes, &data));
553   return std::make_shared<CudaBuffer>(data, nbytes, this->shared_from_this(), true);
554 }
555 
// Status-returning variant of Allocate(): forwards the Result-returning
// overload into `*out` through Result::Value().
Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr<CudaBuffer>* out) {
  return Allocate(nbytes).Value(out);
}
559 
View(uint8_t * data,int64_t nbytes)560 Result<std::shared_ptr<CudaBuffer>> CudaContext::View(uint8_t* data, int64_t nbytes) {
561   return std::make_shared<CudaBuffer>(data, nbytes, this->shared_from_this(), false);
562 }
563 
// Status-returning variant of View(): forwards the Result-returning overload
// into `*out` through Result::Value().
Status CudaContext::View(uint8_t* data, int64_t nbytes,
                         std::shared_ptr<CudaBuffer>* out) {
  return View(data, nbytes).Value(out);
}
568 
// Create an IPC handle for the device memory at `data`; forwards to
// Impl::ExportIpcBuffer().
Result<std::shared_ptr<CudaIpcMemHandle>> CudaContext::ExportIpcBuffer(void* data,
                                                                       int64_t size) {
  return impl_->ExportIpcBuffer(data, size);
}
573 
// Copy `nbytes` from host memory `src` to device address `dst`; forwards to
// Impl::CopyHostToDevice().
Status CudaContext::CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes) {
  return impl_->CopyHostToDevice(dst, src, nbytes);
}
577 
CopyHostToDevice(void * dst,const void * src,int64_t nbytes)578 Status CudaContext::CopyHostToDevice(void* dst, const void* src, int64_t nbytes) {
579   return impl_->CopyHostToDevice(reinterpret_cast<uintptr_t>(dst), src, nbytes);
580 }
581 
// Copy `nbytes` from device address `src` to host memory `dst`; forwards to
// Impl::CopyDeviceToHost().
Status CudaContext::CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes) {
  return impl_->CopyDeviceToHost(dst, src, nbytes);
}
585 
CopyDeviceToHost(void * dst,const void * src,int64_t nbytes)586 Status CudaContext::CopyDeviceToHost(void* dst, const void* src, int64_t nbytes) {
587   return impl_->CopyDeviceToHost(dst, reinterpret_cast<uintptr_t>(src), nbytes);
588 }
589 
// Copy `nbytes` between two device addresses; forwards to
// Impl::CopyDeviceToDevice().
Status CudaContext::CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes) {
  return impl_->CopyDeviceToDevice(dst, src, nbytes);
}
593 
CopyDeviceToDevice(void * dst,const void * src,int64_t nbytes)594 Status CudaContext::CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes) {
595   return impl_->CopyDeviceToDevice(reinterpret_cast<uintptr_t>(dst),
596                                    reinterpret_cast<uintptr_t>(src), nbytes);
597 }
598 
// Copy `nbytes` from address `src` in this context to address `dst` in
// `dst_ctx`; forwards to Impl::CopyDeviceToAnotherDevice().
Status CudaContext::CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
                                              uintptr_t dst, uintptr_t src,
                                              int64_t nbytes) {
  return impl_->CopyDeviceToAnotherDevice(dst_ctx, dst, src, nbytes);
}
604 
CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext> & dst_ctx,void * dst,const void * src,int64_t nbytes)605 Status CudaContext::CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
606                                               void* dst, const void* src,
607                                               int64_t nbytes) {
608   return impl_->CopyDeviceToAnotherDevice(dst_ctx, reinterpret_cast<uintptr_t>(dst),
609                                           reinterpret_cast<uintptr_t>(src), nbytes);
610 }
611 
// Synchronize this context; forwards to Impl::Synchronize().
Status CudaContext::Synchronize(void) { return impl_->Synchronize(); }
613 
// Close this context; forwards to Impl::Close().
Status CudaContext::Close() { return impl_->Close(); }
615 
// Free the device memory at `device_ptr`; forwards to Impl::Free(), which
// also subtracts `nbytes` from the allocation counter.
Status CudaContext::Free(void* device_ptr, int64_t nbytes) {
  return impl_->Free(device_ptr, nbytes);
}
619 
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle)620 Result<std::shared_ptr<CudaBuffer>> CudaContext::OpenIpcBuffer(
621     const CudaIpcMemHandle& ipc_handle) {
622   if (ipc_handle.memory_size() > 0) {
623     ContextSaver set_temporary(*this);
624     uint8_t* data = nullptr;
625     RETURN_NOT_OK(impl_->OpenIpcBuffer(ipc_handle, &data));
626     // Need to ask the device how big the buffer is
627     size_t allocation_size = 0;
628     CU_RETURN_NOT_OK("cuMemGetAddressRange",
629                      cuMemGetAddressRange(nullptr, &allocation_size,
630                                           reinterpret_cast<CUdeviceptr>(data)));
631     return std::make_shared<CudaBuffer>(data, allocation_size, this->shared_from_this(),
632                                         true, true);
633   } else {
634     // zero-sized buffer does not own data (which is nullptr), hence
635     // CloseIpcBuffer will not be called (see CudaBuffer::Close).
636     return std::make_shared<CudaBuffer>(nullptr, 0, this->shared_from_this(), false,
637                                         true);
638   }
639 }
640 
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle,std::shared_ptr<CudaBuffer> * out)641 Status CudaContext::OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle,
642                                   std::shared_ptr<CudaBuffer>* out) {
643   if (ipc_handle.memory_size() > 0) {
644     ContextSaver set_temporary(*this);
645     uint8_t* data = nullptr;
646     RETURN_NOT_OK(impl_->OpenIpcBuffer(ipc_handle, &data));
647     // Need to ask the device how big the buffer is
648     size_t allocation_size = 0;
649     CU_RETURN_NOT_OK("cuMemGetAddressRange",
650                      cuMemGetAddressRange(nullptr, &allocation_size,
651                                           reinterpret_cast<CUdeviceptr>(data)));
652     *out = std::make_shared<CudaBuffer>(data, allocation_size, this->shared_from_this(),
653                                         true, true);
654   } else {
655     // zero-sized buffer does not own data (which is nullptr), hence
656     // CloseIpcBuffer will not be called (see CudaBuffer::Close).
657     *out =
658         std::make_shared<CudaBuffer>(nullptr, 0, this->shared_from_this(), false, true);
659   }
660   return Status::OK();
661 }
662 
CloseIpcBuffer(CudaBuffer * buf)663 Status CudaContext::CloseIpcBuffer(CudaBuffer* buf) {
664   ContextSaver set_temporary(*this);
665   CU_RETURN_NOT_OK("cuIpcCloseMemHandle", cuIpcCloseMemHandle(buf->address()));
666   return Status::OK();
667 }
668 
// Return the Impl's count of bytes currently allocated through this context.
int64_t CudaContext::bytes_allocated() const { return impl_->bytes_allocated(); }
670 
// Return the underlying CUcontext, type-erased as void*.
void* CudaContext::handle() const { return impl_->context_handle(); }
672 
// Return the device this context belongs to, as reported by the Impl.
std::shared_ptr<CudaDevice> CudaContext::device() const { return impl_->device(); }
674 
// Return the memory manager captured when the context was initialized.
std::shared_ptr<CudaMemoryManager> CudaContext::memory_manager() const {
  return impl_->memory_manager();
}
678 
// Return the device number of the device this context belongs to.
int CudaContext::device_number() const { return impl_->device()->device_number(); }
680 
GetDeviceAddress(uintptr_t addr)681 Result<uintptr_t> CudaContext::GetDeviceAddress(uintptr_t addr) {
682   ContextSaver set_temporary(*this);
683   CUdeviceptr ptr;
684   CU_RETURN_NOT_OK("cuPointerGetAttribute",
685                    cuPointerGetAttribute(&ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
686                                          static_cast<CUdeviceptr>(addr)));
687   return static_cast<uintptr_t>(ptr);
688 }
689 
GetDeviceAddress(uint8_t * addr)690 Result<uintptr_t> CudaContext::GetDeviceAddress(uint8_t* addr) {
691   return GetDeviceAddress(reinterpret_cast<uintptr_t>(addr));
692 }
693 
GetDeviceAddress(uint8_t * addr,uint8_t ** devaddr)694 Status CudaContext::GetDeviceAddress(uint8_t* addr, uint8_t** devaddr) {
695   ARROW_ASSIGN_OR_RAISE(auto ptr, GetDeviceAddress(addr));
696   *devaddr = reinterpret_cast<uint8_t*>(ptr);
697   return Status::OK();
698 }
699 
700 }  // namespace cuda
701 }  // namespace arrow
702