1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/gpu/cuda_context.h"
19
20 #include <atomic>
21 #include <cstdint>
22 #include <memory>
23 #include <mutex>
24 #include <sstream>
25 #include <string>
26 #include <utility>
27 #include <vector>
28
29 #include <cuda.h>
30
31 #include "arrow/gpu/cuda_internal.h"
32 #include "arrow/gpu/cuda_memory.h"
33 #include "arrow/util/checked_cast.h"
34
35 namespace arrow {
36
37 using internal::checked_cast;
38 using internal::checked_pointer_cast;
39
40 namespace cuda {
41
42 using internal::ContextSaver;
43
44 namespace {
45
46 struct DeviceProperties {
47 int device_number_;
48 CUdevice handle_;
49 int64_t total_memory_;
50 std::string name_;
51
Initarrow::cuda::__anonc8e1da650111::DeviceProperties52 Status Init(int device_number) {
53 device_number_ = device_number;
54 CU_RETURN_NOT_OK("cuDeviceGet", cuDeviceGet(&handle_, device_number));
55 size_t total_memory = 0;
56 CU_RETURN_NOT_OK("cuDeviceTotalMem", cuDeviceTotalMem(&total_memory, handle_));
57 total_memory_ = total_memory;
58
59 char buf[200];
60 CU_RETURN_NOT_OK("cuDeviceGetName", cuDeviceGetName(buf, sizeof(buf), device_number));
61 name_.assign(buf);
62 return Status::OK();
63 }
64 };
65
// Type name returned by CudaDevice::type_name(); IsCudaDevice() compares
// against the address of this array.
constexpr char kCudaDeviceTypeName[] = "arrow::cuda::CudaDevice";
67
68 } // namespace
69
70 struct CudaDevice::Impl {
71 DeviceProperties props;
72 };
73
74 // ----------------------------------------------------------------------
75 // CudaContext implementation
76
77 class CudaContext::Impl {
78 public:
Impl()79 Impl() : bytes_allocated_(0) {}
80
Init(const std::shared_ptr<CudaDevice> & device)81 Status Init(const std::shared_ptr<CudaDevice>& device) {
82 mm_ = checked_pointer_cast<CudaMemoryManager>(device->default_memory_manager());
83 props_ = &device->impl_->props;
84 own_context_ = true;
85 CU_RETURN_NOT_OK("cuDevicePrimaryCtxRetain",
86 cuDevicePrimaryCtxRetain(&context_, props_->handle_));
87 is_open_ = true;
88 return Status::OK();
89 }
90
InitShared(const std::shared_ptr<CudaDevice> & device,CUcontext ctx)91 Status InitShared(const std::shared_ptr<CudaDevice>& device, CUcontext ctx) {
92 mm_ = checked_pointer_cast<CudaMemoryManager>(device->default_memory_manager());
93 props_ = &device->impl_->props;
94 own_context_ = false;
95 context_ = ctx;
96 is_open_ = true;
97 return Status::OK();
98 }
99
Close()100 Status Close() {
101 if (is_open_ && own_context_) {
102 CU_RETURN_NOT_OK("cuDevicePrimaryCtxRelease",
103 cuDevicePrimaryCtxRelease(props_->handle_));
104 }
105 is_open_ = false;
106 return Status::OK();
107 }
108
bytes_allocated() const109 int64_t bytes_allocated() const { return bytes_allocated_.load(); }
110
Allocate(int64_t nbytes,uint8_t ** out)111 Status Allocate(int64_t nbytes, uint8_t** out) {
112 if (nbytes > 0) {
113 ContextSaver set_temporary(context_);
114 CUdeviceptr data;
115 CU_RETURN_NOT_OK("cuMemAlloc", cuMemAlloc(&data, static_cast<size_t>(nbytes)));
116 bytes_allocated_ += nbytes;
117 *out = reinterpret_cast<uint8_t*>(data);
118 } else {
119 *out = nullptr;
120 }
121 return Status::OK();
122 }
123
CopyHostToDevice(uintptr_t dst,const void * src,int64_t nbytes)124 Status CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes) {
125 ContextSaver set_temporary(context_);
126 CU_RETURN_NOT_OK("cuMemcpyHtoD", cuMemcpyHtoD(dst, src, static_cast<size_t>(nbytes)));
127 return Status::OK();
128 }
129
CopyDeviceToHost(void * dst,uintptr_t src,int64_t nbytes)130 Status CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes) {
131 ContextSaver set_temporary(context_);
132 CU_RETURN_NOT_OK("cuMemcpyDtoH", cuMemcpyDtoH(dst, src, static_cast<size_t>(nbytes)));
133 return Status::OK();
134 }
135
CopyDeviceToDevice(uintptr_t dst,uintptr_t src,int64_t nbytes)136 Status CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes) {
137 ContextSaver set_temporary(context_);
138 CU_RETURN_NOT_OK("cuMemcpyDtoD", cuMemcpyDtoD(dst, src, static_cast<size_t>(nbytes)));
139 return Status::OK();
140 }
141
CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext> & dst_ctx,uintptr_t dst,uintptr_t src,int64_t nbytes)142 Status CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
143 uintptr_t dst, uintptr_t src, int64_t nbytes) {
144 ContextSaver set_temporary(context_);
145 CU_RETURN_NOT_OK("cuMemcpyPeer",
146 cuMemcpyPeer(dst, reinterpret_cast<CUcontext>(dst_ctx->handle()),
147 src, context_, static_cast<size_t>(nbytes)));
148 return Status::OK();
149 }
150
Synchronize(void)151 Status Synchronize(void) {
152 ContextSaver set_temporary(context_);
153 CU_RETURN_NOT_OK("cuCtxSynchronize", cuCtxSynchronize());
154 return Status::OK();
155 }
156
Free(void * device_ptr,int64_t nbytes)157 Status Free(void* device_ptr, int64_t nbytes) {
158 CU_RETURN_NOT_OK("cuMemFree", cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
159 bytes_allocated_ -= nbytes;
160 return Status::OK();
161 }
162
ExportIpcBuffer(void * data,int64_t size)163 Result<std::shared_ptr<CudaIpcMemHandle>> ExportIpcBuffer(void* data, int64_t size) {
164 CUipcMemHandle cu_handle;
165 if (size > 0) {
166 ContextSaver set_temporary(context_);
167 CU_RETURN_NOT_OK(
168 "cuIpcGetMemHandle",
169 cuIpcGetMemHandle(&cu_handle, reinterpret_cast<CUdeviceptr>(data)));
170 }
171 return std::shared_ptr<CudaIpcMemHandle>(new CudaIpcMemHandle(size, &cu_handle));
172 }
173
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle,uint8_t ** out)174 Status OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, uint8_t** out) {
175 int64_t size = ipc_handle.memory_size();
176 if (size > 0) {
177 auto handle = reinterpret_cast<const CUipcMemHandle*>(ipc_handle.handle());
178 CUdeviceptr data;
179 CU_RETURN_NOT_OK(
180 "cuIpcOpenMemHandle",
181 cuIpcOpenMemHandle(&data, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS));
182 *out = reinterpret_cast<uint8_t*>(data);
183 } else {
184 *out = nullptr;
185 }
186 return Status::OK();
187 }
188
device() const189 std::shared_ptr<CudaDevice> device() const {
190 return checked_pointer_cast<CudaDevice>(mm_->device());
191 }
192
memory_manager() const193 const std::shared_ptr<CudaMemoryManager>& memory_manager() const { return mm_; }
194
context_handle() const195 void* context_handle() const { return reinterpret_cast<void*>(context_); }
196
197 private:
198 std::shared_ptr<CudaMemoryManager> mm_;
199 const DeviceProperties* props_;
200 CUcontext context_;
201 bool is_open_;
202
203 // So that we can utilize a CUcontext that was created outside this library
204 bool own_context_;
205
206 std::atomic<int64_t> bytes_allocated_;
207 };
208
209 // ----------------------------------------------------------------------
210 // CudaDevice implementation
211
CudaDevice(Impl impl)212 CudaDevice::CudaDevice(Impl impl) : impl_(new Impl(std::move(impl))) {}
213
type_name() const214 const char* CudaDevice::type_name() const { return kCudaDeviceTypeName; }
215
ToString() const216 std::string CudaDevice::ToString() const {
217 std::stringstream ss;
218 ss << "CudaDevice(device_number=" << device_number() << ", name=\"" << device_name()
219 << "\")";
220 return ss.str();
221 }
222
Equals(const Device & other) const223 bool CudaDevice::Equals(const Device& other) const {
224 if (!IsCudaDevice(other)) {
225 return false;
226 }
227 return checked_cast<const CudaDevice&>(other).device_number() == device_number();
228 }
229
device_number() const230 int CudaDevice::device_number() const { return impl_->props.device_number_; }
231
device_name() const232 std::string CudaDevice::device_name() const { return impl_->props.name_; }
233
total_memory() const234 int64_t CudaDevice::total_memory() const { return impl_->props.total_memory_; }
235
handle() const236 int CudaDevice::handle() const { return impl_->props.handle_; }
237
Make(int device_number)238 Result<std::shared_ptr<CudaDevice>> CudaDevice::Make(int device_number) {
239 ARROW_ASSIGN_OR_RAISE(auto manager, CudaDeviceManager::Instance());
240 return manager->GetDevice(device_number);
241 }
242
default_memory_manager()243 std::shared_ptr<MemoryManager> CudaDevice::default_memory_manager() {
244 return CudaMemoryManager::Make(shared_from_this());
245 }
246
GetContext()247 Result<std::shared_ptr<CudaContext>> CudaDevice::GetContext() {
248 // XXX should we cache a default context in CudaDevice instance?
249 auto context = std::shared_ptr<CudaContext>(new CudaContext());
250 auto self = checked_pointer_cast<CudaDevice>(shared_from_this());
251 RETURN_NOT_OK(context->impl_->Init(self));
252 return context;
253 }
254
GetSharedContext(void * handle)255 Result<std::shared_ptr<CudaContext>> CudaDevice::GetSharedContext(void* handle) {
256 auto context = std::shared_ptr<CudaContext>(new CudaContext());
257 auto self = checked_pointer_cast<CudaDevice>(shared_from_this());
258 RETURN_NOT_OK(context->impl_->InitShared(self, reinterpret_cast<CUcontext>(handle)));
259 return context;
260 }
261
AllocateHostBuffer(int64_t size)262 Result<std::shared_ptr<CudaHostBuffer>> CudaDevice::AllocateHostBuffer(int64_t size) {
263 ARROW_ASSIGN_OR_RAISE(auto context, GetContext());
264 ContextSaver set_temporary(*context);
265 void* ptr;
266 CU_RETURN_NOT_OK("cuMemHostAlloc", cuMemHostAlloc(&ptr, static_cast<size_t>(size),
267 CU_MEMHOSTALLOC_PORTABLE));
268 return std::make_shared<CudaHostBuffer>(reinterpret_cast<uint8_t*>(ptr), size);
269 }
270
IsCudaDevice(const Device & device)271 bool IsCudaDevice(const Device& device) {
272 return device.type_name() == kCudaDeviceTypeName;
273 }
274
AsCudaDevice(const std::shared_ptr<Device> & device)275 Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device) {
276 if (IsCudaDevice(*device)) {
277 return checked_pointer_cast<CudaDevice>(device);
278 } else {
279 return Status::TypeError("Device is not a Cuda device: ", device->ToString());
280 }
281 }
282
283 // ----------------------------------------------------------------------
284 // CudaMemoryManager implementation
285
Make(const std::shared_ptr<Device> & device)286 std::shared_ptr<CudaMemoryManager> CudaMemoryManager::Make(
287 const std::shared_ptr<Device>& device) {
288 return std::shared_ptr<CudaMemoryManager>(new CudaMemoryManager(device));
289 }
290
cuda_device() const291 std::shared_ptr<CudaDevice> CudaMemoryManager::cuda_device() const {
292 return checked_pointer_cast<CudaDevice>(device_);
293 }
294
GetBufferReader(std::shared_ptr<Buffer> buf)295 Result<std::shared_ptr<io::RandomAccessFile>> CudaMemoryManager::GetBufferReader(
296 std::shared_ptr<Buffer> buf) {
297 if (*buf->device() != *device_) {
298 return Status::Invalid(
299 "CudaMemoryManager::GetBufferReader called on foreign buffer "
300 "for device ",
301 buf->device()->ToString());
302 }
303 return std::make_shared<CudaBufferReader>(checked_pointer_cast<CudaBuffer>(buf));
304 }
305
GetBufferWriter(std::shared_ptr<Buffer> buf)306 Result<std::shared_ptr<io::OutputStream>> CudaMemoryManager::GetBufferWriter(
307 std::shared_ptr<Buffer> buf) {
308 if (*buf->device() != *device_) {
309 return Status::Invalid(
310 "CudaMemoryManager::GetBufferReader called on foreign buffer "
311 "for device ",
312 buf->device()->ToString());
313 }
314 ARROW_ASSIGN_OR_RAISE(auto cuda_buf, CudaBuffer::FromBuffer(buf));
315 auto writer = std::make_shared<CudaBufferWriter>(cuda_buf);
316 // Use 8MB buffering, which yields generally good performance
317 RETURN_NOT_OK(writer->SetBufferSize(1 << 23));
318 return writer;
319 }
320
AllocateBuffer(int64_t size)321 Result<std::shared_ptr<Buffer>> CudaMemoryManager::AllocateBuffer(int64_t size) {
322 ARROW_ASSIGN_OR_RAISE(auto context, cuda_device()->GetContext());
323 std::shared_ptr<CudaBuffer> dest;
324 return context->Allocate(size);
325 }
326
CopyBufferTo(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & to)327 Result<std::shared_ptr<Buffer>> CudaMemoryManager::CopyBufferTo(
328 const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
329 if (to->is_cpu()) {
330 // Device-to-CPU copy
331 std::shared_ptr<Buffer> dest;
332 ARROW_ASSIGN_OR_RAISE(auto from_context, cuda_device()->GetContext());
333 ARROW_ASSIGN_OR_RAISE(dest, to->AllocateBuffer(buf->size()));
334 RETURN_NOT_OK(from_context->CopyDeviceToHost(dest->mutable_data(), buf->address(),
335 buf->size()));
336 return dest;
337 }
338 return nullptr;
339 }
340
CopyBufferFrom(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & from)341 Result<std::shared_ptr<Buffer>> CudaMemoryManager::CopyBufferFrom(
342 const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
343 if (from->is_cpu()) {
344 // CPU-to-device copy
345 ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
346 ARROW_ASSIGN_OR_RAISE(auto dest, to_context->Allocate(buf->size()));
347 RETURN_NOT_OK(
348 to_context->CopyHostToDevice(dest->address(), buf->data(), buf->size()));
349 return dest;
350 }
351 if (IsCudaMemoryManager(*from)) {
352 // Device-to-device copy
353 ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
354 ARROW_ASSIGN_OR_RAISE(
355 auto from_context,
356 checked_cast<const CudaMemoryManager&>(*from).cuda_device()->GetContext());
357 ARROW_ASSIGN_OR_RAISE(auto dest, to_context->Allocate(buf->size()));
358 if (to_context->handle() == from_context->handle()) {
359 // Same context
360 RETURN_NOT_OK(
361 to_context->CopyDeviceToDevice(dest->address(), buf->address(), buf->size()));
362 } else {
363 // Other context
364 RETURN_NOT_OK(from_context->CopyDeviceToAnotherDevice(to_context, dest->address(),
365 buf->address(), buf->size()));
366 }
367 return dest;
368 }
369 return nullptr;
370 }
371
ViewBufferTo(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & to)372 Result<std::shared_ptr<Buffer>> CudaMemoryManager::ViewBufferTo(
373 const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to) {
374 if (to->is_cpu()) {
375 // Device-on-CPU view
376 ARROW_ASSIGN_OR_RAISE(auto address, GetHostAddress(buf->address()));
377 return std::make_shared<Buffer>(address, buf->size(), to, buf);
378 }
379 return nullptr;
380 }
381
ViewBufferFrom(const std::shared_ptr<Buffer> & buf,const std::shared_ptr<MemoryManager> & from)382 Result<std::shared_ptr<Buffer>> CudaMemoryManager::ViewBufferFrom(
383 const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from) {
384 if (from->is_cpu()) {
385 // CPU-on-device view
386 ARROW_ASSIGN_OR_RAISE(auto to_context, cuda_device()->GetContext());
387 ARROW_ASSIGN_OR_RAISE(auto address, GetDeviceAddress(buf->data(), to_context));
388 return std::make_shared<Buffer>(address, buf->size(), shared_from_this(), buf);
389 }
390 return nullptr;
391 }
392
IsCudaMemoryManager(const MemoryManager & mm)393 bool IsCudaMemoryManager(const MemoryManager& mm) { return IsCudaDevice(*mm.device()); }
394
AsCudaMemoryManager(const std::shared_ptr<MemoryManager> & mm)395 Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
396 const std::shared_ptr<MemoryManager>& mm) {
397 if (IsCudaMemoryManager(*mm)) {
398 return checked_pointer_cast<CudaMemoryManager>(mm);
399 } else {
400 return Status::TypeError("Device is not a Cuda device: ", mm->device()->ToString());
401 }
402 }
403
404 // ----------------------------------------------------------------------
405 // CudaDeviceManager implementation
406
407 class CudaDeviceManager::Impl {
408 public:
Impl()409 Impl() : host_bytes_allocated_(0) {}
410
Init()411 Status Init() {
412 CU_RETURN_NOT_OK("cuInit", cuInit(0));
413 CU_RETURN_NOT_OK("cuDeviceGetCount", cuDeviceGetCount(&num_devices_));
414
415 devices_.resize(num_devices_);
416 for (int i = 0; i < num_devices_; ++i) {
417 ARROW_ASSIGN_OR_RAISE(devices_[i], MakeDevice(i));
418 }
419 return Status::OK();
420 }
421
AllocateHost(int device_number,int64_t nbytes,uint8_t ** out)422 Status AllocateHost(int device_number, int64_t nbytes, uint8_t** out) {
423 RETURN_NOT_OK(CheckDeviceNum(device_number));
424 ARROW_ASSIGN_OR_RAISE(auto ctx, GetContext(device_number));
425 ContextSaver set_temporary((CUcontext)(ctx.get()->handle()));
426 CU_RETURN_NOT_OK("cuMemHostAlloc", cuMemHostAlloc(reinterpret_cast<void**>(out),
427 static_cast<size_t>(nbytes),
428 CU_MEMHOSTALLOC_PORTABLE));
429 host_bytes_allocated_ += nbytes;
430 return Status::OK();
431 }
432
FreeHost(void * data,int64_t nbytes)433 Status FreeHost(void* data, int64_t nbytes) {
434 CU_RETURN_NOT_OK("cuMemFreeHost", cuMemFreeHost(data));
435 host_bytes_allocated_ -= nbytes;
436 return Status::OK();
437 }
438
GetContext(int device_number)439 Result<std::shared_ptr<CudaContext>> GetContext(int device_number) {
440 RETURN_NOT_OK(CheckDeviceNum(device_number));
441 return devices_[device_number]->GetContext();
442 }
443
GetSharedContext(int device_number,void * handle)444 Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void* handle) {
445 RETURN_NOT_OK(CheckDeviceNum(device_number));
446 return devices_[device_number]->GetSharedContext(handle);
447 }
448
GetDevice(int device_number)449 Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number) {
450 RETURN_NOT_OK(CheckDeviceNum(device_number));
451 return devices_[device_number];
452 }
453
num_devices() const454 int num_devices() const { return num_devices_; }
455
CheckDeviceNum(int device_number) const456 Status CheckDeviceNum(int device_number) const {
457 if (device_number < 0 || device_number >= num_devices_) {
458 return Status::Invalid("Invalid Cuda device number ", device_number,
459 " (should be between 0 and ", num_devices_ - 1,
460 ", inclusive)");
461 }
462 return Status::OK();
463 }
464
465 protected:
MakeDevice(int device_number)466 Result<std::shared_ptr<CudaDevice>> MakeDevice(int device_number) {
467 DeviceProperties props;
468 RETURN_NOT_OK(props.Init(device_number));
469 return std::shared_ptr<CudaDevice>(new CudaDevice({std::move(props)}));
470 }
471
472 private:
473 int num_devices_;
474 std::vector<std::shared_ptr<CudaDevice>> devices_;
475
476 int64_t host_bytes_allocated_;
477 };
478
CudaDeviceManager()479 CudaDeviceManager::CudaDeviceManager() { impl_.reset(new Impl()); }
480
481 std::unique_ptr<CudaDeviceManager> CudaDeviceManager::instance_ = nullptr;
482
Instance()483 Result<CudaDeviceManager*> CudaDeviceManager::Instance() {
484 static std::mutex mutex;
485 static std::atomic<bool> init_end(false);
486
487 if (!init_end) {
488 std::lock_guard<std::mutex> lock(mutex);
489 if (!init_end) {
490 instance_.reset(new CudaDeviceManager());
491 RETURN_NOT_OK(instance_->impl_->Init());
492 init_end = true;
493 }
494 }
495 return instance_.get();
496 }
497
GetInstance(CudaDeviceManager ** manager)498 Status CudaDeviceManager::GetInstance(CudaDeviceManager** manager) {
499 return Instance().Value(manager);
500 }
501
GetDevice(int device_number)502 Result<std::shared_ptr<CudaDevice>> CudaDeviceManager::GetDevice(int device_number) {
503 return impl_->GetDevice(device_number);
504 }
505
GetContext(int device_number)506 Result<std::shared_ptr<CudaContext>> CudaDeviceManager::GetContext(int device_number) {
507 return impl_->GetContext(device_number);
508 }
509
GetContext(int device_number,std::shared_ptr<CudaContext> * out)510 Status CudaDeviceManager::GetContext(int device_number,
511 std::shared_ptr<CudaContext>* out) {
512 return impl_->GetContext(device_number).Value(out);
513 }
514
GetSharedContext(int device_number,void * ctx)515 Result<std::shared_ptr<CudaContext>> CudaDeviceManager::GetSharedContext(
516 int device_number, void* ctx) {
517 return impl_->GetSharedContext(device_number, ctx);
518 }
519
GetSharedContext(int device_number,void * ctx,std::shared_ptr<CudaContext> * out)520 Status CudaDeviceManager::GetSharedContext(int device_number, void* ctx,
521 std::shared_ptr<CudaContext>* out) {
522 return impl_->GetSharedContext(device_number, ctx).Value(out);
523 }
524
AllocateHost(int device_number,int64_t nbytes)525 Result<std::shared_ptr<CudaHostBuffer>> CudaDeviceManager::AllocateHost(int device_number,
526 int64_t nbytes) {
527 uint8_t* data = nullptr;
528 RETURN_NOT_OK(impl_->AllocateHost(device_number, nbytes, &data));
529 return std::make_shared<CudaHostBuffer>(data, nbytes);
530 }
531
AllocateHost(int device_number,int64_t nbytes,std::shared_ptr<CudaHostBuffer> * out)532 Status CudaDeviceManager::AllocateHost(int device_number, int64_t nbytes,
533 std::shared_ptr<CudaHostBuffer>* out) {
534 return AllocateHost(device_number, nbytes).Value(out);
535 }
536
FreeHost(void * data,int64_t nbytes)537 Status CudaDeviceManager::FreeHost(void* data, int64_t nbytes) {
538 return impl_->FreeHost(data, nbytes);
539 }
540
num_devices() const541 int CudaDeviceManager::num_devices() const { return impl_->num_devices(); }
542
543 // ----------------------------------------------------------------------
544 // CudaContext public API
545
CudaContext()546 CudaContext::CudaContext() { impl_.reset(new Impl()); }
547
~CudaContext()548 CudaContext::~CudaContext() {}
549
Allocate(int64_t nbytes)550 Result<std::shared_ptr<CudaBuffer>> CudaContext::Allocate(int64_t nbytes) {
551 uint8_t* data = nullptr;
552 RETURN_NOT_OK(impl_->Allocate(nbytes, &data));
553 return std::make_shared<CudaBuffer>(data, nbytes, this->shared_from_this(), true);
554 }
555
Allocate(int64_t nbytes,std::shared_ptr<CudaBuffer> * out)556 Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr<CudaBuffer>* out) {
557 return Allocate(nbytes).Value(out);
558 }
559
View(uint8_t * data,int64_t nbytes)560 Result<std::shared_ptr<CudaBuffer>> CudaContext::View(uint8_t* data, int64_t nbytes) {
561 return std::make_shared<CudaBuffer>(data, nbytes, this->shared_from_this(), false);
562 }
563
View(uint8_t * data,int64_t nbytes,std::shared_ptr<CudaBuffer> * out)564 Status CudaContext::View(uint8_t* data, int64_t nbytes,
565 std::shared_ptr<CudaBuffer>* out) {
566 return View(data, nbytes).Value(out);
567 }
568
ExportIpcBuffer(void * data,int64_t size)569 Result<std::shared_ptr<CudaIpcMemHandle>> CudaContext::ExportIpcBuffer(void* data,
570 int64_t size) {
571 return impl_->ExportIpcBuffer(data, size);
572 }
573
CopyHostToDevice(uintptr_t dst,const void * src,int64_t nbytes)574 Status CudaContext::CopyHostToDevice(uintptr_t dst, const void* src, int64_t nbytes) {
575 return impl_->CopyHostToDevice(dst, src, nbytes);
576 }
577
CopyHostToDevice(void * dst,const void * src,int64_t nbytes)578 Status CudaContext::CopyHostToDevice(void* dst, const void* src, int64_t nbytes) {
579 return impl_->CopyHostToDevice(reinterpret_cast<uintptr_t>(dst), src, nbytes);
580 }
581
CopyDeviceToHost(void * dst,uintptr_t src,int64_t nbytes)582 Status CudaContext::CopyDeviceToHost(void* dst, uintptr_t src, int64_t nbytes) {
583 return impl_->CopyDeviceToHost(dst, src, nbytes);
584 }
585
CopyDeviceToHost(void * dst,const void * src,int64_t nbytes)586 Status CudaContext::CopyDeviceToHost(void* dst, const void* src, int64_t nbytes) {
587 return impl_->CopyDeviceToHost(dst, reinterpret_cast<uintptr_t>(src), nbytes);
588 }
589
CopyDeviceToDevice(uintptr_t dst,uintptr_t src,int64_t nbytes)590 Status CudaContext::CopyDeviceToDevice(uintptr_t dst, uintptr_t src, int64_t nbytes) {
591 return impl_->CopyDeviceToDevice(dst, src, nbytes);
592 }
593
CopyDeviceToDevice(void * dst,const void * src,int64_t nbytes)594 Status CudaContext::CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes) {
595 return impl_->CopyDeviceToDevice(reinterpret_cast<uintptr_t>(dst),
596 reinterpret_cast<uintptr_t>(src), nbytes);
597 }
598
CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext> & dst_ctx,uintptr_t dst,uintptr_t src,int64_t nbytes)599 Status CudaContext::CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
600 uintptr_t dst, uintptr_t src,
601 int64_t nbytes) {
602 return impl_->CopyDeviceToAnotherDevice(dst_ctx, dst, src, nbytes);
603 }
604
CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext> & dst_ctx,void * dst,const void * src,int64_t nbytes)605 Status CudaContext::CopyDeviceToAnotherDevice(const std::shared_ptr<CudaContext>& dst_ctx,
606 void* dst, const void* src,
607 int64_t nbytes) {
608 return impl_->CopyDeviceToAnotherDevice(dst_ctx, reinterpret_cast<uintptr_t>(dst),
609 reinterpret_cast<uintptr_t>(src), nbytes);
610 }
611
Synchronize(void)612 Status CudaContext::Synchronize(void) { return impl_->Synchronize(); }
613
Close()614 Status CudaContext::Close() { return impl_->Close(); }
615
Free(void * device_ptr,int64_t nbytes)616 Status CudaContext::Free(void* device_ptr, int64_t nbytes) {
617 return impl_->Free(device_ptr, nbytes);
618 }
619
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle)620 Result<std::shared_ptr<CudaBuffer>> CudaContext::OpenIpcBuffer(
621 const CudaIpcMemHandle& ipc_handle) {
622 if (ipc_handle.memory_size() > 0) {
623 ContextSaver set_temporary(*this);
624 uint8_t* data = nullptr;
625 RETURN_NOT_OK(impl_->OpenIpcBuffer(ipc_handle, &data));
626 // Need to ask the device how big the buffer is
627 size_t allocation_size = 0;
628 CU_RETURN_NOT_OK("cuMemGetAddressRange",
629 cuMemGetAddressRange(nullptr, &allocation_size,
630 reinterpret_cast<CUdeviceptr>(data)));
631 return std::make_shared<CudaBuffer>(data, allocation_size, this->shared_from_this(),
632 true, true);
633 } else {
634 // zero-sized buffer does not own data (which is nullptr), hence
635 // CloseIpcBuffer will not be called (see CudaBuffer::Close).
636 return std::make_shared<CudaBuffer>(nullptr, 0, this->shared_from_this(), false,
637 true);
638 }
639 }
640
OpenIpcBuffer(const CudaIpcMemHandle & ipc_handle,std::shared_ptr<CudaBuffer> * out)641 Status CudaContext::OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle,
642 std::shared_ptr<CudaBuffer>* out) {
643 if (ipc_handle.memory_size() > 0) {
644 ContextSaver set_temporary(*this);
645 uint8_t* data = nullptr;
646 RETURN_NOT_OK(impl_->OpenIpcBuffer(ipc_handle, &data));
647 // Need to ask the device how big the buffer is
648 size_t allocation_size = 0;
649 CU_RETURN_NOT_OK("cuMemGetAddressRange",
650 cuMemGetAddressRange(nullptr, &allocation_size,
651 reinterpret_cast<CUdeviceptr>(data)));
652 *out = std::make_shared<CudaBuffer>(data, allocation_size, this->shared_from_this(),
653 true, true);
654 } else {
655 // zero-sized buffer does not own data (which is nullptr), hence
656 // CloseIpcBuffer will not be called (see CudaBuffer::Close).
657 *out =
658 std::make_shared<CudaBuffer>(nullptr, 0, this->shared_from_this(), false, true);
659 }
660 return Status::OK();
661 }
662
CloseIpcBuffer(CudaBuffer * buf)663 Status CudaContext::CloseIpcBuffer(CudaBuffer* buf) {
664 ContextSaver set_temporary(*this);
665 CU_RETURN_NOT_OK("cuIpcCloseMemHandle", cuIpcCloseMemHandle(buf->address()));
666 return Status::OK();
667 }
668
bytes_allocated() const669 int64_t CudaContext::bytes_allocated() const { return impl_->bytes_allocated(); }
670
handle() const671 void* CudaContext::handle() const { return impl_->context_handle(); }
672
device() const673 std::shared_ptr<CudaDevice> CudaContext::device() const { return impl_->device(); }
674
memory_manager() const675 std::shared_ptr<CudaMemoryManager> CudaContext::memory_manager() const {
676 return impl_->memory_manager();
677 }
678
device_number() const679 int CudaContext::device_number() const { return impl_->device()->device_number(); }
680
GetDeviceAddress(uintptr_t addr)681 Result<uintptr_t> CudaContext::GetDeviceAddress(uintptr_t addr) {
682 ContextSaver set_temporary(*this);
683 CUdeviceptr ptr;
684 CU_RETURN_NOT_OK("cuPointerGetAttribute",
685 cuPointerGetAttribute(&ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
686 static_cast<CUdeviceptr>(addr)));
687 return static_cast<uintptr_t>(ptr);
688 }
689
GetDeviceAddress(uint8_t * addr)690 Result<uintptr_t> CudaContext::GetDeviceAddress(uint8_t* addr) {
691 return GetDeviceAddress(reinterpret_cast<uintptr_t>(addr));
692 }
693
GetDeviceAddress(uint8_t * addr,uint8_t ** devaddr)694 Status CudaContext::GetDeviceAddress(uint8_t* addr, uint8_t** devaddr) {
695 ARROW_ASSIGN_OR_RAISE(auto ptr, GetDeviceAddress(addr));
696 *devaddr = reinterpret_cast<uint8_t*>(ptr);
697 return Status::OK();
698 }
699
700 } // namespace cuda
701 } // namespace arrow
702