1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <cstdint> 21 #include <memory> 22 23 #include "arrow/buffer.h" 24 #include "arrow/io/concurrency.h" 25 #include "arrow/type_fwd.h" 26 27 namespace arrow { 28 namespace cuda { 29 30 class CudaContext; 31 class CudaIpcMemHandle; 32 33 /// \class CudaBuffer 34 /// \brief An Arrow buffer located on a GPU device 35 /// 36 /// Be careful using this in any Arrow code which may not be GPU-aware 37 class ARROW_EXPORT CudaBuffer : public Buffer { 38 public: 39 // XXX deprecate? 40 CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr<CudaContext>& context, 41 bool own_data = false, bool is_ipc = false); 42 43 CudaBuffer(uintptr_t address, int64_t size, const std::shared_ptr<CudaContext>& context, 44 bool own_data = false, bool is_ipc = false); 45 46 CudaBuffer(const std::shared_ptr<CudaBuffer>& parent, const int64_t offset, 47 const int64_t size); 48 49 ~CudaBuffer(); 50 51 /// \brief Convert back generic buffer into CudaBuffer 52 /// \param[in] buffer buffer to convert 53 /// \return CudaBuffer or Status 54 /// 55 /// \note This function returns an error if the buffer isn't backed 56 /// by GPU memory 57 static Result<std::shared_ptr<CudaBuffer>> FromBuffer(std::shared_ptr<Buffer> buffer); 58 59 /// \brief Convert back generic buffer into CudaBuffer 60 /// \param[in] buffer buffer to convert 61 /// \param[out] out conversion result 62 /// \return Status 63 /// 64 /// \note This function returns an error if the buffer isn't backed 65 /// by GPU memory 66 ARROW_DEPRECATED("Use Result-returning version") 67 static Status FromBuffer(std::shared_ptr<Buffer> buffer, 68 std::shared_ptr<CudaBuffer>* out); 69 70 /// \brief Copy memory from GPU device to CPU host 71 /// \param[in] position start position inside buffer to copy bytes from 72 /// \param[in] nbytes number of bytes to copy 73 /// \param[out] out start address of the host memory area to copy to 74 /// \return Status 75 Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const; 76 77 /// \brief Copy memory to device at position 78 /// \param[in] position start position to copy bytes to 79 /// \param[in] data the host data to copy 80 /// \param[in] nbytes number of bytes to copy 81 /// \return Status 82 Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes); 83 84 /// \brief Copy memory from device to device at position 85 /// \param[in] position start position inside buffer to copy bytes to 86 /// \param[in] data start address of the device memory area to copy from 87 /// \param[in] nbytes number of bytes to copy 88 /// \return Status 89 /// 90 /// \note It is assumed that both source and destination device 91 /// memories have been allocated within the same context. 92 Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes); 93 94 /// \brief Copy memory from another device to device at position 95 /// \param[in] src_ctx context of the source device memory 96 /// \param[in] position start position inside buffer to copy bytes to 97 /// \param[in] data start address of the another device memory area to copy from 98 /// \param[in] nbytes number of bytes to copy 99 /// \return Status 100 Status CopyFromAnotherDevice(const std::shared_ptr<CudaContext>& src_ctx, 101 const int64_t position, const void* data, int64_t nbytes); 102 103 /// \brief Expose this device buffer as IPC memory which can be used in other processes 104 /// \return Handle or Status 105 /// 106 /// \note After calling this function, this device memory will not be freed 107 /// when the CudaBuffer is destructed 108 virtual Result<std::shared_ptr<CudaIpcMemHandle>> ExportForIpc(); 109 110 /// \brief Expose this device buffer as IPC memory which can be used in other processes 111 /// \param[out] handle the exported IPC handle 112 /// \return Status 113 /// 114 /// \note After calling this function, this device memory will not be freed 115 /// when the CudaBuffer is destructed 116 ARROW_DEPRECATED("Use Result-returning version") 117 virtual Status ExportForIpc(std::shared_ptr<CudaIpcMemHandle>* handle); 118 context()119 const std::shared_ptr<CudaContext>& context() const { return context_; } 120 121 protected: 122 std::shared_ptr<CudaContext> context_; 123 bool own_data_; 124 bool is_ipc_; 125 126 virtual Status Close(); 127 }; 128 129 /// \class CudaHostBuffer 130 /// \brief Device-accessible CPU memory created using cudaHostAlloc 131 class ARROW_EXPORT CudaHostBuffer : public MutableBuffer { 132 public: 133 using MutableBuffer::MutableBuffer; 134 ~CudaHostBuffer(); 135 136 /// \brief Return a device address the GPU can read this memory from. 137 Result<uintptr_t> GetDeviceAddress(const std::shared_ptr<CudaContext>& ctx); 138 }; 139 140 /// \class CudaIpcHandle 141 /// \brief A container for a CUDA IPC handle 142 class ARROW_EXPORT CudaIpcMemHandle { 143 public: 144 ~CudaIpcMemHandle(); 145 146 /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process) 147 /// \param[in] opaque_handle a CUipcMemHandle as a const void* 148 /// \return Handle or Status 149 static Result<std::shared_ptr<CudaIpcMemHandle>> FromBuffer(const void* opaque_handle); 150 151 /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process) 152 /// \param[in] opaque_handle a CUipcMemHandle as a const void* 153 /// \param[out] handle the CudaIpcMemHandle instance 154 /// \return Status 155 ARROW_DEPRECATED("Use Result-returning version") 156 static Status FromBuffer(const void* opaque_handle, 157 std::shared_ptr<CudaIpcMemHandle>* handle); 158 159 /// \brief Write CudaIpcMemHandle to a Buffer 160 /// \param[in] pool a MemoryPool to allocate memory from 161 /// \return Buffer or Status 162 Result<std::shared_ptr<Buffer>> Serialize( 163 MemoryPool* pool = default_memory_pool()) const; 164 165 /// \brief Write CudaIpcMemHandle to a Buffer 166 /// \param[in] pool a MemoryPool to allocate memory from 167 /// \param[out] out the serialized buffer 168 /// \return Status 169 ARROW_DEPRECATED("Use Result-returning version") 170 Status Serialize(MemoryPool* pool, std::shared_ptr<Buffer>* out) const; 171 172 private: 173 explicit CudaIpcMemHandle(const void* handle); 174 CudaIpcMemHandle(int64_t memory_size, const void* cu_handle); 175 176 struct CudaIpcMemHandleImpl; 177 std::unique_ptr<CudaIpcMemHandleImpl> impl_; 178 179 const void* handle() const; 180 int64_t memory_size() const; 181 182 friend CudaBuffer; 183 friend CudaContext; 184 }; 185 186 /// \class CudaBufferReader 187 /// \brief File interface for zero-copy read from CUDA buffers 188 /// 189 /// CAUTION: reading to a Buffer returns a Buffer pointing to device memory. 190 /// It will generally not be compatible with Arrow code expecting a buffer 191 /// pointing to CPU memory. 192 /// Reading to a raw pointer, though, copies device memory into the host 193 /// memory pointed to. 194 class ARROW_EXPORT CudaBufferReader 195 : public ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader> { 196 public: 197 explicit CudaBufferReader(const std::shared_ptr<Buffer>& buffer); 198 199 bool closed() const override; 200 201 bool supports_zero_copy() const override; 202 buffer()203 std::shared_ptr<CudaBuffer> buffer() const { return buffer_; } 204 205 protected: 206 friend ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader>; 207 208 Status DoClose(); 209 210 Result<int64_t> DoRead(int64_t nbytes, void* buffer); 211 Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes); 212 Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out); 213 Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes); 214 215 Result<int64_t> DoTell() const; 216 Status DoSeek(int64_t position); 217 Result<int64_t> DoGetSize(); 218 CheckClosed()219 Status CheckClosed() const { 220 if (!is_open_) { 221 return Status::Invalid("Operation forbidden on closed CudaBufferReader"); 222 } 223 return Status::OK(); 224 } 225 226 std::shared_ptr<CudaBuffer> buffer_; 227 std::shared_ptr<CudaContext> context_; 228 const uintptr_t address_; 229 int64_t size_; 230 int64_t position_; 231 bool is_open_; 232 }; 233 234 /// \class CudaBufferWriter 235 /// \brief File interface for writing to CUDA buffers, with optional buffering 236 class ARROW_EXPORT CudaBufferWriter : public io::WritableFile { 237 public: 238 explicit CudaBufferWriter(const std::shared_ptr<CudaBuffer>& buffer); 239 ~CudaBufferWriter() override; 240 241 /// \brief Close writer and flush buffered bytes to GPU 242 Status Close() override; 243 244 bool closed() const override; 245 246 /// \brief Flush buffered bytes to GPU 247 Status Flush() override; 248 249 Status Seek(int64_t position) override; 250 251 Status Write(const void* data, int64_t nbytes) override; 252 253 Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; 254 255 Result<int64_t> Tell() const override; 256 257 /// \brief Set CPU buffer size to limit calls to cudaMemcpy 258 /// \param[in] buffer_size the size of CPU buffer to allocate 259 /// \return Status 260 /// 261 /// By default writes are unbuffered 262 Status SetBufferSize(const int64_t buffer_size); 263 264 /// \brief Returns size of host (CPU) buffer, 0 for unbuffered 265 int64_t buffer_size() const; 266 267 /// \brief Returns number of bytes buffered on host 268 int64_t num_bytes_buffered() const; 269 270 private: 271 class CudaBufferWriterImpl; 272 std::unique_ptr<CudaBufferWriterImpl> impl_; 273 }; 274 275 /// \brief Allocate CUDA-accessible memory on CPU host 276 /// 277 /// The GPU will benefit from fast access to this CPU-located buffer, 278 /// including fast memory copy. 279 /// 280 /// \param[in] device_number device to expose host memory 281 /// \param[in] size number of bytes 282 /// \return Host buffer or Status 283 ARROW_EXPORT 284 Result<std::shared_ptr<CudaHostBuffer>> AllocateCudaHostBuffer(int device_number, 285 const int64_t size); 286 287 /// \brief Allocate CUDA-accessible memory on CPU host 288 /// 289 /// The GPU will benefit from fast access to this CPU-located buffer, 290 /// including fast memory copy. 291 /// 292 /// \param[in] device_number device to expose host memory 293 /// \param[in] size number of bytes 294 /// \param[out] out the allocated buffer 295 /// \return Status 296 ARROW_DEPRECATED("Use Result-returning version") 297 ARROW_EXPORT 298 Status AllocateCudaHostBuffer(int device_number, const int64_t size, 299 std::shared_ptr<CudaHostBuffer>* out); 300 301 /// Low-level: get a device address through which the CPU data be accessed. 302 Result<uintptr_t> GetDeviceAddress(const uint8_t* cpu_data, 303 const std::shared_ptr<CudaContext>& ctx); 304 305 /// Low-level: get a CPU address through which the device data be accessed. 306 Result<uint8_t*> GetHostAddress(uintptr_t device_ptr); 307 308 } // namespace cuda 309 } // namespace arrow 310