// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/buffer.h"
#include "arrow/io/concurrency.h"
#include "arrow/type_fwd.h"

27 namespace arrow {
28 namespace cuda {
29 
30 class CudaContext;
31 class CudaIpcMemHandle;
32 
33 /// \class CudaBuffer
34 /// \brief An Arrow buffer located on a GPU device
35 ///
36 /// Be careful using this in any Arrow code which may not be GPU-aware
37 class ARROW_EXPORT CudaBuffer : public Buffer {
38  public:
39   // XXX deprecate?
40   CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr<CudaContext>& context,
41              bool own_data = false, bool is_ipc = false);
42 
43   CudaBuffer(uintptr_t address, int64_t size, const std::shared_ptr<CudaContext>& context,
44              bool own_data = false, bool is_ipc = false);
45 
46   CudaBuffer(const std::shared_ptr<CudaBuffer>& parent, const int64_t offset,
47              const int64_t size);
48 
49   ~CudaBuffer();
50 
51   /// \brief Convert back generic buffer into CudaBuffer
52   /// \param[in] buffer buffer to convert
53   /// \return CudaBuffer or Status
54   ///
55   /// \note This function returns an error if the buffer isn't backed
56   /// by GPU memory
57   static Result<std::shared_ptr<CudaBuffer>> FromBuffer(std::shared_ptr<Buffer> buffer);
58 
59   /// \brief Convert back generic buffer into CudaBuffer
60   /// \param[in] buffer buffer to convert
61   /// \param[out] out conversion result
62   /// \return Status
63   ///
64   /// \note This function returns an error if the buffer isn't backed
65   /// by GPU memory
66   ARROW_DEPRECATED("Use Result-returning version")
67   static Status FromBuffer(std::shared_ptr<Buffer> buffer,
68                            std::shared_ptr<CudaBuffer>* out);
69 
70   /// \brief Copy memory from GPU device to CPU host
71   /// \param[in] position start position inside buffer to copy bytes from
72   /// \param[in] nbytes number of bytes to copy
73   /// \param[out] out start address of the host memory area to copy to
74   /// \return Status
75   Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const;
76 
77   /// \brief Copy memory to device at position
78   /// \param[in] position start position to copy bytes to
79   /// \param[in] data the host data to copy
80   /// \param[in] nbytes number of bytes to copy
81   /// \return Status
82   Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes);
83 
84   /// \brief Copy memory from device to device at position
85   /// \param[in] position start position inside buffer to copy bytes to
86   /// \param[in] data start address of the device memory area to copy from
87   /// \param[in] nbytes number of bytes to copy
88   /// \return Status
89   ///
90   /// \note It is assumed that both source and destination device
91   /// memories have been allocated within the same context.
92   Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes);
93 
94   /// \brief Copy memory from another device to device at position
95   /// \param[in] src_ctx context of the source device memory
96   /// \param[in] position start position inside buffer to copy bytes to
97   /// \param[in] data start address of the another device memory area to copy from
98   /// \param[in] nbytes number of bytes to copy
99   /// \return Status
100   Status CopyFromAnotherDevice(const std::shared_ptr<CudaContext>& src_ctx,
101                                const int64_t position, const void* data, int64_t nbytes);
102 
103   /// \brief Expose this device buffer as IPC memory which can be used in other processes
104   /// \return Handle or Status
105   ///
106   /// \note After calling this function, this device memory will not be freed
107   /// when the CudaBuffer is destructed
108   virtual Result<std::shared_ptr<CudaIpcMemHandle>> ExportForIpc();
109 
110   /// \brief Expose this device buffer as IPC memory which can be used in other processes
111   /// \param[out] handle the exported IPC handle
112   /// \return Status
113   ///
114   /// \note After calling this function, this device memory will not be freed
115   /// when the CudaBuffer is destructed
116   ARROW_DEPRECATED("Use Result-returning version")
117   virtual Status ExportForIpc(std::shared_ptr<CudaIpcMemHandle>* handle);
118 
context()119   const std::shared_ptr<CudaContext>& context() const { return context_; }
120 
121  protected:
122   std::shared_ptr<CudaContext> context_;
123   bool own_data_;
124   bool is_ipc_;
125 
126   virtual Status Close();
127 };
128 
129 /// \class CudaHostBuffer
130 /// \brief Device-accessible CPU memory created using cudaHostAlloc
131 class ARROW_EXPORT CudaHostBuffer : public MutableBuffer {
132  public:
133   using MutableBuffer::MutableBuffer;
134   ~CudaHostBuffer();
135 
136   /// \brief Return a device address the GPU can read this memory from.
137   Result<uintptr_t> GetDeviceAddress(const std::shared_ptr<CudaContext>& ctx);
138 };
139 
140 /// \class CudaIpcHandle
141 /// \brief A container for a CUDA IPC handle
142 class ARROW_EXPORT CudaIpcMemHandle {
143  public:
144   ~CudaIpcMemHandle();
145 
146   /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process)
147   /// \param[in] opaque_handle a CUipcMemHandle as a const void*
148   /// \return Handle or Status
149   static Result<std::shared_ptr<CudaIpcMemHandle>> FromBuffer(const void* opaque_handle);
150 
151   /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process)
152   /// \param[in] opaque_handle a CUipcMemHandle as a const void*
153   /// \param[out] handle the CudaIpcMemHandle instance
154   /// \return Status
155   ARROW_DEPRECATED("Use Result-returning version")
156   static Status FromBuffer(const void* opaque_handle,
157                            std::shared_ptr<CudaIpcMemHandle>* handle);
158 
159   /// \brief Write CudaIpcMemHandle to a Buffer
160   /// \param[in] pool a MemoryPool to allocate memory from
161   /// \return Buffer or Status
162   Result<std::shared_ptr<Buffer>> Serialize(
163       MemoryPool* pool = default_memory_pool()) const;
164 
165   /// \brief Write CudaIpcMemHandle to a Buffer
166   /// \param[in] pool a MemoryPool to allocate memory from
167   /// \param[out] out the serialized buffer
168   /// \return Status
169   ARROW_DEPRECATED("Use Result-returning version")
170   Status Serialize(MemoryPool* pool, std::shared_ptr<Buffer>* out) const;
171 
172  private:
173   explicit CudaIpcMemHandle(const void* handle);
174   CudaIpcMemHandle(int64_t memory_size, const void* cu_handle);
175 
176   struct CudaIpcMemHandleImpl;
177   std::unique_ptr<CudaIpcMemHandleImpl> impl_;
178 
179   const void* handle() const;
180   int64_t memory_size() const;
181 
182   friend CudaBuffer;
183   friend CudaContext;
184 };
185 
186 /// \class CudaBufferReader
187 /// \brief File interface for zero-copy read from CUDA buffers
188 ///
189 /// CAUTION: reading to a Buffer returns a Buffer pointing to device memory.
190 /// It will generally not be compatible with Arrow code expecting a buffer
191 /// pointing to CPU memory.
192 /// Reading to a raw pointer, though, copies device memory into the host
193 /// memory pointed to.
194 class ARROW_EXPORT CudaBufferReader
195     : public ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader> {
196  public:
197   explicit CudaBufferReader(const std::shared_ptr<Buffer>& buffer);
198 
199   bool closed() const override;
200 
201   bool supports_zero_copy() const override;
202 
buffer()203   std::shared_ptr<CudaBuffer> buffer() const { return buffer_; }
204 
205  protected:
206   friend ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader>;
207 
208   Status DoClose();
209 
210   Result<int64_t> DoRead(int64_t nbytes, void* buffer);
211   Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
212   Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
213   Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
214 
215   Result<int64_t> DoTell() const;
216   Status DoSeek(int64_t position);
217   Result<int64_t> DoGetSize();
218 
CheckClosed()219   Status CheckClosed() const {
220     if (!is_open_) {
221       return Status::Invalid("Operation forbidden on closed CudaBufferReader");
222     }
223     return Status::OK();
224   }
225 
226   std::shared_ptr<CudaBuffer> buffer_;
227   std::shared_ptr<CudaContext> context_;
228   const uintptr_t address_;
229   int64_t size_;
230   int64_t position_;
231   bool is_open_;
232 };
233 
234 /// \class CudaBufferWriter
235 /// \brief File interface for writing to CUDA buffers, with optional buffering
236 class ARROW_EXPORT CudaBufferWriter : public io::WritableFile {
237  public:
238   explicit CudaBufferWriter(const std::shared_ptr<CudaBuffer>& buffer);
239   ~CudaBufferWriter() override;
240 
241   /// \brief Close writer and flush buffered bytes to GPU
242   Status Close() override;
243 
244   bool closed() const override;
245 
246   /// \brief Flush buffered bytes to GPU
247   Status Flush() override;
248 
249   Status Seek(int64_t position) override;
250 
251   Status Write(const void* data, int64_t nbytes) override;
252 
253   Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
254 
255   Result<int64_t> Tell() const override;
256 
257   /// \brief Set CPU buffer size to limit calls to cudaMemcpy
258   /// \param[in] buffer_size the size of CPU buffer to allocate
259   /// \return Status
260   ///
261   /// By default writes are unbuffered
262   Status SetBufferSize(const int64_t buffer_size);
263 
264   /// \brief Returns size of host (CPU) buffer, 0 for unbuffered
265   int64_t buffer_size() const;
266 
267   /// \brief Returns number of bytes buffered on host
268   int64_t num_bytes_buffered() const;
269 
270  private:
271   class CudaBufferWriterImpl;
272   std::unique_ptr<CudaBufferWriterImpl> impl_;
273 };
274 
275 /// \brief Allocate CUDA-accessible memory on CPU host
276 ///
277 /// The GPU will benefit from fast access to this CPU-located buffer,
278 /// including fast memory copy.
279 ///
280 /// \param[in] device_number device to expose host memory
281 /// \param[in] size number of bytes
282 /// \return Host buffer or Status
283 ARROW_EXPORT
284 Result<std::shared_ptr<CudaHostBuffer>> AllocateCudaHostBuffer(int device_number,
285                                                                const int64_t size);
286 
287 /// \brief Allocate CUDA-accessible memory on CPU host
288 ///
289 /// The GPU will benefit from fast access to this CPU-located buffer,
290 /// including fast memory copy.
291 ///
292 /// \param[in] device_number device to expose host memory
293 /// \param[in] size number of bytes
294 /// \param[out] out the allocated buffer
295 /// \return Status
296 ARROW_DEPRECATED("Use Result-returning version")
297 ARROW_EXPORT
298 Status AllocateCudaHostBuffer(int device_number, const int64_t size,
299                               std::shared_ptr<CudaHostBuffer>* out);
300 
301 /// Low-level: get a device address through which the CPU data be accessed.
302 Result<uintptr_t> GetDeviceAddress(const uint8_t* cpu_data,
303                                    const std::shared_ptr<CudaContext>& ctx);
304 
305 /// Low-level: get a CPU address through which the device data be accessed.
306 Result<uint8_t*> GetHostAddress(uintptr_t device_ptr);
307 
}  // namespace cuda
}  // namespace arrow