// arrow/gpu/cuda_memory.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/buffer.h"
#include "arrow/io/concurrency.h"
#include "arrow/type_fwd.h"

namespace arrow {
namespace cuda {

class CudaContext;
class CudaIpcMemHandle;

/// \class CudaBuffer
/// \brief An Arrow buffer located on a GPU device
///
/// Be careful using this in any Arrow code which may not be GPU-aware
class ARROW_EXPORT CudaBuffer : public Buffer {
 public:
  // XXX deprecate?
  /// \brief Construct a buffer from a raw pointer
  /// \param[in] data start of the memory area, passed as a byte pointer
  /// \param[in] size size of the memory area
  /// \param[in] context the CUDA context to associate with this buffer
  /// \param[in] own_data ownership flag, defaults to false
  /// \param[in] is_ipc IPC flag, defaults to false
  // NOTE(review): the exact effect of own_data / is_ipc is defined in the
  // implementation file; presumably they control how Close() releases the
  // memory -- confirm there before relying on it.
  CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr<CudaContext>& context,
             bool own_data = false, bool is_ipc = false);

  /// \brief Construct a buffer from an integer address
  ///
  /// Same parameters as the pointer-based constructor, except that the start
  /// of the memory area is given as a uintptr_t.
  CudaBuffer(uintptr_t address, int64_t size, const std::shared_ptr<CudaContext>& context,
             bool own_data = false, bool is_ipc = false);

  /// \brief Construct a buffer from a parent CudaBuffer, an offset and a size
  // NOTE(review): presumably a view of `size` bytes starting `offset` bytes
  // into `parent` -- confirm against the implementation.
  CudaBuffer(const std::shared_ptr<CudaBuffer>& parent, const int64_t offset,
             const int64_t size);

  ~CudaBuffer();

  /// \brief Convert back generic buffer into CudaBuffer
  /// \param[in] buffer buffer to convert
  /// \return CudaBuffer or Status
  ///
  /// \note This function returns an error if the buffer isn't backed
  /// by GPU memory
  static Result<std::shared_ptr<CudaBuffer>> FromBuffer(std::shared_ptr<Buffer> buffer);

  /// \brief Convert back generic buffer into CudaBuffer
  /// \param[in] buffer buffer to convert
  /// \param[out] out conversion result
  /// \return Status
  ///
  /// \note This function returns an error if the buffer isn't backed
  /// by GPU memory
  ARROW_DEPRECATED("Use Result-returning version")
  static Status FromBuffer(std::shared_ptr<Buffer> buffer,
                           std::shared_ptr<CudaBuffer>* out);

  /// \brief Copy memory from GPU device to CPU host
  /// \param[in] position start position inside buffer to copy bytes from
  /// \param[in] nbytes number of bytes to copy
  /// \param[out] out start address of the host memory area to copy to
  /// \return Status
  Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const;

  /// \brief Copy memory to device at position
  /// \param[in] position start position to copy bytes to
  /// \param[in] data the host data to copy
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes);

  /// \brief Copy memory from device to device at position
  /// \param[in] position start position inside buffer to copy bytes to
  /// \param[in] data start address of the device memory area to copy from
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  ///
  /// \note It is assumed that both source and destination device
  /// memories have been allocated within the same context.
  Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes);

  /// \brief Copy memory from another device to device at position
  /// \param[in] src_ctx context of the source device memory
  /// \param[in] position start position inside buffer to copy bytes to
  /// \param[in] data start address of the other device's memory area to copy from
  /// \param[in] nbytes number of bytes to copy
  /// \return Status
  Status CopyFromAnotherDevice(const std::shared_ptr<CudaContext>& src_ctx,
                               const int64_t position, const void* data, int64_t nbytes);

  /// \brief Expose this device buffer as IPC memory which can be used in other processes
  /// \return Handle or Status
  ///
  /// \note After calling this function, this device memory will not be freed
  /// when the CudaBuffer is destructed
  virtual Result<std::shared_ptr<CudaIpcMemHandle>> ExportForIpc();

  /// \brief Expose this device buffer as IPC memory which can be used in other processes
  /// \param[out] handle the exported IPC handle
  /// \return Status
  ///
  /// \note After calling this function, this device memory will not be freed
  /// when the CudaBuffer is destructed
  ARROW_DEPRECATED("Use Result-returning version")
  virtual Status ExportForIpc(std::shared_ptr<CudaIpcMemHandle>* handle);

  /// \brief Return the CUDA context held by this buffer
  const std::shared_ptr<CudaContext>& context() const { return context_; }

 protected:
  // CUDA context returned by context().
  std::shared_ptr<CudaContext> context_;
  // NOTE(review): presumably set from the constructors' own_data / is_ipc
  // arguments -- confirm in the implementation.
  bool own_data_;
  bool is_ipc_;

  // Virtual so that subclasses can customize it; the release logic itself
  // lives in the implementation file.
  virtual Status Close();
};

/// \class CudaHostBuffer
/// \brief Device-accessible CPU memory created using cudaHostAlloc
class ARROW_EXPORT CudaHostBuffer : public MutableBuffer {
 public:
  // Inherit all of MutableBuffer's constructors unchanged.
  using MutableBuffer::MutableBuffer;
  // Defined out of line in the implementation file.
  ~CudaHostBuffer();

  /// \brief Return a device address the GPU can read this memory from.
  /// \param[in] ctx the CUDA context used for the lookup
  /// \return Device address or Status
  Result<uintptr_t> GetDeviceAddress(const std::shared_ptr<CudaContext>& ctx);
};

/// \class CudaIpcMemHandle
/// \brief A container for a CUDA IPC handle
///
/// Instances cannot be constructed directly (the constructors are private);
/// obtain one through FromBuffer() or through the friend classes CudaBuffer
/// and CudaContext.
class ARROW_EXPORT CudaIpcMemHandle {
 public:
  // Declared here and defined out of line because impl_ points to a type that
  // is incomplete in this header.
  ~CudaIpcMemHandle();

  /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process)
  /// \param[in] opaque_handle a CUipcMemHandle as a const void*
  /// \return Handle or Status
  static Result<std::shared_ptr<CudaIpcMemHandle>> FromBuffer(const void* opaque_handle);

  /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process)
  /// \param[in] opaque_handle a CUipcMemHandle as a const void*
  /// \param[out] handle the CudaIpcMemHandle instance
  /// \return Status
  ARROW_DEPRECATED("Use Result-returning version")
  static Status FromBuffer(const void* opaque_handle,
                           std::shared_ptr<CudaIpcMemHandle>* handle);

  /// \brief Write CudaIpcMemHandle to a Buffer
  /// \param[in] pool a MemoryPool to allocate memory from
  /// \return Buffer or Status
  Result<std::shared_ptr<Buffer>> Serialize(
      MemoryPool* pool = default_memory_pool()) const;

  /// \brief Write CudaIpcMemHandle to a Buffer
  /// \param[in] pool a MemoryPool to allocate memory from
  /// \param[out] out the serialized buffer
  /// \return Status
  ARROW_DEPRECATED("Use Result-returning version")
  Status Serialize(MemoryPool* pool, std::shared_ptr<Buffer>* out) const;

 private:
  // Private constructors: only this class and its friends create handles.
  explicit CudaIpcMemHandle(const void* handle);
  CudaIpcMemHandle(int64_t memory_size, const void* cu_handle);

  // Pointer-to-implementation; CudaIpcMemHandleImpl is only forward-declared
  // here and defined in the implementation file.
  struct CudaIpcMemHandleImpl;
  std::unique_ptr<CudaIpcMemHandleImpl> impl_;

  // Private accessors, reachable only from this class and the friends below.
  const void* handle() const;
  int64_t memory_size() const;

  // CudaBuffer and CudaContext need access to the private constructors and
  // accessors above.
  friend CudaBuffer;
  friend CudaContext;
};

/// \class CudaBufferReader
/// \brief File interface for zero-copy read from CUDA buffers
///
/// CAUTION: the Buffer-returning reads hand back a Buffer that points to
/// device memory, which Arrow code expecting CPU memory will generally not be
/// able to consume. The raw-pointer reads, in contrast, copy the device
/// memory into the host memory pointed to.
class ARROW_EXPORT CudaBufferReader
    : public ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader> {
 public:
  /// \brief Create a reader over the given buffer
  /// \param[in] buffer the buffer to read from
  explicit CudaBufferReader(const std::shared_ptr<Buffer>& buffer);

  bool closed() const override;

  bool supports_zero_copy() const override;

  /// \brief Return the CudaBuffer this reader operates on
  std::shared_ptr<CudaBuffer> buffer() const { return buffer_; }

 protected:
  // The concurrency wrapper base class needs access to the Do* hooks below.
  friend ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader>;

  Status DoClose();

  // Reads at the current position: into caller-provided memory, or as a Buffer.
  Result<int64_t> DoRead(int64_t nbytes, void* buffer);
  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
  // Reads at an explicit position: into caller-provided memory, or as a Buffer.
  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);

  Result<int64_t> DoTell() const;
  Status DoSeek(int64_t position);
  Result<int64_t> DoGetSize();

  /// \brief Return Status::OK() while the reader is open, Invalid otherwise
  Status CheckClosed() const {
    return is_open_
               ? Status::OK()
               : Status::Invalid("Operation forbidden on closed CudaBufferReader");
  }

  // Buffer being read, as returned by buffer().
  std::shared_ptr<CudaBuffer> buffer_;
  std::shared_ptr<CudaContext> context_;
  const uintptr_t address_;
  int64_t size_;
  int64_t position_;
  // Checked by CheckClosed().
  bool is_open_;
};

/// \class CudaBufferWriter
/// \brief File interface for writing to CUDA buffers, with optional buffering
///
/// Writes are unbuffered unless a host buffer size is set with
/// SetBufferSize().
class ARROW_EXPORT CudaBufferWriter : public io::WritableFile {
 public:
  /// \brief Create a writer targeting the given CUDA buffer
  /// \param[in] buffer the device buffer to write into
  explicit CudaBufferWriter(const std::shared_ptr<CudaBuffer>& buffer);
  ~CudaBufferWriter() override;

  /// \brief Close writer and flush buffered bytes to GPU
  Status Close() override;

  bool closed() const override;

  /// \brief Flush buffered bytes to GPU
  Status Flush() override;

  // The overrides below implement the inherited writable-file interface; see
  // the base class for their general contract.
  Status Seek(int64_t position) override;

  Status Write(const void* data, int64_t nbytes) override;

  Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;

  Result<int64_t> Tell() const override;

  /// \brief Set CPU buffer size to limit calls to cudaMemcpy
  /// \param[in] buffer_size the size of CPU buffer to allocate
  /// \return Status
  ///
  /// By default writes are unbuffered
  Status SetBufferSize(const int64_t buffer_size);

  /// \brief Returns size of host (CPU) buffer, 0 for unbuffered
  int64_t buffer_size() const;

  /// \brief Returns number of bytes buffered on host
  int64_t num_bytes_buffered() const;

 private:
  // Pointer-to-implementation; CudaBufferWriterImpl is only forward-declared
  // here and defined in the implementation file.
  class CudaBufferWriterImpl;
  std::unique_ptr<CudaBufferWriterImpl> impl_;
};

/// \brief Allocate CUDA-accessible memory on CPU host
///
/// The GPU will benefit from fast access to this CPU-located buffer,
/// including fast memory copy.
///
/// \param[in] device_number device to expose host memory to
/// \param[in] size number of bytes
/// \return Host buffer or Status
ARROW_EXPORT
Result<std::shared_ptr<CudaHostBuffer>> AllocateCudaHostBuffer(int device_number,
                                                               const int64_t size);

/// \brief Allocate CUDA-accessible memory on CPU host
///
/// The GPU will benefit from fast access to this CPU-located buffer,
/// including fast memory copy.
///
/// \param[in] device_number device to expose host memory to
/// \param[in] size number of bytes
/// \param[out] out the allocated buffer
/// \return Status
ARROW_DEPRECATED("Use Result-returning version")
ARROW_EXPORT
Status AllocateCudaHostBuffer(int device_number, const int64_t size,
                              std::shared_ptr<CudaHostBuffer>* out);

/// \brief Low-level: get a device address through which the CPU data can be accessed.
///
/// \param[in] cpu_data address of the data in CPU (host) memory
/// \param[in] ctx the CUDA context used for the lookup
/// \return Device address or Status
// Exported like the other free functions in this header, so that code linking
// against the shared library can call it.
ARROW_EXPORT
Result<uintptr_t> GetDeviceAddress(const uint8_t* cpu_data,
                                   const std::shared_ptr<CudaContext>& ctx);

/// \brief Low-level: get a CPU address through which the device data can be accessed.
///
/// \param[in] device_ptr address of the data on the device
/// \return CPU address or Status
// Exported like the other free functions in this header, so that code linking
// against the shared library can call it.
ARROW_EXPORT
Result<uint8_t*> GetHostAddress(uintptr_t device_ptr);

}  // namespace cuda
}  // namespace arrow