#ifndef STAN_MATH_OPENCL_COPY_HPP #define STAN_MATH_OPENCL_COPY_HPP #ifdef STAN_OPENCL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace stan { namespace math { /** \ingroup opencl * Copies the source Eigen matrix, `std::vector` or scalar to the destination * matrix that is stored on the OpenCL device. The function also accepts * `matrix_cl`s in which case it just returns the argument. If a lvalue matrix * is passed to this function the caller must make sure that the matrix does not * go out of scope before copying is complete. * * That means `.wait()` must be called on the event associated on copying or * any other event that requires completion of this event. This can be done by * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on * returned matrix or any matrix that is calculated from that one. * * @param src source Eigen matrix * @return matrix_cl with a copy of the data in the source matrix */ template * = nullptr> inline matrix_cl> to_matrix_cl(T&& src) { return matrix_cl>(std::forward(src)); } /** \ingroup opencl * Copies the source matrix that is stored on the OpenCL device to the * destination Eigen matrix. * * @tparam T_ret destination type * @tparam T scalar in the source matrix * @param src source matrix on the OpenCL device * @return Eigen matrix with a copy of the data in the source matrix */ template * = nullptr, require_matrix_cl_t* = nullptr, require_st_same* = nullptr> inline auto from_matrix_cl(const T& src) { using T_val = value_type_t; using T_ret_col_major = Eigen::Matrix, T_ret::RowsAtCompileTime, T_ret::ColsAtCompileTime>; T_ret_col_major dst(src.rows(), src.cols()); if (src.size() == 0) { return dst; } if ((src.view() == matrix_cl_view::Lower || src.view() == matrix_cl_view::Upper) && src.rows() == src.cols()) { using T_not_bool = std::conditional_t::value, char, T_val>; std::vector packed = packed_copy(src); size_t pos = 0; if (src.view() == matrix_cl_view::Lower) { for (int j = 0; j < src.cols(); ++j) { for (int k = 0; k < j; ++k) { dst.coeffRef(k, j) = 0; } for (int i = j; i < src.cols(); ++i) { dst.coeffRef(i, j) = packed[pos++]; } } } else { for (int j = 0; j < src.cols(); ++j) { for (int i = 0; i <= j; ++i) { dst.coeffRef(i, j) = packed[pos++]; } for (int k = j + 1; k < src.cols(); ++k) { dst.coeffRef(k, j) = 0; } } } } else { try { cl::Event copy_event; const cl::CommandQueue queue = opencl_context.queue(); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, sizeof(T_val) * dst.size(), dst.data(), &src.write_events(), ©_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { check_opencl_error("copy (OpenCL)->Eigen", e); } if (!contains_nonzero(src.view(), matrix_cl_view::Lower)) { dst.template triangularView() = T_ret_col_major::Zero(dst.rows(), dst.cols()); } if (!contains_nonzero(src.view(), matrix_cl_view::Upper)) { dst.template triangularView() = T_ret_col_major::Zero(dst.rows(), dst.cols()); } } return dst; } /** \ingroup opencl * Copies result of a kernel generator expression to the specified * destination type. * * @tparam T_ret destination type * @tparam T type of expression * @param src source expression * @return Eigen matrix with a copy of the data in the source matrix */ template * = nullptr, require_not_matrix_cl_t* = nullptr> inline auto from_matrix_cl(const T& src) { return from_matrix_cl(src.eval()); } /** \ingroup opencl * Copy A 1 by 1 source matrix from the Device to the host. * @tparam T An arithmetic type to pass the value from the OpenCL matrix to. * @param src A 1x1 matrix on the device. * @return dst Arithmetic to receive the matrix_cl value. */ template * = nullptr, require_same_t* = nullptr> inline T_dst from_matrix_cl(const matrix_cl& src) { T dst; check_size_match("from_matrix_cl", "src.rows()", src.rows(), "dst.rows()", 1); check_size_match("from_matrix_cl", "src.cols()", src.cols(), "dst.cols()", 1); try { cl::Event copy_event; const cl::CommandQueue queue = opencl_context.queue(); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, sizeof(T), &dst, &src.write_events(), ©_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { check_opencl_error("from_matrix_cl", e); } return dst; } /** \ingroup opencl * Copies the source matrix that is stored on the OpenCL device to the * destination `std::vector`. * * @tparam T_dst destination type * @tparam T scalar in the source matrix * @param src source matrix on the OpenCL device * @return `std::vector` with a copy of the data in the source matrix */ template * = nullptr, require_all_st_same* = nullptr> inline T_dst from_matrix_cl(const matrix_cl& src) { check_size_match("from_matrix_cl", "src.cols()", src.cols(), "dst.cols()", 1); T_dst dst(src.rows()); if (src.rows() == 0) { return dst; } try { cl::Event copy_event; const cl::CommandQueue queue = opencl_context.queue(); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, sizeof(T) * src.rows(), dst.data(), &src.write_events(), ©_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { check_opencl_error("from_matrix_cl", e); } return dst; } /** \ingroup opencl * Copies the source matrix that is stored on the OpenCL device to the * destination `std::vector` containing Eigen vectors. * * @tparam T_dst destination type * @tparam T scalar in the source matrix * @param src source matrix on the OpenCL device * @return `std::vector` containing Eigen vectors with a copy of the data in the * source matrix */ template * = nullptr, require_all_st_same* = nullptr> inline T_dst from_matrix_cl(const matrix_cl& src) { Eigen::Matrix tmp = from_matrix_cl(src); T_dst dst; dst.reserve(src.cols()); for (int i = 0; i < src.cols(); i++) { dst.emplace_back(tmp.col(i)); } return dst; } /** \ingroup opencl * Copies the source kernel generator expression or matrix that is stored on the * OpenCL device to the destination Eigen matrix. * * @tparam T source type * @param src expression or source matrix on the OpenCL device * @return Eigen matrix with a copy of the data in the source matrix */ template * = nullptr> auto from_matrix_cl(const T& src) { return from_matrix_cl< Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>>(src); } /** \ingroup opencl * Packs the square flat triangular matrix on the OpenCL device and * copies it to the std::vector. * * @param src the flat triangular source matrix on the OpenCL device * @return the packed std::vector * @throw std::invalid_argument if the matrix is not triangular */ template * = nullptr> inline auto packed_copy(const T& src) { check_triangular("packed_copy", "src", src); const int packed_size = src.rows() * (src.rows() + 1) / 2; using T_val = value_type_t; using T_not_bool = std::conditional_t::value, char, T_val>; std::vector dst(packed_size); if (dst.size() == 0) { return dst; } try { const cl::CommandQueue queue = opencl_context.queue(); matrix_cl packed(packed_size, 1); stan::math::opencl_kernels::pack(cl::NDRange(src.rows(), src.rows()), packed, src, src.rows(), src.rows(), src.view()); const std::vector mat_events = vec_concat(packed.read_write_events(), src.write_events()); cl::Event copy_event; queue.enqueueReadBuffer(packed.buffer(), opencl_context.in_order(), 0, sizeof(T_val) * packed_size, dst.data(), &mat_events, ©_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { check_opencl_error("packed_copy (OpenCL->std::vector)", e); } return dst; } /** \ingroup opencl * Copies the packed triangular matrix from the source std::vector to an OpenCL * buffer and unpacks it to a flat matrix on the OpenCL device. If a lvalue is * passed to this constructor the caller must make sure that it does not go out * of scope before copying is complete. * * That means `.wait()` must be called on the event associated on copying or * any other event that requires completion of this event. This can be done by * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on * returned matrix or any matrix that is calculated from that one. * * @tparam matrix_view the triangularity of the source matrix * @param src the packed source std::vector * @param rows the number of rows in the flat matrix * @return the destination flat matrix on the OpenCL device * @throw std::invalid_argument if the * size of the vector does not match the expected size * for the packed triangular matrix */ template , require_vector_vt* = nullptr> inline matrix_cl packed_copy(Vec&& src, int rows) { const int packed_size = rows * (rows + 1) / 2; check_size_match("copy (packed std::vector -> OpenCL)", "src.size()", src.size(), "rows * (rows + 1) / 2", packed_size); matrix_cl dst(rows, rows, matrix_view); if (dst.size() == 0) { return dst; } try { matrix_cl packed(packed_size, 1); cl::Event packed_event; const cl::CommandQueue queue = opencl_context.queue(); queue.enqueueWriteBuffer( packed.buffer(), opencl_context.in_order() || std::is_rvalue_reference::value, 0, sizeof(Vec_scalar) * packed_size, src.data(), nullptr, &packed_event); packed.add_write_event(packed_event); stan::math::opencl_kernels::unpack(cl::NDRange(dst.rows(), dst.rows()), dst, packed, dst.rows(), dst.rows(), matrix_view); } catch (const cl::Error& e) { check_opencl_error("packed_copy (std::vector->OpenCL)", e); } return dst; } /** \ingroup opencl * Copies the source matrix to the * destination matrix. Both matrices * are stored on the OpenCL device. * * @tparam T An arithmetic type to pass the value from the OpenCL matrix to. * @param src the source matrix * @return matrix_cl with copies of values in the source matrix * @throw std::invalid_argument if the * matrices do not have matching dimensions */ template * = nullptr> inline plain_type_t copy_cl(const T& src) { return plain_type_t(src); } } // namespace math } // namespace stan #endif #endif