1 #ifndef STAN_MATH_OPENCL_COPY_HPP
2 #define STAN_MATH_OPENCL_COPY_HPP
3 #ifdef STAN_OPENCL
4
5 #include <stan/math/opencl/buffer_types.hpp>
6 #include <stan/math/opencl/kernel_cl.hpp>
7 #include <stan/math/opencl/matrix_cl.hpp>
8 #include <stan/math/opencl/matrix_cl_view.hpp>
9 #include <stan/math/opencl/opencl_context.hpp>
10 #include <stan/math/opencl/value_type.hpp>
11 #include <stan/math/opencl/scalar_type.hpp>
12 #include <stan/math/opencl/kernels/pack.hpp>
13 #include <stan/math/opencl/kernels/unpack.hpp>
14 #include <stan/math/opencl/err/check_opencl.hpp>
15 #include <stan/math/opencl/err/check_triangular.hpp>
16 #include <stan/math/prim/meta.hpp>
17 #include <stan/math/prim/fun/Eigen.hpp>
18 #include <stan/math/prim/fun/vec_concat.hpp>
19
20 #include <CL/opencl.hpp>
21 #include <algorithm>
22 #include <iostream>
23 #include <type_traits>
24 #include <vector>
25
26 namespace stan {
27 namespace math {
28
29 /** \ingroup opencl
30 * Copies the source Eigen matrix, `std::vector` or scalar to the destination
31 * matrix that is stored on the OpenCL device. The function also accepts
32 * `matrix_cl`s in which case it just returns the argument. If a lvalue matrix
33 * is passed to this function the caller must make sure that the matrix does not
34 * go out of scope before copying is complete.
35 *
36 * That means `.wait()` must be called on the event associated on copying or
37 * any other event that requires completion of this event. This can be done by
38 * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on
39 * returned matrix or any matrix that is calculated from that one.
40 *
41 * @param src source Eigen matrix
42 * @return matrix_cl with a copy of the data in the source matrix
43 */
44 template <typename T, require_st_arithmetic<T>* = nullptr>
to_matrix_cl(T && src)45 inline matrix_cl<scalar_type_t<T>> to_matrix_cl(T&& src) {
46 return matrix_cl<scalar_type_t<T>>(std::forward<T>(src));
47 }
48
49 /** \ingroup opencl
50 * Copies the source matrix that is stored on the OpenCL device to the
51 * destination Eigen matrix.
52 *
53 * @tparam T_ret destination type
54 * @tparam T scalar in the source matrix
55 * @param src source matrix on the OpenCL device
56 * @return Eigen matrix with a copy of the data in the source matrix
57 */
58 template <typename T_ret, typename T, require_eigen_t<T_ret>* = nullptr,
59 require_matrix_cl_t<T>* = nullptr,
60 require_st_same<T_ret, T>* = nullptr>
from_matrix_cl(const T & src)61 inline auto from_matrix_cl(const T& src) {
62 using T_val = value_type_t<T>;
63 using T_ret_col_major
64 = Eigen::Matrix<scalar_type_t<T_ret>, T_ret::RowsAtCompileTime,
65 T_ret::ColsAtCompileTime>;
66 T_ret_col_major dst(src.rows(), src.cols());
67 if (src.size() == 0) {
68 return dst;
69 }
70 if ((src.view() == matrix_cl_view::Lower
71 || src.view() == matrix_cl_view::Upper)
72 && src.rows() == src.cols()) {
73 using T_not_bool
74 = std::conditional_t<std::is_same<T_val, bool>::value, char, T_val>;
75 std::vector<T_not_bool> packed = packed_copy(src);
76
77 size_t pos = 0;
78 if (src.view() == matrix_cl_view::Lower) {
79 for (int j = 0; j < src.cols(); ++j) {
80 for (int k = 0; k < j; ++k) {
81 dst.coeffRef(k, j) = 0;
82 }
83 for (int i = j; i < src.cols(); ++i) {
84 dst.coeffRef(i, j) = packed[pos++];
85 }
86 }
87 } else {
88 for (int j = 0; j < src.cols(); ++j) {
89 for (int i = 0; i <= j; ++i) {
90 dst.coeffRef(i, j) = packed[pos++];
91 }
92 for (int k = j + 1; k < src.cols(); ++k) {
93 dst.coeffRef(k, j) = 0;
94 }
95 }
96 }
97 } else {
98 try {
99 cl::Event copy_event;
100 const cl::CommandQueue queue = opencl_context.queue();
101 queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
102 sizeof(T_val) * dst.size(), dst.data(),
103 &src.write_events(), ©_event);
104 copy_event.wait();
105 src.clear_write_events();
106 } catch (const cl::Error& e) {
107 check_opencl_error("copy (OpenCL)->Eigen", e);
108 }
109 if (!contains_nonzero(src.view(), matrix_cl_view::Lower)) {
110 dst.template triangularView<Eigen::StrictlyLower>()
111 = T_ret_col_major::Zero(dst.rows(), dst.cols());
112 }
113 if (!contains_nonzero(src.view(), matrix_cl_view::Upper)) {
114 dst.template triangularView<Eigen::StrictlyUpper>()
115 = T_ret_col_major::Zero(dst.rows(), dst.cols());
116 }
117 }
118 return dst;
119 }
120
121 /** \ingroup opencl
122 * Copies result of a kernel generator expression to the specified
123 * destination type.
124 *
125 * @tparam T_ret destination type
126 * @tparam T type of expression
127 * @param src source expression
128 * @return Eigen matrix with a copy of the data in the source matrix
129 */
130 template <typename T_ret, typename T,
131 require_all_kernel_expressions_t<T>* = nullptr,
132 require_not_matrix_cl_t<T>* = nullptr>
from_matrix_cl(const T & src)133 inline auto from_matrix_cl(const T& src) {
134 return from_matrix_cl<T_ret>(src.eval());
135 }
136
137 /** \ingroup opencl
138 * Copy A 1 by 1 source matrix from the Device to the host.
139 * @tparam T An arithmetic type to pass the value from the OpenCL matrix to.
140 * @param src A 1x1 matrix on the device.
141 * @return dst Arithmetic to receive the matrix_cl value.
142 */
143 template <typename T_dst, typename T, require_arithmetic_t<T>* = nullptr,
144 require_same_t<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)145 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
146 T dst;
147 check_size_match("from_matrix_cl<scalar>", "src.rows()", src.rows(),
148 "dst.rows()", 1);
149 check_size_match("from_matrix_cl<scalar>", "src.cols()", src.cols(),
150 "dst.cols()", 1);
151 try {
152 cl::Event copy_event;
153 const cl::CommandQueue queue = opencl_context.queue();
154 queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
155 sizeof(T), &dst, &src.write_events(), ©_event);
156 copy_event.wait();
157 src.clear_write_events();
158 } catch (const cl::Error& e) {
159 check_opencl_error("from_matrix_cl<scalar>", e);
160 }
161 return dst;
162 }
163
164 /** \ingroup opencl
165 * Copies the source matrix that is stored on the OpenCL device to the
166 * destination `std::vector`.
167 *
168 * @tparam T_dst destination type
169 * @tparam T scalar in the source matrix
170 * @param src source matrix on the OpenCL device
171 * @return `std::vector` with a copy of the data in the source matrix
172 */
173 template <typename T_dst, typename T,
174 require_std_vector_vt<std::is_arithmetic, T_dst>* = nullptr,
175 require_all_st_same<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)176 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
177 check_size_match("from_matrix_cl<std::vector>", "src.cols()", src.cols(),
178 "dst.cols()", 1);
179 T_dst dst(src.rows());
180 if (src.rows() == 0) {
181 return dst;
182 }
183 try {
184 cl::Event copy_event;
185 const cl::CommandQueue queue = opencl_context.queue();
186 queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
187 sizeof(T) * src.rows(), dst.data(),
188 &src.write_events(), ©_event);
189 copy_event.wait();
190 src.clear_write_events();
191 } catch (const cl::Error& e) {
192 check_opencl_error("from_matrix_cl<std::vector>", e);
193 }
194 return dst;
195 }
196
197 /** \ingroup opencl
198 * Copies the source matrix that is stored on the OpenCL device to the
199 * destination `std::vector` containing Eigen vectors.
200 *
201 * @tparam T_dst destination type
202 * @tparam T scalar in the source matrix
203 * @param src source matrix on the OpenCL device
204 * @return `std::vector` containing Eigen vectors with a copy of the data in the
205 * source matrix
206 */
207 template <typename T_dst, typename T,
208 require_std_vector_vt<is_eigen_vector, T_dst>* = nullptr,
209 require_all_st_same<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)210 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
211 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> tmp = from_matrix_cl(src);
212 T_dst dst;
213 dst.reserve(src.cols());
214 for (int i = 0; i < src.cols(); i++) {
215 dst.emplace_back(tmp.col(i));
216 }
217 return dst;
218 }
219
220 /** \ingroup opencl
221 * Copies the source kernel generator expression or matrix that is stored on the
222 * OpenCL device to the destination Eigen matrix.
223 *
224 * @tparam T source type
225 * @param src expression or source matrix on the OpenCL device
226 * @return Eigen matrix with a copy of the data in the source matrix
227 */
228 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
from_matrix_cl(const T & src)229 auto from_matrix_cl(const T& src) {
230 return from_matrix_cl<
231 Eigen::Matrix<scalar_type_t<T>, Eigen::Dynamic, Eigen::Dynamic>>(src);
232 }
233
234 /** \ingroup opencl
235 * Packs the square flat triangular matrix on the OpenCL device and
236 * copies it to the std::vector.
237 *
238 * @param src the flat triangular source matrix on the OpenCL device
239 * @return the packed std::vector
240 * @throw <code>std::invalid_argument</code> if the matrix is not triangular
241 */
242 template <typename T, require_matrix_cl_t<T>* = nullptr>
packed_copy(const T & src)243 inline auto packed_copy(const T& src) {
244 check_triangular("packed_copy", "src", src);
245 const int packed_size = src.rows() * (src.rows() + 1) / 2;
246 using T_val = value_type_t<T>;
247 using T_not_bool
248 = std::conditional_t<std::is_same<T_val, bool>::value, char, T_val>;
249 std::vector<T_not_bool> dst(packed_size);
250 if (dst.size() == 0) {
251 return dst;
252 }
253 try {
254 const cl::CommandQueue queue = opencl_context.queue();
255 matrix_cl<T_val> packed(packed_size, 1);
256 stan::math::opencl_kernels::pack(cl::NDRange(src.rows(), src.rows()),
257 packed, src, src.rows(), src.rows(),
258 src.view());
259 const std::vector<cl::Event> mat_events
260 = vec_concat(packed.read_write_events(), src.write_events());
261 cl::Event copy_event;
262 queue.enqueueReadBuffer(packed.buffer(), opencl_context.in_order(), 0,
263 sizeof(T_val) * packed_size, dst.data(),
264 &mat_events, ©_event);
265 copy_event.wait();
266 src.clear_write_events();
267 } catch (const cl::Error& e) {
268 check_opencl_error("packed_copy (OpenCL->std::vector)", e);
269 }
270 return dst;
271 }
272
273 /** \ingroup opencl
274 * Copies the packed triangular matrix from the source std::vector to an OpenCL
275 * buffer and unpacks it to a flat matrix on the OpenCL device. If a lvalue is
276 * passed to this constructor the caller must make sure that it does not go out
277 * of scope before copying is complete.
278 *
279 * That means `.wait()` must be called on the event associated on copying or
280 * any other event that requires completion of this event. This can be done by
281 * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on
282 * returned matrix or any matrix that is calculated from that one.
283 *
284 * @tparam matrix_view the triangularity of the source matrix
285 * @param src the packed source std::vector
286 * @param rows the number of rows in the flat matrix
287 * @return the destination flat matrix on the OpenCL device
288 * @throw <code>std::invalid_argument</code> if the
289 * size of the vector does not match the expected size
290 * for the packed triangular matrix
291 */
292 template <matrix_cl_view matrix_view, typename Vec,
293 typename Vec_scalar = scalar_type_t<Vec>,
294 require_vector_vt<std::is_arithmetic, Vec>* = nullptr>
packed_copy(Vec && src,int rows)295 inline matrix_cl<Vec_scalar> packed_copy(Vec&& src, int rows) {
296 const int packed_size = rows * (rows + 1) / 2;
297 check_size_match("copy (packed std::vector -> OpenCL)", "src.size()",
298 src.size(), "rows * (rows + 1) / 2", packed_size);
299 matrix_cl<Vec_scalar> dst(rows, rows, matrix_view);
300 if (dst.size() == 0) {
301 return dst;
302 }
303 try {
304 matrix_cl<Vec_scalar> packed(packed_size, 1);
305 cl::Event packed_event;
306 const cl::CommandQueue queue = opencl_context.queue();
307 queue.enqueueWriteBuffer(
308 packed.buffer(),
309 opencl_context.in_order() || std::is_rvalue_reference<Vec&&>::value, 0,
310 sizeof(Vec_scalar) * packed_size, src.data(), nullptr, &packed_event);
311 packed.add_write_event(packed_event);
312 stan::math::opencl_kernels::unpack(cl::NDRange(dst.rows(), dst.rows()), dst,
313 packed, dst.rows(), dst.rows(),
314 matrix_view);
315 } catch (const cl::Error& e) {
316 check_opencl_error("packed_copy (std::vector->OpenCL)", e);
317 }
318 return dst;
319 }
320
321 /** \ingroup opencl
322 * Copies the source matrix to the
323 * destination matrix. Both matrices
324 * are stored on the OpenCL device.
325 *
326 * @tparam T An arithmetic type to pass the value from the OpenCL matrix to.
327 * @param src the source matrix
328 * @return matrix_cl with copies of values in the source matrix
329 * @throw <code>std::invalid_argument</code> if the
330 * matrices do not have matching dimensions
331 */
332 template <typename T, require_matrix_cl_t<T>* = nullptr>
copy_cl(const T & src)333 inline plain_type_t<T> copy_cl(const T& src) {
334 return plain_type_t<T>(src);
335 }
336
337 } // namespace math
338 } // namespace stan
339 #endif
340 #endif
341