1 #ifndef STAN_MATH_OPENCL_COPY_HPP
2 #define STAN_MATH_OPENCL_COPY_HPP
3 #ifdef STAN_OPENCL
4 
5 #include <stan/math/opencl/buffer_types.hpp>
6 #include <stan/math/opencl/kernel_cl.hpp>
7 #include <stan/math/opencl/matrix_cl.hpp>
8 #include <stan/math/opencl/matrix_cl_view.hpp>
9 #include <stan/math/opencl/opencl_context.hpp>
10 #include <stan/math/opencl/value_type.hpp>
11 #include <stan/math/opencl/scalar_type.hpp>
12 #include <stan/math/opencl/kernels/pack.hpp>
13 #include <stan/math/opencl/kernels/unpack.hpp>
14 #include <stan/math/opencl/err/check_opencl.hpp>
15 #include <stan/math/opencl/err/check_triangular.hpp>
16 #include <stan/math/prim/meta.hpp>
17 #include <stan/math/prim/fun/Eigen.hpp>
18 #include <stan/math/prim/fun/vec_concat.hpp>
19 
20 #include <CL/opencl.hpp>
21 #include <algorithm>
22 #include <iostream>
23 #include <type_traits>
24 #include <vector>
25 
26 namespace stan {
27 namespace math {
28 
29 /** \ingroup opencl
30  * Copies the source Eigen matrix, `std::vector` or scalar to the destination
31  * matrix that is stored on the OpenCL device. The function also accepts
32  * `matrix_cl`s in which case it just returns the argument. If a lvalue matrix
33  * is passed to this function the caller must make sure that the matrix does not
34  * go out of scope before copying is complete.
35  *
36  * That means `.wait()` must be called on the event associated on copying or
37  * any other event that requires completion of this event. This can be done by
38  * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on
39  * returned matrix or any matrix that is calculated from that one.
40  *
41  * @param src source Eigen matrix
42  * @return matrix_cl with a copy of the data in the source matrix
43  */
44 template <typename T, require_st_arithmetic<T>* = nullptr>
to_matrix_cl(T && src)45 inline matrix_cl<scalar_type_t<T>> to_matrix_cl(T&& src) {
46   return matrix_cl<scalar_type_t<T>>(std::forward<T>(src));
47 }
48 
49 /** \ingroup opencl
50  * Copies the source matrix that is stored on the OpenCL device to the
51  * destination Eigen matrix.
52  *
53  * @tparam T_ret destination type
54  * @tparam T scalar in the source matrix
55  * @param src source matrix on the OpenCL device
56  * @return Eigen matrix with a copy of the data in the source matrix
57  */
58 template <typename T_ret, typename T, require_eigen_t<T_ret>* = nullptr,
59           require_matrix_cl_t<T>* = nullptr,
60           require_st_same<T_ret, T>* = nullptr>
from_matrix_cl(const T & src)61 inline auto from_matrix_cl(const T& src) {
62   using T_val = value_type_t<T>;
63   using T_ret_col_major
64       = Eigen::Matrix<scalar_type_t<T_ret>, T_ret::RowsAtCompileTime,
65                       T_ret::ColsAtCompileTime>;
66   T_ret_col_major dst(src.rows(), src.cols());
67   if (src.size() == 0) {
68     return dst;
69   }
70   if ((src.view() == matrix_cl_view::Lower
71        || src.view() == matrix_cl_view::Upper)
72       && src.rows() == src.cols()) {
73     using T_not_bool
74         = std::conditional_t<std::is_same<T_val, bool>::value, char, T_val>;
75     std::vector<T_not_bool> packed = packed_copy(src);
76 
77     size_t pos = 0;
78     if (src.view() == matrix_cl_view::Lower) {
79       for (int j = 0; j < src.cols(); ++j) {
80         for (int k = 0; k < j; ++k) {
81           dst.coeffRef(k, j) = 0;
82         }
83         for (int i = j; i < src.cols(); ++i) {
84           dst.coeffRef(i, j) = packed[pos++];
85         }
86       }
87     } else {
88       for (int j = 0; j < src.cols(); ++j) {
89         for (int i = 0; i <= j; ++i) {
90           dst.coeffRef(i, j) = packed[pos++];
91         }
92         for (int k = j + 1; k < src.cols(); ++k) {
93           dst.coeffRef(k, j) = 0;
94         }
95       }
96     }
97   } else {
98     try {
99       cl::Event copy_event;
100       const cl::CommandQueue queue = opencl_context.queue();
101       queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
102                               sizeof(T_val) * dst.size(), dst.data(),
103                               &src.write_events(), &copy_event);
104       copy_event.wait();
105       src.clear_write_events();
106     } catch (const cl::Error& e) {
107       check_opencl_error("copy (OpenCL)->Eigen", e);
108     }
109     if (!contains_nonzero(src.view(), matrix_cl_view::Lower)) {
110       dst.template triangularView<Eigen::StrictlyLower>()
111           = T_ret_col_major::Zero(dst.rows(), dst.cols());
112     }
113     if (!contains_nonzero(src.view(), matrix_cl_view::Upper)) {
114       dst.template triangularView<Eigen::StrictlyUpper>()
115           = T_ret_col_major::Zero(dst.rows(), dst.cols());
116     }
117   }
118   return dst;
119 }
120 
121 /** \ingroup opencl
122  * Copies result of a kernel generator expression to the specified
123  * destination type.
124  *
125  * @tparam T_ret destination type
126  * @tparam T type of expression
127  * @param src source expression
128  * @return Eigen matrix with a copy of the data in the source matrix
129  */
130 template <typename T_ret, typename T,
131           require_all_kernel_expressions_t<T>* = nullptr,
132           require_not_matrix_cl_t<T>* = nullptr>
from_matrix_cl(const T & src)133 inline auto from_matrix_cl(const T& src) {
134   return from_matrix_cl<T_ret>(src.eval());
135 }
136 
137 /** \ingroup opencl
138  * Copy A 1 by 1 source matrix from the Device to the host.
139  * @tparam T An arithmetic type to pass the value from the OpenCL matrix to.
140  * @param src A 1x1 matrix on the device.
141  * @return dst Arithmetic to receive the matrix_cl value.
142  */
143 template <typename T_dst, typename T, require_arithmetic_t<T>* = nullptr,
144           require_same_t<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)145 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
146   T dst;
147   check_size_match("from_matrix_cl<scalar>", "src.rows()", src.rows(),
148                    "dst.rows()", 1);
149   check_size_match("from_matrix_cl<scalar>", "src.cols()", src.cols(),
150                    "dst.cols()", 1);
151   try {
152     cl::Event copy_event;
153     const cl::CommandQueue queue = opencl_context.queue();
154     queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
155                             sizeof(T), &dst, &src.write_events(), &copy_event);
156     copy_event.wait();
157     src.clear_write_events();
158   } catch (const cl::Error& e) {
159     check_opencl_error("from_matrix_cl<scalar>", e);
160   }
161   return dst;
162 }
163 
164 /** \ingroup opencl
165  * Copies the source matrix that is stored on the OpenCL device to the
166  * destination `std::vector`.
167  *
168  * @tparam T_dst destination type
169  * @tparam T scalar in the source matrix
170  * @param src source matrix on the OpenCL device
171  * @return `std::vector` with a copy of the data in the source matrix
172  */
173 template <typename T_dst, typename T,
174           require_std_vector_vt<std::is_arithmetic, T_dst>* = nullptr,
175           require_all_st_same<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)176 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
177   check_size_match("from_matrix_cl<std::vector>", "src.cols()", src.cols(),
178                    "dst.cols()", 1);
179   T_dst dst(src.rows());
180   if (src.rows() == 0) {
181     return dst;
182   }
183   try {
184     cl::Event copy_event;
185     const cl::CommandQueue queue = opencl_context.queue();
186     queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
187                             sizeof(T) * src.rows(), dst.data(),
188                             &src.write_events(), &copy_event);
189     copy_event.wait();
190     src.clear_write_events();
191   } catch (const cl::Error& e) {
192     check_opencl_error("from_matrix_cl<std::vector>", e);
193   }
194   return dst;
195 }
196 
197 /** \ingroup opencl
198  * Copies the source matrix that is stored on the OpenCL device to the
199  * destination `std::vector` containing Eigen vectors.
200  *
201  * @tparam T_dst destination type
202  * @tparam T scalar in the source matrix
203  * @param src source matrix on the OpenCL device
204  * @return `std::vector` containing Eigen vectors with a copy of the data in the
205  * source matrix
206  */
207 template <typename T_dst, typename T,
208           require_std_vector_vt<is_eigen_vector, T_dst>* = nullptr,
209           require_all_st_same<T_dst, T>* = nullptr>
from_matrix_cl(const matrix_cl<T> & src)210 inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
211   Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> tmp = from_matrix_cl(src);
212   T_dst dst;
213   dst.reserve(src.cols());
214   for (int i = 0; i < src.cols(); i++) {
215     dst.emplace_back(tmp.col(i));
216   }
217   return dst;
218 }
219 
220 /** \ingroup opencl
221  * Copies the source kernel generator expression or matrix that is stored on the
222  * OpenCL device to the destination Eigen matrix.
223  *
224  * @tparam T source type
225  * @param src expression or source matrix on the OpenCL device
226  * @return Eigen matrix with a copy of the data in the source matrix
227  */
228 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
from_matrix_cl(const T & src)229 auto from_matrix_cl(const T& src) {
230   return from_matrix_cl<
231       Eigen::Matrix<scalar_type_t<T>, Eigen::Dynamic, Eigen::Dynamic>>(src);
232 }
233 
234 /** \ingroup opencl
235  * Packs the square flat triangular matrix on the OpenCL device and
236  * copies it to the std::vector.
237  *
238  * @param src the flat triangular source matrix on the OpenCL device
239  * @return the packed std::vector
240  * @throw <code>std::invalid_argument</code> if the matrix is not triangular
241  */
242 template <typename T, require_matrix_cl_t<T>* = nullptr>
packed_copy(const T & src)243 inline auto packed_copy(const T& src) {
244   check_triangular("packed_copy", "src", src);
245   const int packed_size = src.rows() * (src.rows() + 1) / 2;
246   using T_val = value_type_t<T>;
247   using T_not_bool
248       = std::conditional_t<std::is_same<T_val, bool>::value, char, T_val>;
249   std::vector<T_not_bool> dst(packed_size);
250   if (dst.size() == 0) {
251     return dst;
252   }
253   try {
254     const cl::CommandQueue queue = opencl_context.queue();
255     matrix_cl<T_val> packed(packed_size, 1);
256     stan::math::opencl_kernels::pack(cl::NDRange(src.rows(), src.rows()),
257                                      packed, src, src.rows(), src.rows(),
258                                      src.view());
259     const std::vector<cl::Event> mat_events
260         = vec_concat(packed.read_write_events(), src.write_events());
261     cl::Event copy_event;
262     queue.enqueueReadBuffer(packed.buffer(), opencl_context.in_order(), 0,
263                             sizeof(T_val) * packed_size, dst.data(),
264                             &mat_events, &copy_event);
265     copy_event.wait();
266     src.clear_write_events();
267   } catch (const cl::Error& e) {
268     check_opencl_error("packed_copy (OpenCL->std::vector)", e);
269   }
270   return dst;
271 }
272 
273 /** \ingroup opencl
274  * Copies the packed triangular matrix from the source std::vector to an OpenCL
275  * buffer and unpacks it to a flat matrix on the OpenCL device. If a lvalue is
276  * passed to this constructor the caller must make sure that it does not go out
277  * of scope before copying is complete.
278  *
279  * That means `.wait()` must be called on the event associated on copying or
280  * any other event that requires completion of this event. This can be done by
281  * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on
282  * returned matrix or any matrix that is calculated from that one.
283  *
284  * @tparam matrix_view the triangularity of the source matrix
285  * @param src the packed source std::vector
286  * @param rows the number of rows in the flat matrix
287  * @return the destination flat matrix on the OpenCL device
288  * @throw <code>std::invalid_argument</code> if the
289  * size of the vector does not match the expected size
290  * for the packed triangular matrix
291  */
292 template <matrix_cl_view matrix_view, typename Vec,
293           typename Vec_scalar = scalar_type_t<Vec>,
294           require_vector_vt<std::is_arithmetic, Vec>* = nullptr>
packed_copy(Vec && src,int rows)295 inline matrix_cl<Vec_scalar> packed_copy(Vec&& src, int rows) {
296   const int packed_size = rows * (rows + 1) / 2;
297   check_size_match("copy (packed std::vector -> OpenCL)", "src.size()",
298                    src.size(), "rows * (rows + 1) / 2", packed_size);
299   matrix_cl<Vec_scalar> dst(rows, rows, matrix_view);
300   if (dst.size() == 0) {
301     return dst;
302   }
303   try {
304     matrix_cl<Vec_scalar> packed(packed_size, 1);
305     cl::Event packed_event;
306     const cl::CommandQueue queue = opencl_context.queue();
307     queue.enqueueWriteBuffer(
308         packed.buffer(),
309         opencl_context.in_order() || std::is_rvalue_reference<Vec&&>::value, 0,
310         sizeof(Vec_scalar) * packed_size, src.data(), nullptr, &packed_event);
311     packed.add_write_event(packed_event);
312     stan::math::opencl_kernels::unpack(cl::NDRange(dst.rows(), dst.rows()), dst,
313                                        packed, dst.rows(), dst.rows(),
314                                        matrix_view);
315   } catch (const cl::Error& e) {
316     check_opencl_error("packed_copy (std::vector->OpenCL)", e);
317   }
318   return dst;
319 }
320 
321 /** \ingroup opencl
322  * Copies the source matrix to the
323  * destination matrix. Both matrices
324  * are stored on the OpenCL device.
325  *
326  * @tparam T An arithmetic type to pass the value from the OpenCL matrix to.
327  * @param src the source matrix
328  * @return matrix_cl with copies of values in the source matrix
329  * @throw <code>std::invalid_argument</code> if the
330  * matrices do not have matching dimensions
331  */
332 template <typename T, require_matrix_cl_t<T>* = nullptr>
copy_cl(const T & src)333 inline plain_type_t<T> copy_cl(const T& src) {
334   return plain_type_t<T>(src);
335 }
336 
337 }  // namespace math
338 }  // namespace stan
339 #endif
340 #endif
341