1 #ifndef STAN_MATH_OPENCL_MATRIX_CL_HPP
2 #define STAN_MATH_OPENCL_MATRIX_CL_HPP
3 #ifdef STAN_OPENCL
4 
5 #include <stan/math/opencl/prim/size.hpp>
6 #include <stan/math/opencl/err/check_opencl.hpp>
7 #include <stan/math/prim/err/check_size_match.hpp>
8 #include <stan/math/opencl/opencl_context.hpp>
9 #include <stan/math/opencl/ref_type_for_opencl.hpp>
10 #include <stan/math/opencl/matrix_cl_view.hpp>
11 #include <stan/math/prim/meta.hpp>
12 #include <stan/math/prim/fun/Eigen.hpp>
13 #include <stan/math/prim/fun/vec_concat.hpp>
14 #include <CL/opencl.hpp>
15 #include <algorithm>
16 #include <iostream>
17 #include <string>
18 #include <type_traits>
19 #include <vector>
20 
21 /** \ingroup opencl
22  *  \defgroup matrix_cl_group Matrix
23  * The matrix_cl class - allocates memory space on the OpenCL device. Operations
24  * on `matrix_cl` types are executed lazily via the kernel generator
25  * and async routines.
26  */
27 namespace stan {
28 namespace math {
29 
30 /** \addtogroup matrix_cl_group
31  *  @{
32  */
33 
34 // forward declare
35 template <typename T>
36 class arena_matrix_cl;
37 
38 template <typename>
39 class matrix_cl;
40 
41 /**
42  * Represents an arithmetic matrix on the OpenCL device.
43  * @tparam T an arithmetic type for the type stored in the OpenCL buffer.
44  */
45 template <typename T>
46 class matrix_cl : public matrix_cl_base {
47  private:
48   cl::Buffer buffer_cl_;  // Holds the allocated memory on the device
49   int rows_{0};           // Number of rows.
50   int cols_{0};           // Number of columns.
51   // Holds info on if matrix is a special type
52   matrix_cl_view view_{matrix_cl_view::Entire};
53   mutable std::vector<cl::Event> write_events_;  // Tracks write jobs
54   mutable std::vector<cl::Event> read_events_;   // Tracks reads
55 
56  public:
57   using Scalar = T;  // Underlying type of the matrix
58   using type = T;    // Underlying type of the matrix
59   // Forward declare the methods that work in place on the matrix
60   template <matrix_cl_view matrix_view = matrix_cl_view::Entire>
61   inline void zeros_strict_tri();
62 
rows() const63   int rows() const { return rows_; }
64 
cols() const65   int cols() const { return cols_; }
66 
size() const67   int size() const { return rows_ * cols_; }
68 
view() const69   const matrix_cl_view& view() const { return view_; }
70 
view(const matrix_cl_view & view)71   void view(const matrix_cl_view& view) { view_ = view; }
72 
73   /**
74    * Clear the write events from the event stacks.
75    */
clear_write_events() const76   inline void clear_write_events() const {
77     write_events_.clear();
78     return;
79   }
80 
81   /**
82    * Clear the read events from the event stacks.
83    */
clear_read_events() const84   inline void clear_read_events() const {
85     read_events_.clear();
86     return;
87   }
88 
89   /**
90    * Clear the write events from the event stacks.
91    */
clear_read_write_events() const92   inline void clear_read_write_events() const {
93     read_events_.clear();
94     write_events_.clear();
95     return;
96   }
97 
98   /**
99    * Get the events from the event stacks.
100    * @return The write event stack.
101    */
write_events() const102   inline const std::vector<cl::Event>& write_events() const {
103     return write_events_;
104   }
105 
106   /**
107    * Get the events from the event stacks.
108    * @return The read/write event stack.
109    */
read_events() const110   inline const std::vector<cl::Event>& read_events() const {
111     return read_events_;
112   }
113 
114   /**
115    * Get the events from the event stacks.
116    * @return The read/write event stack.
117    */
read_write_events() const118   inline const std::vector<cl::Event> read_write_events() const {
119     return vec_concat(this->read_events(), this->write_events());
120   }
121 
122   /**
123    * Add an event to the read event stack.
124    * @param new_event The event to be pushed on the event stack.
125    */
add_read_event(cl::Event new_event) const126   inline void add_read_event(cl::Event new_event) const {
127     this->read_events_.push_back(new_event);
128   }
129 
130   /**
131    * Add an event to the write event stack.
132    * @param new_event The event to be pushed on the event stack.
133    */
add_write_event(cl::Event new_event) const134   inline void add_write_event(cl::Event new_event) const {
135     this->write_events_.push_back(new_event);
136   }
137 
138   /**
139    * Add an event to the read/write event stack.
140    * @param new_event The event to be pushed on the event stack.
141    */
add_read_write_event(cl::Event new_event) const142   inline void add_read_write_event(cl::Event new_event) const {
143     this->read_events_.push_back(new_event);
144     this->write_events_.push_back(new_event);
145   }
146 
147   /**
148    * Waits for the write events and clears the read event stack.
149    */
wait_for_write_events() const150   inline void wait_for_write_events() const {
151     for (cl::Event e : write_events_) {
152       e.wait();
153     }
154     write_events_.clear();
155   }
156 
157   /**
158    * Waits for the read events and clears the read event stack.
159    */
wait_for_read_events() const160   inline void wait_for_read_events() const {
161     for (cl::Event e : read_events_) {
162       e.wait();
163     }
164     read_events_.clear();
165   }
166 
167   /**
168    * Waits for read and write events to finish and clears the read, write, and
169    * read/write event stacks.
170    */
wait_for_read_write_events() const171   inline void wait_for_read_write_events() const {
172     wait_for_read_events();
173     wait_for_write_events();
174   }
175 
buffer() const176   const cl::Buffer& buffer() const { return buffer_cl_; }
buffer()177   cl::Buffer& buffer() { return buffer_cl_; }
178 
matrix_cl()179   matrix_cl() {}
180   /**
181    * Construct a matrix_cl<T> from an existing cl::Buffer object. The matrix
182    * directly uses given buffer - no copying is done.
183    *
184    * @param A the cl::Buffer object to construct the matrix from
185    * @param R number of rows
186    * @param C number of columns
187    * @param partial_view view of the matrix
188    */
matrix_cl(const cl::Buffer & A,const int R,const int C,matrix_cl_view partial_view=matrix_cl_view::Entire)189   matrix_cl(const cl::Buffer& A, const int R, const int C,
190             matrix_cl_view partial_view = matrix_cl_view::Entire)
191       : buffer_cl_(A), rows_(R), cols_(C), view_(partial_view) {}
192 
193   /**
194    * Copy constructor.
195    * @param A matrix_cl to copy
196    */
matrix_cl(const matrix_cl<T> & A)197   matrix_cl(const matrix_cl<T>& A)
198       : rows_(A.rows()), cols_(A.cols()), view_(A.view()) {
199     if (A.size() == 0) {
200       return;
201     }
202     buffer_cl_ = cl::Buffer(opencl_context.context(), CL_MEM_READ_WRITE,
203                             sizeof(T) * this->size());
204     initialize_buffer_cl(A);
205   }
206 
207   /**
208    * Move constructor.
209    * @param A matrix_cl to move
210    */
matrix_cl(matrix_cl<T> && A)211   matrix_cl(matrix_cl<T>&& A)
212       : buffer_cl_(std::move(A.buffer_cl_)),
213         rows_(A.rows_),
214         cols_(A.cols_),
215         view_(A.view_),
216         write_events_(std::move(A.write_events_)),
217         read_events_(std::move(A.read_events_)) {}
218 
219   /**
220    * Constructor from `arena_matrix_cl`.
221    * @param A `arena_matrix_cl` to construct this matrix from
222    */
223   // defined in rev/arena_matrix_cl.hpp
224   matrix_cl(const arena_matrix_cl<T>& A);  // NOLINT(runtime/explicit)
225 
226   /**
227    * Constructor for the matrix_cl that creates a copy of a std::vector of Eigen
228    * matrices on the OpenCL device. Each matrix is flattened into one column
229    * of the resulting matrix_cl. If a lvalue is passed to this constructor the
230    * caller must make sure that the vector does not go out of scope before
231    * copying is complete.
232    *
233    * That means `.wait()` must be called on the event associated on copying or
234    * any other event that requires completion of this event. This can be done by
235    * calling `.wait_for_write_events()` or `.wait_for_read_write_events()` on
236    * this matrix or any matrix that is calculated from this one.
237    *
238    * @param A the vector of Eigen matrices
239    *
240    * @throw <code>std::invalid_argument</code> if the
241    * matrices do not have matching dimensions
242    * @throw <code>std::system_error</code> if the memory on the device could not
243    * be allocated
244    */
245   template <typename Vec, require_std_vector_vt<is_eigen, Vec>* = nullptr,
246             require_st_same<Vec, T>* = nullptr>
matrix_cl(Vec && A)247   explicit matrix_cl(Vec&& A) try : rows_(A.empty() ? 0 : A[0].size()),
248                                     cols_(A.size()) {
249     if (this->size() == 0) {
250       return;
251     }
252     cl::Context& ctx = opencl_context.context();
253     cl::CommandQueue& queue = opencl_context.queue();
254     buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
255     for (int i = 0, offset_size = 0; i < cols_; i++, offset_size += rows_) {
256       check_size_match("matrix constructor", "input rows", A[i].size(),
257                        "matrix_cl rows", rows_);
258       cl::Event write_event;
259       queue.enqueueWriteBuffer(
260           buffer_cl_,
261           opencl_context.in_order() || std::is_rvalue_reference<Vec&&>::value,
262           sizeof(T) * offset_size, sizeof(T) * rows_, A[i].data(), nullptr,
263           &write_event);
264       this->add_write_event(write_event);
265     }
266   } catch (const cl::Error& e) {
267     check_opencl_error("matrix constructor", e);
268   }
269 
270   /**
271    * Constructor for the matrix_cl that
272    * only allocates the buffer on the OpenCL device.
273    * Regardless of `partial_view`, whole matrix is stored.
274    *
275    * @param rows number of matrix rows, must be greater or equal to 0
276    * @param cols number of matrix columns, must be greater or equal to 0
277    * @param partial_view which part of the matrix is used
278    *
279    * @throw <code>std::system_error</code> if the memory on the device could not
280    * be allocated
281    *
282    */
matrix_cl(const int rows,const int cols,matrix_cl_view partial_view=matrix_cl_view::Entire)283   matrix_cl(const int rows, const int cols,
284             matrix_cl_view partial_view = matrix_cl_view::Entire)
285       : rows_(rows), cols_(cols), view_(partial_view) {
286     if (size() == 0) {
287       return;
288     }
289     cl::Context& ctx = opencl_context.context();
290     try {
291       int flags = CL_MEM_READ_WRITE;
292       if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
293         flags |= CL_MEM_ALLOC_HOST_PTR;
294       }
295       buffer_cl_ = cl::Buffer(ctx, flags, sizeof(T) * rows_ * cols_);
296     } catch (const cl::Error& e) {
297       check_opencl_error("matrix constructor", e);
298     }
299   }
300 
301   /**
302    * Constructor for the matrix_cl that creates a copy of the Eigen matrix or
303    * Eigen expression on the OpenCL device. Regardless of `partial_view`, whole
304    * matrix is stored.
305    *
306    * If a lvalue matrix or a map is passed to this constructor, it might be
307    * directly used by the device. The caller must make sure that the matrix (map
308    * data) does not go out of scope as long as this `matrix_cl` is in use
309    * (`std::move`-ing it or using raw `buffer()` also counts as in use).
310    *
311    * @tparam Mat type of \c Eigen \c Matrix or expression
312    * @param A the \c Eigen \c Matrix or expression
313    * @param partial_view which part of the matrix is used
314    *
315    * @throw <code>std::system_error</code> if the memory on the device could not
316    * be allocated
317    */
318   template <typename Mat, require_eigen_t<Mat>* = nullptr,
319             require_vt_same<Mat, T>* = nullptr>
matrix_cl(Mat && A,matrix_cl_view partial_view=matrix_cl_view::Entire)320   explicit matrix_cl(Mat&& A,
321                      matrix_cl_view partial_view = matrix_cl_view::Entire)
322       : rows_(A.rows()), cols_(A.cols()), view_(partial_view) {
323     using Mat_type = std::decay_t<ref_type_for_opencl_t<Mat>>;
324     if (size() == 0) {
325       return;
326     }
327     initialize_buffer_no_heap_if<
328         std::is_same<std::decay_t<Mat>, Mat_type>::value
329         && (std::is_lvalue_reference<Mat>::value
330             || is_eigen_contiguous_map<Mat>::value)>(A);
331   }
332 
333   /**
334    * Constructor for the matrix_cl that creates a copy of a scalar on the OpenCL
335    * device. Regardless of `partial_view`, whole matrix is stored.
336    *
337    * If a lvalue is passed to this constructor, it might be directly used by the
338    * device. The caller must make sure that it does not go out of scope as long
339    * as this `matrix_cl` is in use
340    * (`std::move`-ing it or using raw `buffer()` also counts as in use).
341    *
342    * @param A the scalar
343    * @param partial_view which part of the matrix is used
344    *
345    * @throw <code>std::system_error</code> if the memory on the device could not
346    * be allocated
347    */
348   template <typename Scal,
349             typename = require_same_t<T, std::remove_reference_t<Scal>>>
matrix_cl(Scal && A,matrix_cl_view partial_view=matrix_cl_view::Diagonal)350   explicit matrix_cl(Scal&& A,
351                      matrix_cl_view partial_view = matrix_cl_view::Diagonal)
352       : rows_(1), cols_(1), view_(partial_view) {
353     initialize_buffer<std::is_rvalue_reference<Scal&&>::value>(
354         const_cast<const std::decay_t<Scal>*>(&A));
355   }
356 
357   /**
358    * Construct a matrix_cl of size Nx1 from \c std::vector.
359    *
360    * If a lvalue is passed to this constructor, it might be directly used by the
361    * device. The caller must make sure that it does not go out of scope as long
362    * as this `matrix_cl` is in use
363    * (`std::move`-ing it or using raw `buffer()` also counts as in use).
364    *
365    * @param A Standard vector
366    * @param partial_view which part of the matrix is used
367    *
368    * @throw <code>std::system_error</code> if the memory on the device could not
369    * be allocated
370    */
371   template <typename Vec, require_std_vector_t<Vec>* = nullptr,
372             require_vt_same<Vec, T>* = nullptr>
matrix_cl(Vec && A,matrix_cl_view partial_view=matrix_cl_view::Entire)373   explicit matrix_cl(Vec&& A,
374                      matrix_cl_view partial_view = matrix_cl_view::Entire)
375       : matrix_cl(std::forward<Vec>(A), A.size(), 1) {}
376 
377   /**
378    * Construct from \c std::vector with given rows and columns.
379    *
380    * If a lvalue is passed to this constructor, it might be directly used by the
381    * device. The caller must make sure that it does not go out of scope as long
382    * as this `matrix_cl` is in use `std::move`-ing it or using raw `buffer()`
383    * also counts as in use).
384    *
385    * @param A Standard vector
386    * @param R Number of rows the matrix should have.
387    * @param C Number of columns the matrix should have.
388    * @param partial_view which part of the matrix is used
389    *
390    * @throw <code>std::system_error</code> if the memory on the device could not
391    * be allocated
392    */
393   template <typename Vec, require_std_vector_t<Vec>* = nullptr,
394             require_vt_same<Vec, T>* = nullptr>
matrix_cl(Vec && A,const int & R,const int & C,matrix_cl_view partial_view=matrix_cl_view::Entire)395   explicit matrix_cl(Vec&& A, const int& R, const int& C,
396                      matrix_cl_view partial_view = matrix_cl_view::Entire)
397       : rows_(R), cols_(C), view_(partial_view) {
398     initialize_buffer_no_heap_if<std::is_lvalue_reference<Vec>::value>(A);
399   }
400 
401   /**
402    * Construct from \c array with given rows and columns.
403    *
404    * The memory might be directly used by the device. The caller must make sure
405    * that it does not go out of scope as long as this `matrix_cl` is in use
406    * (`std::move`-ing it or using raw `buffer()` also counts as in use).
407    *
408    * @param A array of doubles
409    * @param R Number of rows the matrix should have.
410    * @param C Number of columns the matrix should have.
411    * @param partial_view which part of the matrix is used
412    *
413    * @throw <code>std::system_error</code> if the memory on the device could not
414    * be allocated
415    */
416   template <typename U, require_same_t<T, U>* = nullptr>
matrix_cl(const U * A,const int & R,const int & C,matrix_cl_view partial_view=matrix_cl_view::Entire)417   explicit matrix_cl(const U* A, const int& R, const int& C,
418                      matrix_cl_view partial_view = matrix_cl_view::Entire)
419       : rows_(R), cols_(C), view_(partial_view) {
420     initialize_buffer(A);
421   }
422 
423   /**
424    * Construct from a kernel generator expression. It evaluates the expression
425    * into \c this.
426    * @tparam Expr type of the expression
427    * @param expression expression
428    */
429   // defined in kernel_generator/matrix_cl_conversion.hpp
430   template <typename Expr,
431             require_all_kernel_expressions_and_none_scalar_t<Expr>* = nullptr,
432             require_not_matrix_cl_t<Expr>* = nullptr>
433   matrix_cl(const Expr& expression);  // NOLINT(runtime/explicit)
434 
435   /**
436    * Move assignment operator.
437    */
operator =(matrix_cl<T> && a)438   matrix_cl<T>& operator=(matrix_cl<T>&& a) {
439     view_ = a.view();
440     rows_ = a.rows();
441     cols_ = a.cols();
442     this->wait_for_read_write_events();
443     buffer_cl_ = std::move(a.buffer_cl_);
444     write_events_ = std::move(a.write_events_);
445     read_events_ = std::move(a.read_events_);
446     return *this;
447   }
448 
449   /**
450    * Copy assignment operator.
451    */
operator =(const matrix_cl<T> & a)452   matrix_cl<T>& operator=(const matrix_cl<T>& a) {
453     this->view_ = a.view();
454     if (a.size() == 0) {
455       this->rows_ = a.rows();
456       this->cols_ = a.cols();
457       return *this;
458     }
459     this->wait_for_read_write_events();
460     if (size() != a.size()) {
461       buffer_cl_ = cl::Buffer(opencl_context.context(), CL_MEM_READ_WRITE,
462                               sizeof(T) * a.size());
463     }
464     this->rows_ = a.rows();
465     this->cols_ = a.cols();
466     initialize_buffer_cl(a);
467     return *this;
468   }
469 
470   /**
471    * Assignment of a kernel generator expression evaluates the expression into
472    * \c this.
473    * @tparam Expr type of the expression
474    * @param expression expression
475    */
476   // defined in kernel_generator/matrix_cl_conversion.hpp
477   template <typename Expr,
478             require_all_kernel_expressions_and_none_scalar_t<Expr>* = nullptr,
479             require_not_matrix_cl_t<Expr>* = nullptr>
480   matrix_cl<T>& operator=(const Expr& expression);
481 
482   /**
483    * Assignment of `arena_matrix_cl<T>`.
484    * @param other the `arena_matrix_cl<T>` to assign to
485    * this matrix
486    */
487   // defined in rev/arena_matrix_cl.hpp
488   matrix_cl<T>& operator=(const arena_matrix_cl<T>& other);
489 
490   /**
491    * Evaluates `this`. This is a no-op.
492    * @return `*this`
493    */
eval() const494   const matrix_cl<T>& eval() const& { return *this; }
eval()495   matrix_cl<T> eval() && { return std::move(*this); }
496 
497   /**
498    * Destructor waits for write events to prevent any kernels from writing
499    * memory that has already been reused.
500    */
~matrix_cl()501   ~matrix_cl() { wait_for_read_write_events(); }
502 
503  private:
504   /**
505    * Initializes the OpenCL buffer of this matrix by copying the data from given
506    * buffer. Assumes that size of \c this is already set and matches the
507    * buffer size.
508    *
509    * The caller must make sure that data is not deleted as long as
510    * this `matrix_cl` is in use (`std::move`-ing it or using raw `buffer()` also
511    * counts as in use).
512    *
513    * @tparam in_order whether copying must be done in order
514    * efficiently use it directly
515    * @param A pointer to buffer
516    * @return event for the copy
517    */
518   template <bool in_order = false>
initialize_buffer(const T * A)519   cl::Event initialize_buffer(const T* A) {
520     cl::Event transfer_event;
521     if (size() == 0) {
522       return transfer_event;
523     }
524     cl::Context& ctx = opencl_context.context();
525     cl::CommandQueue& queue = opencl_context.queue();
526     try {
527       buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
528       queue.enqueueWriteBuffer(buffer_cl_,
529                                opencl_context.in_order() || in_order, 0,
530                                sizeof(T) * size(), A, nullptr, &transfer_event);
531       this->add_write_event(transfer_event);
532     } catch (const cl::Error& e) {
533       check_opencl_error("initialize_buffer", e);
534     }
535     return transfer_event;
536   }
537 
538   template <bool in_order = false>
initialize_buffer(T * A)539   cl::Event initialize_buffer(T* A) {
540     cl::Event transfer_event;
541     if (size() == 0) {
542       return transfer_event;
543     }
544     cl::Context& ctx = opencl_context.context();
545     cl::CommandQueue& queue = opencl_context.queue();
546     try {
547       if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
548         buffer_cl_
549             = cl::Buffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
550                          sizeof(T) * size(), A);  // this is always synchronous
551       } else {
552         buffer_cl_ = cl::Buffer(ctx, CL_MEM_READ_WRITE, sizeof(T) * size());
553         queue.enqueueWriteBuffer(
554             buffer_cl_, opencl_context.in_order() || in_order, 0,
555             sizeof(T) * size(), A, nullptr, &transfer_event);
556         this->add_write_event(transfer_event);
557       }
558     } catch (const cl::Error& e) {
559       check_opencl_error("initialize_buffer", e);
560     }
561     return transfer_event;
562   }
563 
564   /**
565    * Initializes the OpenCL buffer of this matrix by copying the data from given
566    * object. Assumes that size of \c this is already set and matches the
567    * buffer size. If No_heap is false the object is first moved to heap
568    * and callback is set to delete it after copying to OpenCL device is
569    * complete. Otherwise the caller must make sure that input object is not
570    * deleted as long as this `matrix_cl` is in use (`std::move`-ing it or using
571    * raw `buffer()` also counts).
572    *
573    * @tparam No_heap whether to move the object to heap first
574    * @tparam U type of object
575    * @param obj object
576    * @return event for the copy
577    */
578   template <bool No_heap, typename U, std::enable_if_t<No_heap>* = nullptr>
initialize_buffer_no_heap_if(U && obj)579   void initialize_buffer_no_heap_if(U&& obj) {
580     if (size() == 0) {
581       return;
582     }
583     initialize_buffer(obj.data());
584   }
585   // we need separate overloads as obj.data() might not be available when second
586   // overload is called.
587   template <bool No_heap, typename U, std::enable_if_t<!No_heap>* = nullptr>
initialize_buffer_no_heap_if(U && obj)588   void initialize_buffer_no_heap_if(U&& obj) {
589     using U_val = std::decay_t<ref_type_for_opencl_t<U>>;
590     if (size() == 0) {
591       return;
592     }
593     auto* obj_heap = new U_val(std::move(obj));
594     try {
595       cl::Event e = initialize_buffer(obj_heap->data());
596       if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
597         buffer_cl_.setDestructorCallback(&delete_it_destructor<U_val>,
598                                          obj_heap);
599       } else {
600         e.setCallback(CL_COMPLETE, &delete_it_event<U_val>, obj_heap);
601       }
602     } catch (...) {
603       delete obj_heap;
604       throw;
605     }
606   }
607 
608   /**
609    * Initializes the OpenCL buffer of this matrix by copying the data from given
610    * matrix_cl. Assumes that size of \c this is already set and matches the
611    * size of given matrix.
612    * @param A matrix_cl
613    */
initialize_buffer_cl(const matrix_cl<T> & A)614   void initialize_buffer_cl(const matrix_cl<T>& A) {
615     try {
616       cl::Event cstr_event;
617       opencl_context.queue().enqueueCopyBuffer(A.buffer(), this->buffer(), 0, 0,
618                                                A.size() * sizeof(T),
619                                                &A.write_events(), &cstr_event);
620       this->add_write_event(cstr_event);
621       A.add_read_event(cstr_event);
622     } catch (const cl::Error& e) {
623       check_opencl_error("copy (OpenCL)->(OpenCL)", e);
624     }
625   }
626 
627   /**
628    * Deletes the container. Used as a callback for OpenCL event.
629    * @tparam U type of container
630    * @param e cl_event handle
631    * @param status status of event
632    * @param container container to delete
633    */
634   template <typename U>
delete_it_event(cl_event e,cl_int status,void * container)635   static void delete_it_event(cl_event e, cl_int status, void* container) {
636     delete static_cast<U*>(container);
637   }
638 
639   /**
640    * Deletes the container. Used as a callback for destruction of `cl::Buffer`.
641    * @tparam U type of container
642    * @param buff buffer that is being destructed
643    * @param container container to delete
644    */
645   template <typename U>
delete_it_destructor(cl_mem buff,void * container)646   static void delete_it_destructor(cl_mem buff, void* container) {
647     delete static_cast<U*>(container);
648   }
649 };
650 /** @}*/
651 
652 }  // namespace math
653 }  // namespace stan
654 
655 #endif
656 #endif
657