1 #ifndef STAN_MATH_OPENCL_KERNEL_GENERATOR_COLWISE_REDUCTION_HPP
2 #define STAN_MATH_OPENCL_KERNEL_GENERATOR_COLWISE_REDUCTION_HPP
3 #ifdef STAN_OPENCL
4 
5 #include <stan/math/prim/meta.hpp>
6 #include <stan/math/opencl/opencl_context.hpp>
7 #include <stan/math/opencl/matrix_cl_view.hpp>
8 #include <stan/math/opencl/kernel_generator/type_str.hpp>
9 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
10 #include <stan/math/opencl/kernel_generator/operation_cl.hpp>
11 #include <stan/math/opencl/kernel_generator/as_operation_cl.hpp>
12 #include <stan/math/opencl/kernel_generator/rowwise_reduction.hpp>
13 #include <stan/math/opencl/kernel_generator/calc_if.hpp>
14 #include <map>
15 #include <string>
16 #include <type_traits>
17 #include <utility>
18 
19 namespace stan {
20 namespace math {
21 /** \addtogroup opencl_kernel_generator
22  *  @{
23  */
24 
25 namespace internal {
26 class colwise_reduction_base {};
27 
28 /**
29  * Determine number of work groups in rows direction that will be run fro
30  * colwise reduction of given size.
31  * @param n_rows number of rows of expression to resuce
32  * @param n_cols number of columns of expression to resuce
33  * @return number of work groups in rows direction
34  */
colwise_reduction_wgs_rows(int n_rows,int n_cols)35 inline int colwise_reduction_wgs_rows(int n_rows, int n_cols) {
36   int local = opencl_context.base_opts().at("LOCAL_SIZE_");
37   int preferred_work_groups
38       = opencl_context.device()[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() * 16;
39   // round up n_rows/local/n_cols
40   return (std::min(preferred_work_groups, (n_rows + local - 1) / local) + n_cols
41           - 1)
42          / n_cols;
43 }
44 }  // namespace internal
45 
46 /**
47  * Represents a column wise reduction in kernel generator expressions. So as to
48  * be efficient column wise reductions are only done partially. That means
49  * instead of 1 row kernel output will have a few rows that need to be reduced
50  * to obtain final result (actually it is 1
51  * result per work group run - roughly 16 times the number of compute units on
52  * the OpenCL device). This can be done in a separate kernel or after
53  * copying to CPU. Also column wise reductions can not be used as arguments to
54  * other operations - they can only be evaluated.
55  * @tparam Derived derived type
56  * @tparam T type of first argument
57  * @tparam Operation type with member function generate that accepts two
58  * variable names and returns OpenCL source code for reduction operation_cl
59  */
60 template <typename Derived, typename T, typename Operation>
61 class colwise_reduction
62     : public internal::colwise_reduction_base,
63       public operation_cl<Derived, typename std::remove_reference_t<T>::Scalar,
64                           T> {
65  public:
66   using Scalar = typename std::remove_reference_t<T>::Scalar;
67   using base = operation_cl<Derived, Scalar, T>;
68   using base::var_name_;
69   static const bool require_specific_local_size = true;
70 
71  protected:
72   std::string init_;
73   using base::derived;
74 
75  public:
76   using base::cols;
77   /**
78    * Constructor
79    * @param a the expression to reduce
80    * @param init OpenCL source code of initialization value for reduction
81    */
colwise_reduction(T && a,const std::string & init)82   explicit colwise_reduction(T&& a, const std::string& init)
83       : base(std::forward<T>(a)), init_(init) {}
84 
85   /**
86    * Generates kernel code for assigning this expression into result expression.
87    * @param[in,out] generated map from (pointer to) already generated local
88    * operations to variable names
89    * @param[in,out] generated_all map from (pointer to) already generated all
90    * operations to variable names
91    * @param ng name generator for this kernel
92    * @param row_index_name row index variable name
93    * @param col_index_name column index variable name
94    * @param result expression into which result is to be assigned
95    * @return part of kernel with code for this and nested expressions
96    */
97   template <typename T_result>
get_whole_kernel_parts(std::map<const void *,const char * > & generated,std::map<const void *,const char * > & generated_all,name_generator & ng,const std::string & row_index_name,const std::string & col_index_name,const T_result & result) const98   kernel_parts get_whole_kernel_parts(
99       std::map<const void*, const char*>& generated,
100       std::map<const void*, const char*>& generated_all, name_generator& ng,
101       const std::string& row_index_name, const std::string& col_index_name,
102       const T_result& result) const {
103     kernel_parts parts = derived().get_kernel_parts(
104         generated, generated_all, ng, row_index_name, col_index_name, false);
105     kernel_parts out_parts = result.get_kernel_parts_lhs(
106         generated, generated_all, ng, row_index_name, col_index_name);
107 
108     parts.args += out_parts.args;
109     parts.reduction_1d += "if (lid_i == 0) {\n"
110                      + result.var_name_
111                      + "_global[j * n_groups_i + wg_id_i] = "
112                      + derived().var_name_ + "_local[0];\n"
113                      "}\n";
114     return parts;
115   }
116 
117   /**
118    * Generates kernel code for this and nested expressions.
119    * @param row_index_name row index variable name
120    * @param col_index_name column index variable name
121    * @param view_handled whether whether caller already handled matrix view
122    * @param var_name_arg name of the variable in kernel that holds argument to
123    * this expression
124    * @return part of kernel with code for this and nested expressions
125    */
generate(const std::string & row_index_name,const std::string & col_index_name,const bool view_handled,const std::string & var_name_arg) const126   inline kernel_parts generate(const std::string& row_index_name,
127                                const std::string& col_index_name,
128                                const bool view_handled,
129                                const std::string& var_name_arg) const {
130     kernel_parts res;
131     res.declarations = "__local " + type_str<Scalar>() + " " + var_name_
132                        + "_local[LOCAL_SIZE_];\n" + type_str<Scalar>() + " "
133                        + var_name_ + ";\n";
134     res.initialization = var_name_ + " = " + init_ + ";\n";
135     res.body = var_name_ + " = " + Operation::generate(var_name_, var_name_arg)
136                + ";\n";
137     res.reduction_1d =
138           var_name_ + "_local[lid_i] = " + var_name_ + ";\n"
139           "barrier(CLK_LOCAL_MEM_FENCE);\n"
140           "for (int step = lsize_i / REDUCTION_STEP_SIZE; "
141                 "step > 0; step /= REDUCTION_STEP_SIZE) {\n"
142           "  if (lid_i < step) {\n"
143           "    for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {\n"
144           "      " + var_name_ + "_local[lid_i] = " +
145         Operation::generate(var_name_ + "_local[lid_i]",
146                             var_name_ + "_local[lid_i + step * i]") + ";\n"
147           "    }\n"
148           "  }\n"
149           "  barrier(CLK_LOCAL_MEM_FENCE);\n"
150           "}\n";
151     return res;
152   }
153 
154   /**
155    * Number of rows of a matrix that would be the result of evaluating this
156    * expression.
157    * @return number of rows
158    */
rows() const159   inline int rows() const {
160     int arg_rows = this->template get_arg<0>().rows();
161     int arg_cols = this->template get_arg<0>().cols();
162     if (arg_cols == 0) {
163       return 1;
164     }
165     if (arg_cols == -1) {
166       return -1;
167     }
168     return internal::colwise_reduction_wgs_rows(arg_rows, arg_cols);
169   }
170 
171   /**
172    * Number of rows threads need to be launched for.
173    * @return number of rows
174    */
thread_rows() const175   inline int thread_rows() const { return this->template get_arg<0>().rows(); }
176 
177   /**
178    * Determine indices of extreme sub- and superdiagonals written.
179    * @return pair of indices - bottom and top diagonal
180    */
extreme_diagonals() const181   inline std::pair<int, int> extreme_diagonals() const {
182     return {-rows() + 1, cols() - 1};
183   }
184 };  // namespace math
185 
186 /**
187  * Represents column wise sum - reduction in kernel generator expressions.
188  * @tparam T type of expression
189  */
190 template <typename T>
191 class colwise_sum_ : public colwise_reduction<colwise_sum_<T>, T, sum_op> {
192   using base = colwise_reduction<colwise_sum_<T>, T, sum_op>;
193   using base::arguments_;
194 
195  public:
colwise_sum_(T && a)196   explicit colwise_sum_(T&& a)
197       : colwise_reduction<colwise_sum_<T>, T, sum_op>(std::forward<T>(a), "0") {
198   }
199   /**
200    * Creates a deep copy of this expression.
201    * @return copy of \c *this
202    */
deep_copy() const203   inline auto deep_copy() const {
204     auto&& arg_copy = this->template get_arg<0>().deep_copy();
205     return colwise_sum_<std::remove_reference_t<decltype(arg_copy)>>(
206         std::move(arg_copy));
207   }
208 };
209 
210 /**
211  * Column wise sum - reduction of a kernel generator expression. So as to
212  * be efficient column wise reductions are only done partially. That means
213  * instead of 1 row kernel output will have a few rows that need to be reduced
214  * to obtain the final result (actually it is 1 result per work group run -
215  * roughly 16 times the number of compute units on the OpenCL device). This can
216  * be done in a separate kernel or after copying to CPU. Also column wise
217  * reductions can not be used as arguments to other operations - they can only
218  * be evaluated.
219  * @tparam T type of input expression
220  * @param a the expression to reduce
221  * @return sum
222  */
223 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_sum(T && a)224 inline auto colwise_sum(T&& a) {
225   auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
226   return colwise_sum_<as_operation_cl_t<T>>(
227       as_operation_cl(std::forward<T>(a)));
228 }
229 
230 /**
231  * Represents column wise product - reduction in kernel generator expressions.
232  * @tparam T type of expression
233  */
234 template <typename T>
235 class colwise_prod_ : public colwise_reduction<colwise_prod_<T>, T, prod_op> {
236   using base = colwise_reduction<colwise_prod_<T>, T, prod_op>;
237   using base::arguments_;
238 
239  public:
colwise_prod_(T && a)240   explicit colwise_prod_(T&& a)
241       : colwise_reduction<colwise_prod_<T>, T, prod_op>(std::forward<T>(a),
242                                                         "1") {}
243   /**
244    * Creates a deep copy of this expression.
245    * @return copy of \c *this
246    */
deep_copy() const247   inline auto deep_copy() const {
248     auto&& arg_copy = this->template get_arg<0>().deep_copy();
249     return colwise_prod_<std::remove_reference_t<decltype(arg_copy)>>(
250         std::move(arg_copy));
251   }
252 };
253 
254 /**
255  * Column wise product - reduction of a kernel generator expression. So as to
256  * be efficient column wise reductions are only done partially. That means
257  * instead of 1 row kernel output will have a few rows that need to be reduced
258  * to obtain the final result (actually it is 1 result per work group run -
259  * roughly 16 times the number of compute units on the OpenCL device). This can
260  * be done in a separate kernel or after copying to CPU. Also column wise
261  * reductions can not be used as arguments to other operations - they can only
262  * be evaluated.
263  * @tparam T type of input expression
264  * @param a the expression to reduce
265  * @return prod
266  */
267 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_prod(T && a)268 inline auto colwise_prod(T&& a) {
269   auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
270   return colwise_prod_<as_operation_cl_t<T>>(
271       as_operation_cl(std::forward<T>(a)));
272 }
273 
274 /**
275  * Represents column wise max - reduction in kernel generator expressions.
276  * @tparam T type of expression
277  */
278 template <typename T>
279 class colwise_max_ : public colwise_reduction<
280                          colwise_max_<T>, T,
281                          max_op<typename std::remove_reference_t<T>::Scalar>> {
282   using base
283       = colwise_reduction<colwise_max_<T>, T,
284                           max_op<typename std::remove_reference_t<T>::Scalar>>;
285   using base::arguments_;
286 
287  public:
288   using op = max_op<typename std::remove_reference_t<T>::Scalar>;
colwise_max_(T && a)289   explicit colwise_max_(T&& a)
290       : colwise_reduction<colwise_max_<T>, T, op>(std::forward<T>(a),
291                                                   op::init()) {}
292   /**
293    * Creates a deep copy of this expression.
294    * @return copy of \c *this
295    */
deep_copy() const296   inline auto deep_copy() const {
297     auto&& arg_copy = this->template get_arg<0>().deep_copy();
298     return colwise_max_<std::remove_reference_t<decltype(arg_copy)>>(
299         std::move(arg_copy));
300   }
301 };
302 
303 /**
304  * Column wise max - reduction of a kernel generator expression. So as to
305  * be efficient column wise reductions are only done partially. That means
306  * instead of 1 row kernel output will have a few rows that need to be reduced
307  * to obtain the final result (actually it is 1 result per work group run -
308  * roughly 16 times the number of compute units on the OpenCL device). This can
309  * be done in a separate kernel or after copying to CPU. Also column wise
310  * reductions can not be used as arguments to other operations - they can only
311  * be evaluated.
312  * @tparam T type of input expression
313  * @param a the expression to reduce
314  * @return max
315  */
316 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_max(T && a)317 inline auto colwise_max(T&& a) {
318   auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
319   return colwise_max_<as_operation_cl_t<T>>(
320       as_operation_cl(std::forward<T>(a)));
321 }
322 
323 /**
324  * Represents column wise min - reduction in kernel generator expressions.
325  * @tparam T type of expression
326  */
327 template <typename T>
328 class colwise_min_ : public colwise_reduction<
329                          colwise_min_<T>, T,
330                          min_op<typename std::remove_reference_t<T>::Scalar>> {
331   using base
332       = colwise_reduction<colwise_min_<T>, T,
333                           min_op<typename std::remove_reference_t<T>::Scalar>>;
334   using base::arguments_;
335 
336  public:
337   using op = min_op<typename std::remove_reference_t<T>::Scalar>;
colwise_min_(T && a)338   explicit colwise_min_(T&& a)
339       : colwise_reduction<colwise_min_<T>, T, op>(std::forward<T>(a),
340                                                   op::init()) {}
341   /**
342    * Creates a deep copy of this expression.
343    * @return copy of \c *this
344    */
deep_copy() const345   inline auto deep_copy() const {
346     auto&& arg_copy = this->template get_arg<0>().deep_copy();
347     return colwise_min_<std::remove_reference_t<decltype(arg_copy)>>(
348         std::move(arg_copy));
349   }
350 };
351 
352 /**
353  * Column wise min - reduction of a kernel generator expression.  So as to
354  * be efficient column wise reductions are only done partially. That means
355  * instead of 1 row kernel output will have a few rows that need to be reduced
356  * to obtain the final result (actually it is 1 result per work group run -
357  * roughly 16 times the number of compute units on the OpenCL device). This can
358  * be done in a separate kernel or after copying to CPU. Also column wise
359  * reductions can not be used as arguments to other operations - they can only
360  * be evaluated.
361  * @tparam T type of input expression
362  * @param a the expression to reduce
363  * @return min
364  */
365 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_min(T && a)366 inline auto colwise_min(T&& a) {
367   return colwise_min_<as_operation_cl_t<T>>(
368       as_operation_cl(std::forward<T>(a)));
369 }
370 
371 namespace internal {
372 template <typename T>
373 struct is_colwise_reduction_impl
374     : public std::is_base_of<internal::colwise_reduction_base,
375                              std::decay_t<T>> {};
376 template <typename T>
377 struct is_colwise_reduction_impl<calc_if_<true, T>>
378     : public std::is_base_of<internal::colwise_reduction_base,
379                              std::decay_t<T>> {};
380 }  // namespace internal
381 
382 /**
383  * Check whether a kernel generator expression is a colwise reduction.
384  */
385 template <typename T>
386 using is_colwise_reduction
387     = internal::is_colwise_reduction_impl<std::decay_t<T>>;
388 
389 /** @}*/
390 }  // namespace math
391 }  // namespace stan
392 #endif
393 #endif
394