1 #ifndef STAN_MATH_OPENCL_KERNEL_GENERATOR_COLWISE_REDUCTION_HPP
2 #define STAN_MATH_OPENCL_KERNEL_GENERATOR_COLWISE_REDUCTION_HPP
3 #ifdef STAN_OPENCL
4
5 #include <stan/math/prim/meta.hpp>
6 #include <stan/math/opencl/opencl_context.hpp>
7 #include <stan/math/opencl/matrix_cl_view.hpp>
8 #include <stan/math/opencl/kernel_generator/type_str.hpp>
9 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
10 #include <stan/math/opencl/kernel_generator/operation_cl.hpp>
11 #include <stan/math/opencl/kernel_generator/as_operation_cl.hpp>
12 #include <stan/math/opencl/kernel_generator/rowwise_reduction.hpp>
13 #include <stan/math/opencl/kernel_generator/calc_if.hpp>
14 #include <map>
15 #include <string>
16 #include <type_traits>
17 #include <utility>
18
19 namespace stan {
20 namespace math {
21 /** \addtogroup opencl_kernel_generator
22 * @{
23 */
24
25 namespace internal {
26 class colwise_reduction_base {};
27
28 /**
29 * Determine number of work groups in rows direction that will be run fro
30 * colwise reduction of given size.
31 * @param n_rows number of rows of expression to resuce
32 * @param n_cols number of columns of expression to resuce
33 * @return number of work groups in rows direction
34 */
colwise_reduction_wgs_rows(int n_rows,int n_cols)35 inline int colwise_reduction_wgs_rows(int n_rows, int n_cols) {
36 int local = opencl_context.base_opts().at("LOCAL_SIZE_");
37 int preferred_work_groups
38 = opencl_context.device()[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() * 16;
39 // round up n_rows/local/n_cols
40 return (std::min(preferred_work_groups, (n_rows + local - 1) / local) + n_cols
41 - 1)
42 / n_cols;
43 }
44 } // namespace internal
45
46 /**
47 * Represents a column wise reduction in kernel generator expressions. So as to
48 * be efficient column wise reductions are only done partially. That means
49 * instead of 1 row kernel output will have a few rows that need to be reduced
50 * to obtain final result (actually it is 1
51 * result per work group run - roughly 16 times the number of compute units on
52 * the OpenCL device). This can be done in a separate kernel or after
53 * copying to CPU. Also column wise reductions can not be used as arguments to
54 * other operations - they can only be evaluated.
55 * @tparam Derived derived type
56 * @tparam T type of first argument
57 * @tparam Operation type with member function generate that accepts two
58 * variable names and returns OpenCL source code for reduction operation_cl
59 */
60 template <typename Derived, typename T, typename Operation>
61 class colwise_reduction
62 : public internal::colwise_reduction_base,
63 public operation_cl<Derived, typename std::remove_reference_t<T>::Scalar,
64 T> {
65 public:
66 using Scalar = typename std::remove_reference_t<T>::Scalar;
67 using base = operation_cl<Derived, Scalar, T>;
68 using base::var_name_;
69 static const bool require_specific_local_size = true;
70
71 protected:
72 std::string init_;
73 using base::derived;
74
75 public:
76 using base::cols;
77 /**
78 * Constructor
79 * @param a the expression to reduce
80 * @param init OpenCL source code of initialization value for reduction
81 */
colwise_reduction(T && a,const std::string & init)82 explicit colwise_reduction(T&& a, const std::string& init)
83 : base(std::forward<T>(a)), init_(init) {}
84
85 /**
86 * Generates kernel code for assigning this expression into result expression.
87 * @param[in,out] generated map from (pointer to) already generated local
88 * operations to variable names
89 * @param[in,out] generated_all map from (pointer to) already generated all
90 * operations to variable names
91 * @param ng name generator for this kernel
92 * @param row_index_name row index variable name
93 * @param col_index_name column index variable name
94 * @param result expression into which result is to be assigned
95 * @return part of kernel with code for this and nested expressions
96 */
97 template <typename T_result>
get_whole_kernel_parts(std::map<const void *,const char * > & generated,std::map<const void *,const char * > & generated_all,name_generator & ng,const std::string & row_index_name,const std::string & col_index_name,const T_result & result) const98 kernel_parts get_whole_kernel_parts(
99 std::map<const void*, const char*>& generated,
100 std::map<const void*, const char*>& generated_all, name_generator& ng,
101 const std::string& row_index_name, const std::string& col_index_name,
102 const T_result& result) const {
103 kernel_parts parts = derived().get_kernel_parts(
104 generated, generated_all, ng, row_index_name, col_index_name, false);
105 kernel_parts out_parts = result.get_kernel_parts_lhs(
106 generated, generated_all, ng, row_index_name, col_index_name);
107
108 parts.args += out_parts.args;
109 parts.reduction_1d += "if (lid_i == 0) {\n"
110 + result.var_name_
111 + "_global[j * n_groups_i + wg_id_i] = "
112 + derived().var_name_ + "_local[0];\n"
113 "}\n";
114 return parts;
115 }
116
117 /**
118 * Generates kernel code for this and nested expressions.
119 * @param row_index_name row index variable name
120 * @param col_index_name column index variable name
121 * @param view_handled whether whether caller already handled matrix view
122 * @param var_name_arg name of the variable in kernel that holds argument to
123 * this expression
124 * @return part of kernel with code for this and nested expressions
125 */
generate(const std::string & row_index_name,const std::string & col_index_name,const bool view_handled,const std::string & var_name_arg) const126 inline kernel_parts generate(const std::string& row_index_name,
127 const std::string& col_index_name,
128 const bool view_handled,
129 const std::string& var_name_arg) const {
130 kernel_parts res;
131 res.declarations = "__local " + type_str<Scalar>() + " " + var_name_
132 + "_local[LOCAL_SIZE_];\n" + type_str<Scalar>() + " "
133 + var_name_ + ";\n";
134 res.initialization = var_name_ + " = " + init_ + ";\n";
135 res.body = var_name_ + " = " + Operation::generate(var_name_, var_name_arg)
136 + ";\n";
137 res.reduction_1d =
138 var_name_ + "_local[lid_i] = " + var_name_ + ";\n"
139 "barrier(CLK_LOCAL_MEM_FENCE);\n"
140 "for (int step = lsize_i / REDUCTION_STEP_SIZE; "
141 "step > 0; step /= REDUCTION_STEP_SIZE) {\n"
142 " if (lid_i < step) {\n"
143 " for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {\n"
144 " " + var_name_ + "_local[lid_i] = " +
145 Operation::generate(var_name_ + "_local[lid_i]",
146 var_name_ + "_local[lid_i + step * i]") + ";\n"
147 " }\n"
148 " }\n"
149 " barrier(CLK_LOCAL_MEM_FENCE);\n"
150 "}\n";
151 return res;
152 }
153
154 /**
155 * Number of rows of a matrix that would be the result of evaluating this
156 * expression.
157 * @return number of rows
158 */
rows() const159 inline int rows() const {
160 int arg_rows = this->template get_arg<0>().rows();
161 int arg_cols = this->template get_arg<0>().cols();
162 if (arg_cols == 0) {
163 return 1;
164 }
165 if (arg_cols == -1) {
166 return -1;
167 }
168 return internal::colwise_reduction_wgs_rows(arg_rows, arg_cols);
169 }
170
171 /**
172 * Number of rows threads need to be launched for.
173 * @return number of rows
174 */
thread_rows() const175 inline int thread_rows() const { return this->template get_arg<0>().rows(); }
176
177 /**
178 * Determine indices of extreme sub- and superdiagonals written.
179 * @return pair of indices - bottom and top diagonal
180 */
extreme_diagonals() const181 inline std::pair<int, int> extreme_diagonals() const {
182 return {-rows() + 1, cols() - 1};
183 }
184 }; // namespace math
185
186 /**
187 * Represents column wise sum - reduction in kernel generator expressions.
188 * @tparam T type of expression
189 */
190 template <typename T>
191 class colwise_sum_ : public colwise_reduction<colwise_sum_<T>, T, sum_op> {
192 using base = colwise_reduction<colwise_sum_<T>, T, sum_op>;
193 using base::arguments_;
194
195 public:
colwise_sum_(T && a)196 explicit colwise_sum_(T&& a)
197 : colwise_reduction<colwise_sum_<T>, T, sum_op>(std::forward<T>(a), "0") {
198 }
199 /**
200 * Creates a deep copy of this expression.
201 * @return copy of \c *this
202 */
deep_copy() const203 inline auto deep_copy() const {
204 auto&& arg_copy = this->template get_arg<0>().deep_copy();
205 return colwise_sum_<std::remove_reference_t<decltype(arg_copy)>>(
206 std::move(arg_copy));
207 }
208 };
209
210 /**
211 * Column wise sum - reduction of a kernel generator expression. So as to
212 * be efficient column wise reductions are only done partially. That means
213 * instead of 1 row kernel output will have a few rows that need to be reduced
214 * to obtain the final result (actually it is 1 result per work group run -
215 * roughly 16 times the number of compute units on the OpenCL device). This can
216 * be done in a separate kernel or after copying to CPU. Also column wise
217 * reductions can not be used as arguments to other operations - they can only
218 * be evaluated.
219 * @tparam T type of input expression
220 * @param a the expression to reduce
221 * @return sum
222 */
223 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_sum(T && a)224 inline auto colwise_sum(T&& a) {
225 auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
226 return colwise_sum_<as_operation_cl_t<T>>(
227 as_operation_cl(std::forward<T>(a)));
228 }
229
230 /**
231 * Represents column wise product - reduction in kernel generator expressions.
232 * @tparam T type of expression
233 */
234 template <typename T>
235 class colwise_prod_ : public colwise_reduction<colwise_prod_<T>, T, prod_op> {
236 using base = colwise_reduction<colwise_prod_<T>, T, prod_op>;
237 using base::arguments_;
238
239 public:
colwise_prod_(T && a)240 explicit colwise_prod_(T&& a)
241 : colwise_reduction<colwise_prod_<T>, T, prod_op>(std::forward<T>(a),
242 "1") {}
243 /**
244 * Creates a deep copy of this expression.
245 * @return copy of \c *this
246 */
deep_copy() const247 inline auto deep_copy() const {
248 auto&& arg_copy = this->template get_arg<0>().deep_copy();
249 return colwise_prod_<std::remove_reference_t<decltype(arg_copy)>>(
250 std::move(arg_copy));
251 }
252 };
253
254 /**
255 * Column wise product - reduction of a kernel generator expression. So as to
256 * be efficient column wise reductions are only done partially. That means
257 * instead of 1 row kernel output will have a few rows that need to be reduced
258 * to obtain the final result (actually it is 1 result per work group run -
259 * roughly 16 times the number of compute units on the OpenCL device). This can
260 * be done in a separate kernel or after copying to CPU. Also column wise
261 * reductions can not be used as arguments to other operations - they can only
262 * be evaluated.
263 * @tparam T type of input expression
264 * @param a the expression to reduce
265 * @return prod
266 */
267 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_prod(T && a)268 inline auto colwise_prod(T&& a) {
269 auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
270 return colwise_prod_<as_operation_cl_t<T>>(
271 as_operation_cl(std::forward<T>(a)));
272 }
273
274 /**
275 * Represents column wise max - reduction in kernel generator expressions.
276 * @tparam T type of expression
277 */
278 template <typename T>
279 class colwise_max_ : public colwise_reduction<
280 colwise_max_<T>, T,
281 max_op<typename std::remove_reference_t<T>::Scalar>> {
282 using base
283 = colwise_reduction<colwise_max_<T>, T,
284 max_op<typename std::remove_reference_t<T>::Scalar>>;
285 using base::arguments_;
286
287 public:
288 using op = max_op<typename std::remove_reference_t<T>::Scalar>;
colwise_max_(T && a)289 explicit colwise_max_(T&& a)
290 : colwise_reduction<colwise_max_<T>, T, op>(std::forward<T>(a),
291 op::init()) {}
292 /**
293 * Creates a deep copy of this expression.
294 * @return copy of \c *this
295 */
deep_copy() const296 inline auto deep_copy() const {
297 auto&& arg_copy = this->template get_arg<0>().deep_copy();
298 return colwise_max_<std::remove_reference_t<decltype(arg_copy)>>(
299 std::move(arg_copy));
300 }
301 };
302
303 /**
304 * Column wise max - reduction of a kernel generator expression. So as to
305 * be efficient column wise reductions are only done partially. That means
306 * instead of 1 row kernel output will have a few rows that need to be reduced
307 * to obtain the final result (actually it is 1 result per work group run -
308 * roughly 16 times the number of compute units on the OpenCL device). This can
309 * be done in a separate kernel or after copying to CPU. Also column wise
310 * reductions can not be used as arguments to other operations - they can only
311 * be evaluated.
312 * @tparam T type of input expression
313 * @param a the expression to reduce
314 * @return max
315 */
316 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_max(T && a)317 inline auto colwise_max(T&& a) {
318 auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
319 return colwise_max_<as_operation_cl_t<T>>(
320 as_operation_cl(std::forward<T>(a)));
321 }
322
323 /**
324 * Represents column wise min - reduction in kernel generator expressions.
325 * @tparam T type of expression
326 */
327 template <typename T>
328 class colwise_min_ : public colwise_reduction<
329 colwise_min_<T>, T,
330 min_op<typename std::remove_reference_t<T>::Scalar>> {
331 using base
332 = colwise_reduction<colwise_min_<T>, T,
333 min_op<typename std::remove_reference_t<T>::Scalar>>;
334 using base::arguments_;
335
336 public:
337 using op = min_op<typename std::remove_reference_t<T>::Scalar>;
colwise_min_(T && a)338 explicit colwise_min_(T&& a)
339 : colwise_reduction<colwise_min_<T>, T, op>(std::forward<T>(a),
340 op::init()) {}
341 /**
342 * Creates a deep copy of this expression.
343 * @return copy of \c *this
344 */
deep_copy() const345 inline auto deep_copy() const {
346 auto&& arg_copy = this->template get_arg<0>().deep_copy();
347 return colwise_min_<std::remove_reference_t<decltype(arg_copy)>>(
348 std::move(arg_copy));
349 }
350 };
351
352 /**
353 * Column wise min - reduction of a kernel generator expression. So as to
354 * be efficient column wise reductions are only done partially. That means
355 * instead of 1 row kernel output will have a few rows that need to be reduced
356 * to obtain the final result (actually it is 1 result per work group run -
357 * roughly 16 times the number of compute units on the OpenCL device). This can
358 * be done in a separate kernel or after copying to CPU. Also column wise
359 * reductions can not be used as arguments to other operations - they can only
360 * be evaluated.
361 * @tparam T type of input expression
362 * @param a the expression to reduce
363 * @return min
364 */
365 template <typename T, require_all_kernel_expressions_t<T>* = nullptr>
colwise_min(T && a)366 inline auto colwise_min(T&& a) {
367 return colwise_min_<as_operation_cl_t<T>>(
368 as_operation_cl(std::forward<T>(a)));
369 }
370
371 namespace internal {
372 template <typename T>
373 struct is_colwise_reduction_impl
374 : public std::is_base_of<internal::colwise_reduction_base,
375 std::decay_t<T>> {};
376 template <typename T>
377 struct is_colwise_reduction_impl<calc_if_<true, T>>
378 : public std::is_base_of<internal::colwise_reduction_base,
379 std::decay_t<T>> {};
380 } // namespace internal
381
382 /**
383 * Check whether a kernel generator expression is a colwise reduction.
384 */
385 template <typename T>
386 using is_colwise_reduction
387 = internal::is_colwise_reduction_impl<std::decay_t<T>>;
388
389 /** @}*/
390 } // namespace math
391 } // namespace stan
392 #endif
393 #endif
394