1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
13 
14 #include <iterator>
15 
16 #include <boost/static_assert.hpp>
17 
18 #include <boost/compute/system.hpp>
19 #include <boost/compute/functional.hpp>
20 #include <boost/compute/detail/meta_kernel.hpp>
21 #include <boost/compute/command_queue.hpp>
22 #include <boost/compute/container/array.hpp>
23 #include <boost/compute/container/vector.hpp>
24 #include <boost/compute/algorithm/copy_n.hpp>
25 #include <boost/compute/algorithm/detail/inplace_reduce.hpp>
26 #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
27 #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
28 #include <boost/compute/detail/iterator_range_size.hpp>
29 #include <boost/compute/memory/local_buffer.hpp>
30 #include <boost/compute/type_traits/result_of.hpp>
31 #include <boost/compute/type_traits/is_device_iterator.hpp>
32 
33 namespace boost {
34 namespace compute {
35 namespace detail {
36 
37 template<class InputIterator, class OutputIterator, class BinaryFunction>
reduce(InputIterator first,size_t count,OutputIterator result,size_t block_size,BinaryFunction function,command_queue & queue)38 size_t reduce(InputIterator first,
39               size_t count,
40               OutputIterator result,
41               size_t block_size,
42               BinaryFunction function,
43               command_queue &queue)
44 {
45     typedef typename
46         std::iterator_traits<InputIterator>::value_type
47         input_type;
48     typedef typename
49         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
50         result_type;
51 
52     const context &context = queue.get_context();
53     size_t block_count = count / 2 / block_size;
54     size_t total_block_count =
55         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
56 
57     if(block_count != 0){
58         meta_kernel k("block_reduce");
59         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
60         size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
61 
62         k <<
63             "const uint gid = get_global_id(0);\n" <<
64             "const uint lid = get_local_id(0);\n" <<
65 
66             // copy values to local memory
67             "block[lid] = " <<
68                 function(first[k.make_var<uint_>("gid*2+0")],
69                          first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
70 
71             // perform reduction
72             "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
73             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
74             "    uint mask = (i << 1) - 1;\n" <<
75             "    if((lid & mask) == 0){\n" <<
76             "        block[lid] = " <<
77                          function(k.expr<input_type>("block[lid]"),
78                                   k.expr<input_type>("block[lid+i]")) << ";\n" <<
79             "    }\n" <<
80             "}\n" <<
81 
82             // write block result to global output
83             "if(lid == 0)\n" <<
84             "    output[get_group_id(0)] = block[0];\n";
85 
86         kernel kernel = k.compile(context);
87         kernel.set_arg(output_arg, result.get_buffer());
88         kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
89 
90         queue.enqueue_1d_range_kernel(kernel,
91                                       0,
92                                       block_count * block_size,
93                                       block_size);
94     }
95 
96     // serially reduce any leftovers
97     if(block_count * block_size * 2 < count){
98         size_t last_block_start = block_count * block_size * 2;
99 
100         meta_kernel k("extra_serial_reduce");
101         size_t count_arg = k.add_arg<uint_>("count");
102         size_t offset_arg = k.add_arg<uint_>("offset");
103         size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
104         size_t output_offset_arg = k.add_arg<uint_>("output_offset");
105 
106         k <<
107             k.decl<result_type>("result") << " = \n" <<
108                 first[k.expr<uint_>("offset")] << ";\n" <<
109             "for(uint i = offset + 1; i < count; i++)\n" <<
110             "    result = " <<
111                      function(k.var<result_type>("result"),
112                               first[k.var<uint_>("i")]) << ";\n" <<
113             "output[output_offset] = result;\n";
114 
115         kernel kernel = k.compile(context);
116         kernel.set_arg(count_arg, static_cast<uint_>(count));
117         kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
118         kernel.set_arg(output_arg, result.get_buffer());
119         kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
120 
121         queue.enqueue_task(kernel);
122     }
123 
124     return total_block_count;
125 }
126 
127 template<class InputIterator, class BinaryFunction>
128 inline vector<
129     typename boost::compute::result_of<
130         BinaryFunction(
131             typename std::iterator_traits<InputIterator>::value_type,
132             typename std::iterator_traits<InputIterator>::value_type
133         )
134     >::type
135 >
block_reduce(InputIterator first,size_t count,size_t block_size,BinaryFunction function,command_queue & queue)136 block_reduce(InputIterator first,
137              size_t count,
138              size_t block_size,
139              BinaryFunction function,
140              command_queue &queue)
141 {
142     typedef typename
143         std::iterator_traits<InputIterator>::value_type
144         input_type;
145     typedef typename
146         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
147         result_type;
148 
149     const context &context = queue.get_context();
150     size_t total_block_count =
151         static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
152     vector<result_type> result_vector(total_block_count, context);
153 
154     reduce(first, count, result_vector.begin(), block_size, function, queue);
155 
156     return result_vector;
157 }
158 
159 // Space complexity: O( ceil(n / 2 / 256) )
160 template<class InputIterator, class OutputIterator, class BinaryFunction>
generic_reduce(InputIterator first,InputIterator last,OutputIterator result,BinaryFunction function,command_queue & queue)161 inline void generic_reduce(InputIterator first,
162                            InputIterator last,
163                            OutputIterator result,
164                            BinaryFunction function,
165                            command_queue &queue)
166 {
167     typedef typename
168         std::iterator_traits<InputIterator>::value_type
169         input_type;
170     typedef typename
171         boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
172         result_type;
173 
174     const device &device = queue.get_device();
175     const context &context = queue.get_context();
176 
177     size_t count = detail::iterator_range_size(first, last);
178 
179     if(device.type() & device::cpu){
180         array<result_type, 1> value(context);
181         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
182         boost::compute::copy_n(value.begin(), 1, result, queue);
183     }
184     else {
185         size_t block_size = 256;
186 
187         // first pass
188         vector<result_type> results = detail::block_reduce(first,
189                                                            count,
190                                                            block_size,
191                                                            function,
192                                                            queue);
193 
194         if(results.size() > 1){
195             detail::inplace_reduce(results.begin(),
196                                    results.end(),
197                                    function,
198                                    queue);
199         }
200 
201         boost::compute::copy_n(results.begin(), 1, result, queue);
202     }
203 }
204 
205 template<class InputIterator, class OutputIterator, class T>
dispatch_reduce(InputIterator first,InputIterator last,OutputIterator result,const plus<T> & function,command_queue & queue)206 inline void dispatch_reduce(InputIterator first,
207                             InputIterator last,
208                             OutputIterator result,
209                             const plus<T> &function,
210                             command_queue &queue)
211 {
212     const context &context = queue.get_context();
213     const device &device = queue.get_device();
214 
215     // reduce to temporary buffer on device
216     array<T, 1> value(context);
217     if(device.type() & device::cpu){
218         detail::reduce_on_cpu(first, last, value.begin(), function, queue);
219     }
220     else {
221         reduce_on_gpu(first, last, value.begin(), function, queue);
222     }
223 
224     // copy to result iterator
225     copy_n(value.begin(), 1, result, queue);
226 }
227 
228 template<class InputIterator, class OutputIterator, class BinaryFunction>
dispatch_reduce(InputIterator first,InputIterator last,OutputIterator result,BinaryFunction function,command_queue & queue)229 inline void dispatch_reduce(InputIterator first,
230                             InputIterator last,
231                             OutputIterator result,
232                             BinaryFunction function,
233                             command_queue &queue)
234 {
235     generic_reduce(first, last, result, function, queue);
236 }
237 
238 } // end detail namespace
239 
240 /// Returns the result of applying \p function to the elements in the
241 /// range [\p first, \p last).
242 ///
243 /// If no function is specified, \c plus will be used.
244 ///
245 /// \param first first element in the input range
246 /// \param last last element in the input range
247 /// \param result iterator pointing to the output
248 /// \param function binary reduction function
249 /// \param queue command queue to perform the operation
250 ///
251 /// The \c reduce() algorithm assumes that the binary reduction function is
252 /// associative. When used with non-associative functions the result may
253 /// be non-deterministic and vary in precision. Notably this affects the
254 /// \c plus<float>() function as floating-point addition is not associative
255 /// and may produce slightly different results than a serial algorithm.
256 ///
257 /// This algorithm supports both host and device iterators for the
258 /// result argument. This allows for values to be reduced and copied
259 /// to the host all with a single function call.
260 ///
261 /// For example, to calculate the sum of the values in a device vector and
262 /// copy the result to a value on the host:
263 ///
264 /// \snippet test/test_reduce.cpp sum_int
265 ///
266 /// Note that while the the \c reduce() algorithm is conceptually identical to
267 /// the \c accumulate() algorithm, its implementation is substantially more
268 /// efficient on parallel hardware. For more information, see the documentation
269 /// on the \c accumulate() algorithm.
270 ///
271 /// Space complexity on GPUs: \Omega(n)<br>
272 /// Space complexity on CPUs: \Omega(1)
273 ///
274 /// \see accumulate()
275 template<class InputIterator, class OutputIterator, class BinaryFunction>
reduce(InputIterator first,InputIterator last,OutputIterator result,BinaryFunction function,command_queue & queue=system::default_queue ())276 inline void reduce(InputIterator first,
277                    InputIterator last,
278                    OutputIterator result,
279                    BinaryFunction function,
280                    command_queue &queue = system::default_queue())
281 {
282     BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
283     if(first == last){
284         return;
285     }
286 
287     detail::dispatch_reduce(first, last, result, function, queue);
288 }
289 
290 /// \overload
291 template<class InputIterator, class OutputIterator>
reduce(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue=system::default_queue ())292 inline void reduce(InputIterator first,
293                    InputIterator last,
294                    OutputIterator result,
295                    command_queue &queue = system::default_queue())
296 {
297     BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
298     typedef typename std::iterator_traits<InputIterator>::value_type T;
299 
300     if(first == last){
301         return;
302     }
303 
304     detail::dispatch_reduce(first, last, result, plus<T>(), queue);
305 }
306 
307 } // end compute namespace
308 } // end boost namespace
309 
310 #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
311