1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_HIP_SHUFFLE_REDUCE_HPP
46 #define KOKKOS_HIP_SHUFFLE_REDUCE_HPP
47 
48 #include <Kokkos_Macros.hpp>
49 
50 #if defined(__HIPCC__)
51 
52 #include <HIP/Kokkos_HIP_Vectorization.hpp>
53 
54 #include <climits>
55 
56 namespace Kokkos {
57 namespace Impl {
58 
/* Reduce `result` across the threads of one wavefront (warp) with shfl_down,
 * combining contributions via `join`, then broadcast the reduced value from
 * lane 0 back to every lane of the warp.
 *
 * Algorithmic constraints:
 *   (a) threads with the same threadIdx.x have same value
 *   (b) blockDim.x == power of two
 *   (x) blockDim.z == 1
 */
template <typename ValueType, typename JoinOp,
          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
                                  int>::type = 0>
__device__ inline void hip_intra_warp_shuffle_reduction(
    ValueType& result, JoinOp const& join,
    uint32_t const max_active_thread = blockDim.y) {
  unsigned int shift = 1;

  // Reduce over values from threads with different threadIdx.y.  The shuffle
  // distance is blockDim.x * shift so that lanes exchange values across
  // threadIdx.y rows (constraint (a): lanes in the same row share a value).
  unsigned int constexpr warp_size =
      Kokkos::Experimental::Impl::HIPTraits::WarpSize;
  while (blockDim.x * shift < warp_size) {
    ValueType const tmp =
        Kokkos::Experimental::shfl_down(result, blockDim.x * shift, warp_size);
    // Only join if upper thread is active (this allows non power of two for
    // blockDim.y)
    if (threadIdx.y + shift < max_active_thread) {
      join(result, tmp);
    }
    shift *= 2;
  }

  // Broadcast the result to all the threads in the warp
  result = Kokkos::Experimental::shfl(result, 0, warp_size);
}
89 
/* Combine the per-warp partial results of a block into a single value using a
 * small statically sized __shared__ staging area (step_width slots).  Warps
 * deposit / join their partials into the staging slots in rounds, then every
 * thread folds the slots together so all threads end up with the block-wide
 * result in `value`.  Assumes the intra-warp reduction already ran, so each
 * warp's threads hold that warp's partial value.
 */
template <typename ValueType, typename JoinOp,
          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
                                  int>::type = 0>
__device__ inline void hip_inter_warp_shuffle_reduction(
    ValueType& value, const JoinOp& join,
    const int max_active_thread = blockDim.y) {
  unsigned int constexpr warp_size =
      Kokkos::Experimental::Impl::HIPTraits::WarpSize;
  int constexpr step_width = 8;
  // Depending on the ValueType __shared__ memory must be aligned up to 8 byte
  // boundaries. The reason not to use ValueType directly is that for types with
  // constructors it could lead to race conditions.
  __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * step_width];
  ValueType* result = reinterpret_cast<ValueType*>(&sh_result);
  // Number of threadIdx.y rows covered by a single wavefront.
  int const step    = warp_size / blockDim.x;
  int shift         = step_width;
  // Skip the code below if threadIdx.y % step != 0 -- only one representative
  // row per warp participates; `id` is that warp's index within the block
  // (INT_MAX parks non-representative threads outside every range check).
  int const id = threadIdx.y % step == 0 ? threadIdx.y / step : INT_MAX;
  // Round 0: the first step_width warps store their partials directly.
  if (id < step_width) {
    result[id] = value;
  }
  __syncthreads();
  // Subsequent rounds: warps [shift, shift + step_width) join their partials
  // into the slot id % step_width, separated by block-wide barriers.
  while (shift <= max_active_thread / step) {
    if (shift <= id && shift + step_width > id && threadIdx.x == 0) {
      join(result[id % step_width], value);
    }
    __syncthreads();
    shift += step_width;
  }

  // Every thread folds the occupied staging slots into its own copy.
  value = result[0];
  for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
    join(value, result[i]);
}
124 
/* Reduce `value` over all threads of the block: first within each wavefront
 * via shuffles, then across wavefronts via shared memory.  Inherits the
 * constraints documented on hip_intra_warp_shuffle_reduction (blockDim.x a
 * power of two, blockDim.z == 1). */
template <typename ValueType, typename JoinOp,
          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
                                  int>::type = 0>
__device__ inline void hip_intra_block_shuffle_reduction(
    ValueType& value, JoinOp const& join,
    int const max_active_thread = blockDim.y) {
  // Order matters: the inter-warp pass consumes the per-warp partials the
  // intra-warp pass produces.
  hip_intra_warp_shuffle_reduction(value, join, max_active_thread);
  hip_inter_warp_shuffle_reduction(value, join, max_active_thread);
}
134 
/* Grid-wide reduction of `value` across all blocks.
 *
 * Each block reduces internally, then its thread 0 writes the block partial
 * into m_scratch_space (one slot per block).  An atomic counter in
 * m_scratch_flags identifies the last block to arrive; one warp of that block
 * loads all per-block partials and reduces them with shuffles.
 *
 * Returns true only in the last block; there, the thread with id == 0
 * (threadIdx.x == threadIdx.y == 0) holds the grid-wide result in `value`.
 * `neutral` is the identity value used to reset `value` before the final
 * pass.  The unnamed pointer_type parameter is unused (kept for interface
 * compatibility).
 */
template <class FunctorType, class JoinOp, class ArgTag = void>
__device__ inline bool hip_inter_block_shuffle_reduction(
    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value,
    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral,
    JoinOp const& join,
    Kokkos::Experimental::HIP::size_type* const m_scratch_space,
    typename FunctorValueTraits<FunctorType,
                                ArgTag>::pointer_type const /*result*/,
    Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
    int const max_active_thread = blockDim.y) {
  using pointer_type =
      typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type;
  using value_type =
      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;

  // Do the intra-block reduction with shfl operations for the intra warp
  // reduction and static shared memory for the inter warp reduction
  hip_intra_block_shuffle_reduction(value, join, max_active_thread);

  int const id = threadIdx.y * blockDim.x + threadIdx.x;

  // One thread in the block writes block result to global scratch_memory
  if (id == 0) {
    pointer_type global =
        reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
    *global = value;
  }

  // One warp of last block performs inter block reduction through loading the
  // block values from global scratch_memory
  bool last_block = false;
  // Fence makes the scratch write visible device-wide before the counter is
  // incremented below.
  __threadfence();
  __syncthreads();
  int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize;
  if (id < warp_size) {
    Kokkos::Experimental::HIP::size_type count;

    // Figure out whether this is the last block: only lane 0 increments the
    // counter; the shuffle then broadcasts lane 0's value to the whole warp
    // (count is deliberately uninitialized on other lanes before the shfl).
    if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
    count = Kokkos::Experimental::shfl(count, 0, warp_size);

    // Last block does the inter block reduction
    if (count == gridDim.x - 1) {
      // set flag back to zero
      if (id == 0) *m_scratch_flags = 0;
      last_block = true;
      value      = neutral;

      pointer_type const volatile global =
          reinterpret_cast<pointer_type>(m_scratch_space);

      // Reduce all global values with splitting work over threads in one warp
      const int step_size = blockDim.x * blockDim.y < warp_size
                                ? blockDim.x * blockDim.y
                                : warp_size;
      for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
        value_type tmp = global[i];
        join(value, tmp);
      }

      // Perform shfl reductions within the warp only join if contribution is
      // valid (allows gridDim.x non power of two and <warp_size)
      for (unsigned int i = 1; i < warp_size; i *= 2) {
        if ((blockDim.x * blockDim.y) > i) {
          value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
          if (id + i < gridDim.x) join(value, tmp);
        }
      }
    }
  }
  // The last block has in its thread=0 the global reduction value through
  // "value"
  return last_block;
}
209 
// We implement the same functions as above, but the user provides a Reducer
// instead of a JoinOp.
212 template <typename ReducerType,
213           typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
214                                   int>::type = 0>
hip_intra_warp_shuffle_reduction(const ReducerType & reducer,typename ReducerType::value_type & result,const uint32_t max_active_thread=blockDim.y)215 __device__ inline void hip_intra_warp_shuffle_reduction(
216     const ReducerType& reducer, typename ReducerType::value_type& result,
217     const uint32_t max_active_thread = blockDim.y) {
218   using ValueType = typename ReducerType::value_type;
219   auto join_op    = [&](ValueType& result, ValueType const& tmp) {
220     reducer.join(result, tmp);
221   };
222   hip_intra_warp_shuffle_reduction(result, join_op, max_active_thread);
223 
224   reducer.reference() = result;
225 }
226 
227 template <typename ReducerType,
228           typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
229                                   int>::type = 0>
hip_inter_warp_shuffle_reduction(ReducerType const & reducer,typename ReducerType::value_type value,int const max_active_thread=blockDim.y)230 __device__ inline void hip_inter_warp_shuffle_reduction(
231     ReducerType const& reducer, typename ReducerType::value_type value,
232     int const max_active_thread = blockDim.y) {
233   using ValueType = typename ReducerType::value_type;
234   auto join_op    = [&](ValueType& a, ValueType& b) { reducer.join(a, b); };
235   hip_inter_warp_shuffle_reduction(value, join_op, max_active_thread);
236 
237   reducer.reference() = value;
238 }
239 
/* Reducer overload: reduce `value` over all threads of the block (intra-warp
 * shuffles first, then the shared-memory inter-warp step).  Both called
 * overloads store their result into reducer.reference(). */
template <typename ReducerType,
          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
                                  int>::type = 0>
__device__ inline void hip_intra_block_shuffle_reduction(
    ReducerType const& reducer, typename ReducerType::value_type value,
    int const max_active_thread = blockDim.y) {
  // Order matters: the inter-warp pass consumes the per-warp partials the
  // intra-warp pass produces.
  hip_intra_warp_shuffle_reduction(reducer, value, max_active_thread);
  hip_inter_warp_shuffle_reduction(reducer, value, max_active_thread);
}
249 
/* Convenience overload: block-wide reduction of the value currently stored in
 * reducer.reference(). */
template <typename ReducerType,
          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
                                  int>::type = 0>
__device__ inline void hip_intra_block_shuffle_reduction(
    ReducerType const& reducer, int const max_active_thread = blockDim.y) {
  hip_intra_block_shuffle_reduction(reducer, reducer.reference(),
                                    max_active_thread);
}
258 
259 template <typename ReducerType,
260           typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
261                                   int>::type = 0>
hip_inter_block_shuffle_reduction(ReducerType const & reducer,Kokkos::Experimental::HIP::size_type * const m_scratch_space,Kokkos::Experimental::HIP::size_type * const m_scratch_flags,int const max_active_thread=blockDim.y)262 __device__ inline bool hip_inter_block_shuffle_reduction(
263     ReducerType const& reducer,
264     Kokkos::Experimental::HIP::size_type* const m_scratch_space,
265     Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
266     int const max_active_thread = blockDim.y) {
267   using pointer_type = typename ReducerType::value_type*;
268   using value_type   = typename ReducerType::value_type;
269 
270   // Do the intra-block reduction with shfl operations for the intra warp
271   // reduction and static shared memory for the inter warp reduction
272   hip_intra_block_shuffle_reduction(reducer, max_active_thread);
273 
274   value_type value = reducer.reference();
275 
276   int const id = threadIdx.y * blockDim.x + threadIdx.x;
277 
278   // One thread in the block writes block result to global scratch_memory
279   if (id == 0) {
280     pointer_type global =
281         reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
282     *global = value;
283   }
284 
285   // One warp of last block performs inter block reduction through loading the
286   // block values from global scratch_memory
287   bool last_block = false;
288 
289   __threadfence();
290   __syncthreads();
291   int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize;
292   if (id < warp_size) {
293     Kokkos::Experimental::HIP::size_type count;
294 
295     // Figure out whether this is the last block
296     if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
297     count = Kokkos::Experimental::shfl(count, 0, warp_size);
298 
299     // Last block does the inter block reduction
300     if (count == gridDim.x - 1) {
301       // Set flag back to zero
302       if (id == 0) *m_scratch_flags = 0;
303       last_block = true;
304       reducer.init(value);
305 
306       pointer_type const volatile global =
307           reinterpret_cast<pointer_type>(m_scratch_space);
308 
309       // Reduce all global values with splitting work over threads in one warp
310       int const step_size = blockDim.x * blockDim.y < warp_size
311                                 ? blockDim.x * blockDim.y
312                                 : warp_size;
313       for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
314         value_type tmp = global[i];
315         reducer.join(value, tmp);
316       }
317 
318       // Perform shfl reductions within the warp only join if contribution is
319       // valid (allows gridDim.x non power of two and <warp_size)
320       for (unsigned int i = 1; i < warp_size; i *= 2) {
321         if ((blockDim.x * blockDim.y) > i) {
322           value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
323           if (id + i < gridDim.x) reducer.join(value, tmp);
324         }
325         __syncthreads();
326       }
327     }
328   }
329 
330   // The last block has in its thread = 0 the global reduction value through
331   // "value"
332   return last_block;
333 }
334 }  // namespace Impl
335 }  // namespace Kokkos
336 
337 #endif
338 
339 #endif
340