1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_HIP_SHUFFLE_REDUCE_HPP
46 #define KOKKOS_HIP_SHUFFLE_REDUCE_HPP
47
48 #include <Kokkos_Macros.hpp>
49
50 #if defined(__HIPCC__)
51
52 #include <HIP/Kokkos_HIP_Vectorization.hpp>
53
54 #include <climits>
55
56 namespace Kokkos {
57 namespace Impl {
58
59 /* Algorithmic constraints:
60 * (a) threads with the same threadIdx.x have same value
61 * (b) blockDim.x == power of two
62 * (x) blockDim.z == 1
63 */
64 template <typename ValueType, typename JoinOp,
65 typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
66 int>::type = 0>
hip_intra_warp_shuffle_reduction(ValueType & result,JoinOp const & join,uint32_t const max_active_thread=blockDim.y)67 __device__ inline void hip_intra_warp_shuffle_reduction(
68 ValueType& result, JoinOp const& join,
69 uint32_t const max_active_thread = blockDim.y) {
70 unsigned int shift = 1;
71
72 // Reduce over values from threads with different threadIdx.y
73 unsigned int constexpr warp_size =
74 Kokkos::Experimental::Impl::HIPTraits::WarpSize;
75 while (blockDim.x * shift < warp_size) {
76 ValueType const tmp =
77 Kokkos::Experimental::shfl_down(result, blockDim.x * shift, warp_size);
78 // Only join if upper thread is active (this allows non power of two for
79 // blockDim.y)
80 if (threadIdx.y + shift < max_active_thread) {
81 join(result, tmp);
82 }
83 shift *= 2;
84 }
85
86 // Broadcast the result to all the threads in the warp
87 result = Kokkos::Experimental::shfl(result, 0, warp_size);
88 }
89
90 template <typename ValueType, typename JoinOp,
91 typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
92 int>::type = 0>
hip_inter_warp_shuffle_reduction(ValueType & value,const JoinOp & join,const int max_active_thread=blockDim.y)93 __device__ inline void hip_inter_warp_shuffle_reduction(
94 ValueType& value, const JoinOp& join,
95 const int max_active_thread = blockDim.y) {
96 unsigned int constexpr warp_size =
97 Kokkos::Experimental::Impl::HIPTraits::WarpSize;
98 int constexpr step_width = 8;
99 // Depending on the ValueType __shared__ memory must be aligned up to 8 byte
100 // boundaries. The reason not to use ValueType directly is that for types with
101 // constructors it could lead to race conditions.
102 __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * step_width];
103 ValueType* result = reinterpret_cast<ValueType*>(&sh_result);
104 int const step = warp_size / blockDim.x;
105 int shift = step_width;
106 // Skip the code below if threadIdx.y % step != 0
107 int const id = threadIdx.y % step == 0 ? threadIdx.y / step : INT_MAX;
108 if (id < step_width) {
109 result[id] = value;
110 }
111 __syncthreads();
112 while (shift <= max_active_thread / step) {
113 if (shift <= id && shift + step_width > id && threadIdx.x == 0) {
114 join(result[id % step_width], value);
115 }
116 __syncthreads();
117 shift += step_width;
118 }
119
120 value = result[0];
121 for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
122 join(value, result[i]);
123 }
124
125 template <typename ValueType, typename JoinOp,
126 typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
127 int>::type = 0>
hip_intra_block_shuffle_reduction(ValueType & value,JoinOp const & join,int const max_active_thread=blockDim.y)128 __device__ inline void hip_intra_block_shuffle_reduction(
129 ValueType& value, JoinOp const& join,
130 int const max_active_thread = blockDim.y) {
131 hip_intra_warp_shuffle_reduction(value, join, max_active_thread);
132 hip_inter_warp_shuffle_reduction(value, join, max_active_thread);
133 }
134
135 template <class FunctorType, class JoinOp, class ArgTag = void>
hip_inter_block_shuffle_reduction(typename FunctorValueTraits<FunctorType,ArgTag>::reference_type value,typename FunctorValueTraits<FunctorType,ArgTag>::reference_type neutral,JoinOp const & join,Kokkos::Experimental::HIP::size_type * const m_scratch_space,typename FunctorValueTraits<FunctorType,ArgTag>::pointer_type const,Kokkos::Experimental::HIP::size_type * const m_scratch_flags,int const max_active_thread=blockDim.y)136 __device__ inline bool hip_inter_block_shuffle_reduction(
137 typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value,
138 typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral,
139 JoinOp const& join,
140 Kokkos::Experimental::HIP::size_type* const m_scratch_space,
141 typename FunctorValueTraits<FunctorType,
142 ArgTag>::pointer_type const /*result*/,
143 Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
144 int const max_active_thread = blockDim.y) {
145 using pointer_type =
146 typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type;
147 using value_type =
148 typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
149
150 // Do the intra-block reduction with shfl operations for the intra warp
151 // reduction and static shared memory for the inter warp reduction
152 hip_intra_block_shuffle_reduction(value, join, max_active_thread);
153
154 int const id = threadIdx.y * blockDim.x + threadIdx.x;
155
156 // One thread in the block writes block result to global scratch_memory
157 if (id == 0) {
158 pointer_type global =
159 reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
160 *global = value;
161 }
162
163 // One warp of last block performs inter block reduction through loading the
164 // block values from global scratch_memory
165 bool last_block = false;
166 __threadfence();
167 __syncthreads();
168 int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize;
169 if (id < warp_size) {
170 Kokkos::Experimental::HIP::size_type count;
171
172 // Figure out whether this is the last block
173 if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
174 count = Kokkos::Experimental::shfl(count, 0, warp_size);
175
176 // Last block does the inter block reduction
177 if (count == gridDim.x - 1) {
178 // set flag back to zero
179 if (id == 0) *m_scratch_flags = 0;
180 last_block = true;
181 value = neutral;
182
183 pointer_type const volatile global =
184 reinterpret_cast<pointer_type>(m_scratch_space);
185
186 // Reduce all global values with splitting work over threads in one warp
187 const int step_size = blockDim.x * blockDim.y < warp_size
188 ? blockDim.x * blockDim.y
189 : warp_size;
190 for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
191 value_type tmp = global[i];
192 join(value, tmp);
193 }
194
195 // Perform shfl reductions within the warp only join if contribution is
196 // valid (allows gridDim.x non power of two and <warp_size)
197 for (unsigned int i = 1; i < warp_size; i *= 2) {
198 if ((blockDim.x * blockDim.y) > i) {
199 value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
200 if (id + i < gridDim.x) join(value, tmp);
201 }
202 }
203 }
204 }
205 // The last block has in its thread=0 the global reduction value through
206 // "value"
207 return last_block;
208 }
209
210 // We implemente the same functions as above but the user provide a Reducer
211 // instead of JoinOP
212 template <typename ReducerType,
213 typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
214 int>::type = 0>
hip_intra_warp_shuffle_reduction(const ReducerType & reducer,typename ReducerType::value_type & result,const uint32_t max_active_thread=blockDim.y)215 __device__ inline void hip_intra_warp_shuffle_reduction(
216 const ReducerType& reducer, typename ReducerType::value_type& result,
217 const uint32_t max_active_thread = blockDim.y) {
218 using ValueType = typename ReducerType::value_type;
219 auto join_op = [&](ValueType& result, ValueType const& tmp) {
220 reducer.join(result, tmp);
221 };
222 hip_intra_warp_shuffle_reduction(result, join_op, max_active_thread);
223
224 reducer.reference() = result;
225 }
226
227 template <typename ReducerType,
228 typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
229 int>::type = 0>
hip_inter_warp_shuffle_reduction(ReducerType const & reducer,typename ReducerType::value_type value,int const max_active_thread=blockDim.y)230 __device__ inline void hip_inter_warp_shuffle_reduction(
231 ReducerType const& reducer, typename ReducerType::value_type value,
232 int const max_active_thread = blockDim.y) {
233 using ValueType = typename ReducerType::value_type;
234 auto join_op = [&](ValueType& a, ValueType& b) { reducer.join(a, b); };
235 hip_inter_warp_shuffle_reduction(value, join_op, max_active_thread);
236
237 reducer.reference() = value;
238 }
239
240 template <typename ReducerType,
241 typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
242 int>::type = 0>
hip_intra_block_shuffle_reduction(ReducerType const & reducer,typename ReducerType::value_type value,int const max_active_thread=blockDim.y)243 __device__ inline void hip_intra_block_shuffle_reduction(
244 ReducerType const& reducer, typename ReducerType::value_type value,
245 int const max_active_thread = blockDim.y) {
246 hip_intra_warp_shuffle_reduction(reducer, value, max_active_thread);
247 hip_inter_warp_shuffle_reduction(reducer, value, max_active_thread);
248 }
249
250 template <typename ReducerType,
251 typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
252 int>::type = 0>
hip_intra_block_shuffle_reduction(ReducerType const & reducer,int const max_active_thread=blockDim.y)253 __device__ inline void hip_intra_block_shuffle_reduction(
254 ReducerType const& reducer, int const max_active_thread = blockDim.y) {
255 hip_intra_block_shuffle_reduction(reducer, reducer.reference(),
256 max_active_thread);
257 }
258
259 template <typename ReducerType,
260 typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
261 int>::type = 0>
hip_inter_block_shuffle_reduction(ReducerType const & reducer,Kokkos::Experimental::HIP::size_type * const m_scratch_space,Kokkos::Experimental::HIP::size_type * const m_scratch_flags,int const max_active_thread=blockDim.y)262 __device__ inline bool hip_inter_block_shuffle_reduction(
263 ReducerType const& reducer,
264 Kokkos::Experimental::HIP::size_type* const m_scratch_space,
265 Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
266 int const max_active_thread = blockDim.y) {
267 using pointer_type = typename ReducerType::value_type*;
268 using value_type = typename ReducerType::value_type;
269
270 // Do the intra-block reduction with shfl operations for the intra warp
271 // reduction and static shared memory for the inter warp reduction
272 hip_intra_block_shuffle_reduction(reducer, max_active_thread);
273
274 value_type value = reducer.reference();
275
276 int const id = threadIdx.y * blockDim.x + threadIdx.x;
277
278 // One thread in the block writes block result to global scratch_memory
279 if (id == 0) {
280 pointer_type global =
281 reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
282 *global = value;
283 }
284
285 // One warp of last block performs inter block reduction through loading the
286 // block values from global scratch_memory
287 bool last_block = false;
288
289 __threadfence();
290 __syncthreads();
291 int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize;
292 if (id < warp_size) {
293 Kokkos::Experimental::HIP::size_type count;
294
295 // Figure out whether this is the last block
296 if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
297 count = Kokkos::Experimental::shfl(count, 0, warp_size);
298
299 // Last block does the inter block reduction
300 if (count == gridDim.x - 1) {
301 // Set flag back to zero
302 if (id == 0) *m_scratch_flags = 0;
303 last_block = true;
304 reducer.init(value);
305
306 pointer_type const volatile global =
307 reinterpret_cast<pointer_type>(m_scratch_space);
308
309 // Reduce all global values with splitting work over threads in one warp
310 int const step_size = blockDim.x * blockDim.y < warp_size
311 ? blockDim.x * blockDim.y
312 : warp_size;
313 for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
314 value_type tmp = global[i];
315 reducer.join(value, tmp);
316 }
317
318 // Perform shfl reductions within the warp only join if contribution is
319 // valid (allows gridDim.x non power of two and <warp_size)
320 for (unsigned int i = 1; i < warp_size; i *= 2) {
321 if ((blockDim.x * blockDim.y) > i) {
322 value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
323 if (id + i < gridDim.x) reducer.join(value, tmp);
324 }
325 __syncthreads();
326 }
327 }
328 }
329
330 // The last block has in its thread = 0 the global reduction value through
331 // "value"
332 return last_block;
333 }
334 } // namespace Impl
335 } // namespace Kokkos
336
337 #endif
338
339 #endif
340