1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
46 #define KOKKOS_IMPL_HOSTTHREADTEAM_HPP
47
48 #include <Kokkos_Core_fwd.hpp>
49 #include <Kokkos_Pair.hpp>
50 #include <Kokkos_Atomic.hpp>
51 #include <Kokkos_ExecPolicy.hpp>
52 #include <impl/Kokkos_FunctorAdapter.hpp>
53 #include <impl/Kokkos_FunctorAnalysis.hpp>
54 #include <impl/Kokkos_HostBarrier.hpp>
55
56 #include <limits> // std::numeric_limits
57 #include <algorithm> // std::max
58
59 //----------------------------------------------------------------------------
60 //----------------------------------------------------------------------------
61
62 namespace Kokkos {
63 namespace Impl {
64
65 template <class HostExecSpace>
66 class HostThreadTeamMember;
67
68 class HostThreadTeamData {
69 public:
70 template <class>
71 friend class HostThreadTeamMember;
72
73 // Assume upper bounds on number of threads:
74 // pool size <= 1024 threads
75 // team size <= 64 threads
76
77 enum : int { max_pool_members = 1024 };
78 enum : int { max_team_members = 64 };
79 enum : int { max_pool_rendezvous = HostBarrier::required_buffer_size };
80 enum : int { max_team_rendezvous = HostBarrier::required_buffer_size };
81
82 private:
83 // per-thread scratch memory buffer chunks:
84 //
85 // [ pool_members ] = [ m_pool_members .. m_pool_rendezvous )
86 // [ pool_rendezvous ] = [ m_pool_rendezvous .. m_team_rendezvous )
87 // [ team_rendezvous ] = [ m_team_rendezvous .. m_pool_reduce )
88 // [ pool_reduce ] = [ m_pool_reduce .. m_team_reduce )
89 // [ team_reduce ] = [ m_team_reduce .. m_team_shared )
90 // [ team_shared ] = [ m_team_shared .. m_thread_local )
91 // [ thread_local ] = [ m_thread_local .. m_scratch_size )
92
93 enum : int { m_pool_members = 0 };
94 enum : int { m_pool_rendezvous = m_pool_members + max_pool_members };
95 enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
96 enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous };
97
98 using pair_int_t = Kokkos::pair<int64_t, int64_t>;
99
100 pair_int_t m_work_range;
101 int64_t m_work_end;
102 int64_t* m_scratch; // per-thread buffer
103 int64_t* m_pool_scratch; // == pool[0]->m_scratch
104 int64_t* m_team_scratch; // == pool[ 0 + m_team_base ]->m_scratch
105 int m_pool_rank;
106 int m_pool_size;
107 int m_team_reduce;
108 int m_team_shared;
109 int m_thread_local;
110 int m_scratch_size;
111 int m_team_base;
112 int m_team_rank;
113 int m_team_size;
114 int m_team_alloc;
115 int m_league_rank;
116 int m_league_size;
117 int m_work_chunk;
118 int m_steal_rank; // work stealing rank
119 int mutable m_pool_rendezvous_step;
120 int mutable m_team_rendezvous_step;
121
team_member(int r) const122 HostThreadTeamData* team_member(int r) const noexcept {
123 return ((HostThreadTeamData**)(m_pool_scratch +
124 m_pool_members))[m_team_base + r];
125 }
126
127 public:
team_rendezvous() const128 inline bool team_rendezvous() const noexcept {
129 int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
130 HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
131 if (m_team_rank != 0) {
132 HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
133 } else {
134 HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step);
135 }
136
137 return m_team_rank == 0;
138 }
139
team_rendezvous(const int source_team_rank) const140 inline bool team_rendezvous(const int source_team_rank) const noexcept {
141 int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
142 HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
143 if (m_team_rank != source_team_rank) {
144 HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
145 } else {
146 HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step);
147 }
148
149 return (m_team_rank == source_team_rank);
150 }
151
team_rendezvous_release() const152 inline void team_rendezvous_release() const noexcept {
153 HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous),
154 m_team_size, m_team_rendezvous_step);
155 }
156
pool_rendezvous() const157 inline int pool_rendezvous() const noexcept {
158 int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous);
159 HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step);
160 if (m_pool_rank != 0) {
161 HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step);
162 } else {
163 HostBarrier::split_master_wait(ptr, m_pool_size, m_pool_rendezvous_step);
164 }
165
166 return m_pool_rank == 0;
167 }
168
pool_rendezvous_release() const169 inline void pool_rendezvous_release() const noexcept {
170 HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous),
171 m_pool_size, m_pool_rendezvous_step);
172 }
173
174 //----------------------------------------
175
HostThreadTeamData()176 constexpr HostThreadTeamData() noexcept
177 : m_work_range(-1, -1),
178 m_work_end(0),
179 m_scratch(nullptr),
180 m_pool_scratch(nullptr),
181 m_team_scratch(nullptr),
182 m_pool_rank(0),
183 m_pool_size(1),
184 m_team_reduce(0),
185 m_team_shared(0),
186 m_thread_local(0),
187 m_scratch_size(0),
188 m_team_base(0),
189 m_team_rank(0),
190 m_team_size(1),
191 m_team_alloc(1),
192 m_league_rank(0),
193 m_league_size(1),
194 m_work_chunk(0),
195 m_steal_rank(0),
196 m_pool_rendezvous_step(0),
197 m_team_rendezvous_step(0) {}
198
199 //----------------------------------------
200 // Organize array of members into a pool.
201 // The 0th member is the root of the pool.
202 // Requires: members are not already in a pool.
203 // Requires: called by one thread.
204 // Pool members are ordered as "close" - sorted by NUMA and then CORE
205 // Each thread is its own team with team_size == 1.
206 static void organize_pool(HostThreadTeamData* members[], const int size);
207
208 // Called by each thread within the pool
209 void disband_pool();
210
211 //----------------------------------------
212 // Each thread within a pool organizes itself into a team.
213 // Must be called by all threads of the pool.
214 // Organizing threads into a team performs a barrier across the
215 // entire pool to insure proper initialization of the team
216 // rendezvous mechanism before a team rendezvous can be performed.
217 //
218 // Return true if a valid member of a team.
219 // Return false if not a member and thread should be idled.
220 int organize_team(const int team_size);
221
222 // Each thread within a pool disbands itself from current team.
223 // Each thread becomes its own team with team_size == 1.
224 // Must be called by all threads of the pool.
225 void disband_team();
226
227 //----------------------------------------
228
pool_rank() const229 constexpr int pool_rank() const { return m_pool_rank; }
pool_size() const230 constexpr int pool_size() const { return m_pool_size; }
231
pool_member(int r) const232 HostThreadTeamData* pool_member(int r) const noexcept {
233 return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r];
234 }
235
236 //----------------------------------------
237
238 private:
239 enum : int { mask_to_16 = 0x0f }; // align to 16 bytes
240 enum : int { shift_to_8 = 3 }; // size to 8 bytes
241
242 public:
align_to_int64(int n)243 static constexpr int align_to_int64(int n) {
244 return ((n + mask_to_16) & ~mask_to_16) >> shift_to_8;
245 }
246
pool_reduce_bytes() const247 constexpr int pool_reduce_bytes() const {
248 return m_scratch_size ? sizeof(int64_t) * (m_team_reduce - m_pool_reduce)
249 : 0;
250 }
251
team_reduce_bytes() const252 constexpr int team_reduce_bytes() const {
253 return sizeof(int64_t) * (m_team_shared - m_team_reduce);
254 }
255
team_shared_bytes() const256 constexpr int team_shared_bytes() const {
257 return sizeof(int64_t) * (m_thread_local - m_team_shared);
258 }
259
thread_local_bytes() const260 constexpr int thread_local_bytes() const {
261 return sizeof(int64_t) * (m_scratch_size - m_thread_local);
262 }
263
scratch_bytes() const264 constexpr int scratch_bytes() const {
265 return sizeof(int64_t) * m_scratch_size;
266 }
267
268 // Memory chunks:
269
scratch_buffer() const270 int64_t* scratch_buffer() const noexcept { return m_scratch; }
271
pool_reduce() const272 int64_t* pool_reduce() const noexcept {
273 return m_pool_scratch + m_pool_reduce;
274 }
275
pool_reduce_local() const276 int64_t* pool_reduce_local() const noexcept {
277 return m_scratch + m_pool_reduce;
278 }
279
team_reduce() const280 int64_t* team_reduce() const noexcept {
281 return m_team_scratch + m_team_reduce;
282 }
283
team_reduce_local() const284 int64_t* team_reduce_local() const noexcept {
285 return m_scratch + m_team_reduce;
286 }
287
team_shared() const288 int64_t* team_shared() const noexcept {
289 return m_team_scratch + m_team_shared;
290 }
291
local_scratch() const292 int64_t* local_scratch() const noexcept { return m_scratch + m_thread_local; }
293
294 // Given:
295 // pool_reduce_size = number bytes for pool reduce
296 // team_reduce_size = number bytes for team reduce
297 // team_shared_size = number bytes for team shared memory
298 // thread_local_size = number bytes for thread local memory
299 // Return:
300 // total number of bytes that must be allocated
scratch_size(int pool_reduce_size,int team_reduce_size,int team_shared_size,int thread_local_size)301 static size_t scratch_size(int pool_reduce_size, int team_reduce_size,
302 int team_shared_size, int thread_local_size) {
303 pool_reduce_size = align_to_int64(pool_reduce_size);
304 team_reduce_size = align_to_int64(team_reduce_size);
305 team_shared_size = align_to_int64(team_shared_size);
306 thread_local_size = align_to_int64(thread_local_size);
307
308 const size_t total_bytes =
309 (m_pool_reduce + pool_reduce_size + team_reduce_size +
310 team_shared_size + thread_local_size) *
311 sizeof(int64_t);
312
313 return total_bytes;
314 }
315
316 // Given:
317 // alloc_ptr = pointer to allocated memory
318 // alloc_size = number bytes of allocated memory
319 // pool_reduce_size = number bytes for pool reduce/scan operations
320 // team_reduce_size = number bytes for team reduce/scan operations
321 // team_shared_size = number bytes for team-shared memory
322 // thread_local_size = number bytes for thread-local memory
323 // Return:
324 // total number of bytes that must be allocated
scratch_assign(void * const alloc_ptr,size_t const alloc_size,int pool_reduce_size,int team_reduce_size,int team_shared_size,int)325 void scratch_assign(void* const alloc_ptr, size_t const alloc_size,
326 int pool_reduce_size, int team_reduce_size,
327 int team_shared_size, int /* thread_local_size */) {
328 pool_reduce_size = align_to_int64(pool_reduce_size);
329 team_reduce_size = align_to_int64(team_reduce_size);
330 team_shared_size = align_to_int64(team_shared_size);
331 // thread_local_size = align_to_int64( thread_local_size );
332
333 m_scratch = (int64_t*)alloc_ptr;
334 m_team_reduce = m_pool_reduce + pool_reduce_size;
335 m_team_shared = m_team_reduce + team_reduce_size;
336 m_thread_local = m_team_shared + team_shared_size;
337 m_scratch_size = align_to_int64(alloc_size);
338
339 #if 0
340 fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
341 , int(m_pool_members)
342 , int(m_pool_rendezvous)
343 , int(m_pool_reduce)
344 , int(m_team_reduce)
345 , int(m_team_shared)
346 , int(m_thread_local)
347 , int(m_scratch_size)
348 );
349 fflush(stdout);
350 #endif
351 }
352
353 //----------------------------------------
354 // Get a work index within the range.
355 // First try to steal from beginning of own teams's partition.
356 // If that fails then try to steal from end of another teams' partition.
357 int get_work_stealing() noexcept;
358
359 //----------------------------------------
360 // Set the initial work partitioning of [ 0 .. length ) among the teams
361 // with granularity of chunk
362
set_work_partition(int64_t const length,int const chunk)363 void set_work_partition(int64_t const length, int const chunk) noexcept {
364 // Minimum chunk size to insure that
365 // m_work_end < std::numeric_limits<int>::max() * m_work_chunk
366
367 int const chunk_min = (length + std::numeric_limits<int>::max()) /
368 std::numeric_limits<int>::max();
369
370 m_work_end = length;
371 m_work_chunk = std::max(chunk, chunk_min);
372
373 // Number of work chunks and partitioning of that number:
374 int const num = (m_work_end + m_work_chunk - 1) / m_work_chunk;
375 int const part = (num + m_league_size - 1) / m_league_size;
376
377 m_work_range.first = part * m_league_rank;
378 m_work_range.second = m_work_range.first + part;
379
380 // Steal from next team, round robin
381 // The next team is offset by m_team_alloc if it fits in the pool.
382
383 m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size
384 ? m_team_base + m_team_alloc
385 : 0;
386 }
387
get_work_partition()388 std::pair<int64_t, int64_t> get_work_partition() noexcept {
389 int64_t first = m_work_range.first;
390 int64_t second = m_work_range.second;
391 first *= m_work_chunk;
392 second *= m_work_chunk;
393 return std::pair<int64_t, int64_t>(
394 first, second < m_work_end ? second : m_work_end);
395 }
396
get_work_stealing_chunk()397 std::pair<int64_t, int64_t> get_work_stealing_chunk() noexcept {
398 std::pair<int64_t, int64_t> x(-1, -1);
399
400 const int i = get_work_stealing();
401
402 if (0 <= i) {
403 x.first = m_work_chunk * i;
404 x.second = x.first + m_work_chunk < m_work_end ? x.first + m_work_chunk
405 : m_work_end;
406 }
407
408 return x;
409 }
410 };
411
412 //----------------------------------------------------------------------------
413
414 template <class HostExecSpace>
415 class HostThreadTeamMember {
416 public:
417 using scratch_memory_space = typename HostExecSpace::scratch_memory_space;
418 using execution_space = HostExecSpace;
419 using thread_team_member = HostThreadTeamMember;
420 using host_thread_team_member = HostThreadTeamMember;
421
422 private:
423 scratch_memory_space m_scratch;
424 HostThreadTeamData& m_data;
425 int const m_league_rank;
426 int const m_league_size;
427
428 public:
HostThreadTeamMember(HostThreadTeamData & arg_data)429 constexpr HostThreadTeamMember(HostThreadTeamData& arg_data) noexcept
430 : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes()),
431 m_data(arg_data),
432 m_league_rank(arg_data.m_league_rank),
433 m_league_size(arg_data.m_league_size) {}
434
HostThreadTeamMember(HostThreadTeamData & arg_data,int const arg_league_rank,int const arg_league_size)435 constexpr HostThreadTeamMember(HostThreadTeamData& arg_data,
436 int const arg_league_rank,
437 int const arg_league_size) noexcept
438 : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes(),
439 arg_data.team_shared(), arg_data.team_shared_bytes()),
440 m_data(arg_data),
441 m_league_rank(arg_league_rank),
442 m_league_size(arg_league_size) {}
443
444 ~HostThreadTeamMember() = default;
445 HostThreadTeamMember() = delete;
446 HostThreadTeamMember(HostThreadTeamMember&&) = default;
447 HostThreadTeamMember(HostThreadTeamMember const&) = default;
448 HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default;
449 HostThreadTeamMember& operator=(HostThreadTeamMember const&) = default;
450
451 //----------------------------------------
452
453 KOKKOS_INLINE_FUNCTION
team_rank() const454 int team_rank() const noexcept { return m_data.m_team_rank; }
455
456 KOKKOS_INLINE_FUNCTION
team_size() const457 int team_size() const noexcept { return m_data.m_team_size; }
458
459 KOKKOS_INLINE_FUNCTION
league_rank() const460 int league_rank() const noexcept { return m_league_rank; }
461
462 KOKKOS_INLINE_FUNCTION
league_size() const463 int league_size() const noexcept { return m_league_size; }
464
465 //----------------------------------------
466
467 KOKKOS_INLINE_FUNCTION
team_shmem() const468 const scratch_memory_space& team_shmem() const {
469 return m_scratch.set_team_thread_mode(0, 1, 0);
470 }
471
472 KOKKOS_INLINE_FUNCTION
team_scratch(int) const473 const scratch_memory_space& team_scratch(int) const {
474 return m_scratch.set_team_thread_mode(0, 1, 0);
475 }
476
477 KOKKOS_INLINE_FUNCTION
thread_scratch(int) const478 const scratch_memory_space& thread_scratch(int) const {
479 return m_scratch.set_team_thread_mode(0, m_data.m_team_size,
480 m_data.m_team_rank);
481 }
482
483 //--------------------------------------------------------------------------
484 // Team collectives
485 //--------------------------------------------------------------------------
486
team_barrier() const487 KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
488 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
489 {
490 if (m_data.team_rendezvous()) {
491 m_data.team_rendezvous_release();
492 };
493 }
494 #else
495 {
496 }
497 #endif
498
499 //--------------------------------------------------------------------------
500
501 template <typename T>
team_broadcast(T & value,const int source_team_rank) const502 KOKKOS_INLINE_FUNCTION void team_broadcast(T& value,
503 const int source_team_rank) const
504 noexcept
505 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
506 {
507 if (1 < m_data.m_team_size) {
508 T volatile* const shared_value = (T*)m_data.team_reduce();
509
510 // Don't overwrite shared memory until all threads arrive
511
512 if (m_data.team_rendezvous(source_team_rank)) {
513 // All threads have entered 'team_rendezvous'
514 // only this thread returned from 'team_rendezvous'
515 // with a return value of 'true'
516
517 *shared_value = value;
518
519 m_data.team_rendezvous_release();
520 // This thread released all other threads from 'team_rendezvous'
521 // with a return value of 'false'
522 } else {
523 value = *shared_value;
524 }
525 }
526 }
527 #else
528 {
529 (void)value;
530 (void)source_team_rank;
531 Kokkos::abort("HostThreadTeamMember team_broadcast\n");
532 }
533 #endif
534
535 //--------------------------------------------------------------------------
536
537 template <class Closure, typename T>
team_broadcast(Closure const & f,T & value,const int source_team_rank) const538 KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, T& value,
539 const int source_team_rank) const
540 noexcept
541 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
542 {
543 T volatile* const shared_value = (T*)m_data.team_reduce();
544
545 // Don't overwrite shared memory until all threads arrive
546
547 if (m_data.team_rendezvous(source_team_rank)) {
548 // All threads have entered 'team_rendezvous'
549 // only this thread returned from 'team_rendezvous'
550 // with a return value of 'true'
551
552 f(value);
553
554 if (1 < m_data.m_team_size) {
555 *shared_value = value;
556 }
557
558 m_data.team_rendezvous_release();
559 // This thread released all other threads from 'team_rendezvous'
560 // with a return value of 'false'
561 } else {
562 value = *shared_value;
563 }
564 }
565 #else
566 {
567 (void)f;
568 (void)value;
569 (void)source_team_rank;
570 Kokkos::abort("HostThreadTeamMember team_broadcast\n");
571 }
572 #endif
573
574 //--------------------------------------------------------------------------
575 // team_reduce( Sum(result) );
576 // team_reduce( Min(result) );
577 // team_reduce( Max(result) );
578
579 template <typename ReducerType>
580 KOKKOS_INLINE_FUNCTION
581 typename std::enable_if<is_reducer<ReducerType>::value>::type
team_reduce(ReducerType const & reducer) const582 team_reduce(ReducerType const& reducer) const noexcept {
583 team_reduce(reducer, reducer.reference());
584 }
585
586 template <typename ReducerType>
587 KOKKOS_INLINE_FUNCTION
588 typename std::enable_if<is_reducer<ReducerType>::value>::type
team_reduce(ReducerType const & reducer,typename ReducerType::value_type contribution) const589 team_reduce(ReducerType const& reducer,
590 typename ReducerType::value_type contribution) const noexcept
591 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
592 {
593 if (1 < m_data.m_team_size) {
594 using value_type = typename ReducerType::value_type;
595
596 if (0 != m_data.m_team_rank) {
597 // Non-root copies to their local buffer:
598 /*reducer.copy( (value_type*) m_data.team_reduce_local()
599 , reducer.data() );*/
600 *((value_type*)m_data.team_reduce_local()) = contribution;
601 }
602
603 // Root does not overwrite shared memory until all threads arrive
604 // and copy to their local buffer.
605
606 if (m_data.team_rendezvous()) {
607 // All threads have entered 'team_rendezvous'
608 // only this thread returned from 'team_rendezvous'
609 // with a return value of 'true'
610 //
611 // This thread sums contributed values
612 for (int i = 1; i < m_data.m_team_size; ++i) {
613 value_type* const src =
614 (value_type*)m_data.team_member(i)->team_reduce_local();
615
616 reducer.join(contribution, *src);
617 }
618
619 // Copy result to root member's buffer:
620 // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
621 *((value_type*)m_data.team_reduce()) = contribution;
622 reducer.reference() = contribution;
623 m_data.team_rendezvous_release();
624 // This thread released all other threads from 'team_rendezvous'
625 // with a return value of 'false'
626 } else {
627 // Copy from root member's buffer:
628 reducer.reference() = *((value_type*)m_data.team_reduce());
629 }
630 } else {
631 reducer.reference() = contribution;
632 }
633 }
634 #else
635 {
636 (void)reducer;
637 (void)contribution;
638 Kokkos::abort("HostThreadTeamMember team_reduce\n");
639 }
640 #endif
641
642 //--------------------------------------------------------------------------
643
644 /*template< typename ValueType , class JoinOp >
645 KOKKOS_INLINE_FUNCTION
646 ValueType
647 team_reduce( ValueType const & value
648 , JoinOp const & join ) const noexcept
649 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
650 {
651 if ( 0 != m_data.m_team_rank ) {
652 // Non-root copies to their local buffer:
653 *((ValueType*) m_data.team_reduce_local()) = value ;
654 }
655
656 // Root does not overwrite shared memory until all threads arrive
657 // and copy to their local buffer.
658
659 if ( m_data.team_rendezvous() ) {
660 const Impl::Reducer< ValueType , JoinOp > reducer( join );
661
662 // All threads have entered 'team_rendezvous'
663 // only this thread returned from 'team_rendezvous'
664 // with a return value of 'true'
665 //
666 // This thread sums contributed values
667
668 ValueType * const dst = (ValueType*) m_data.team_reduce_local();
669
670 *dst = value ;
671
672 for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
673 ValueType * const src =
674 (ValueType*) m_data.team_member(i)->team_reduce_local();
675
676 reducer.join( dst , src );
677 }
678
679 m_data.team_rendezvous_release();
680 // This thread released all other threads from 'team_rendezvous'
681 // with a return value of 'false'
682 }
683
684 return *((ValueType*) m_data.team_reduce());
685 }
686 #else
687 { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
688 #endif*/
689
690 template <typename T>
team_scan(T const & value,T * const global=nullptr) const691 KOKKOS_INLINE_FUNCTION T team_scan(T const& value,
692 T* const global = nullptr) const noexcept
693 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
694 {
695 if (0 != m_data.m_team_rank) {
696 // Non-root copies to their local buffer:
697 ((T*)m_data.team_reduce_local())[1] = value;
698 }
699
700 // Root does not overwrite shared memory until all threads arrive
701 // and copy to their local buffer.
702
703 if (m_data.team_rendezvous()) {
704 // All threads have entered 'team_rendezvous'
705 // only this thread returned from 'team_rendezvous'
706 // with a return value of 'true'
707 //
708 // This thread scans contributed values
709
710 {
711 T* prev = (T*)m_data.team_reduce_local();
712
713 prev[0] = 0;
714 prev[1] = value;
715
716 for (int i = 1; i < m_data.m_team_size; ++i) {
717 T* const ptr = (T*)m_data.team_member(i)->team_reduce_local();
718
719 ptr[0] = prev[0] + prev[1];
720
721 prev = ptr;
722 }
723 }
724
725 // If adding to global value then atomic_fetch_add to that value
726 // and sum previous value to every entry of the scan.
727 if (global) {
728 T* prev = (T*)m_data.team_reduce_local();
729
730 {
731 T* ptr = (T*)m_data.team_member(m_data.m_team_size - 1)
732 ->team_reduce_local();
733 prev[0] = Kokkos::atomic_fetch_add(global, ptr[0] + ptr[1]);
734 }
735
736 for (int i = 1; i < m_data.m_team_size; ++i) {
737 T* ptr = (T*)m_data.team_member(i)->team_reduce_local();
738 ptr[0] += prev[0];
739 }
740 }
741
742 m_data.team_rendezvous_release();
743 }
744
745 return ((T*)m_data.team_reduce_local())[0];
746 }
747 #else
748 {
749 (void)value;
750 (void)global;
751 Kokkos::abort("HostThreadTeamMember team_scan\n");
752 return T();
753 }
754 #endif
755 };
756
757 } // namespace Impl
758 } // namespace Kokkos
759
760 //----------------------------------------------------------------------------
761 //----------------------------------------------------------------------------
762
763 namespace Kokkos {
764
765 template <typename iType, typename Member>
766 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
TeamThreadRange(Member const & member,iType count,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)767 TeamThreadRange(
768 Member const& member, iType count,
769 typename std::enable_if<
770 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
771 return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
772 }
773
774 template <typename iType1, typename iType2, typename Member>
775 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
776 typename std::common_type<iType1, iType2>::type, Member>
TeamThreadRange(Member const & member,iType1 begin,iType2 end,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)777 TeamThreadRange(
778 Member const& member, iType1 begin, iType2 end,
779 typename std::enable_if<
780 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
781 return Impl::TeamThreadRangeBoundariesStruct<
782 typename std::common_type<iType1, iType2>::type, Member>(member, begin,
783 end);
784 }
785
786 template <typename iType, typename Member>
787 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
TeamVectorRange(Member const & member,iType count,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)788 TeamVectorRange(
789 Member const& member, iType count,
790 typename std::enable_if<
791 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
792 return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
793 }
794
795 template <typename iType1, typename iType2, typename Member>
796 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
797 typename std::common_type<iType1, iType2>::type, Member>
TeamVectorRange(Member const & member,iType1 begin,iType2 end,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)798 TeamVectorRange(
799 Member const& member, iType1 begin, iType2 end,
800 typename std::enable_if<
801 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
802 return Impl::TeamThreadRangeBoundariesStruct<
803 typename std::common_type<iType1, iType2>::type, Member>(member, begin,
804 end);
805 }
806
807 template <typename iType, typename Member>
808 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType, Member>
ThreadVectorRange(Member const & member,iType count,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)809 ThreadVectorRange(
810 Member const& member, iType count,
811 typename std::enable_if<
812 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
813 return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(member, count);
814 }
815
816 template <typename iType1, typename iType2, typename Member>
817 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
818 typename std::common_type<iType1, iType2>::type, Member>
ThreadVectorRange(Member const & member,iType1 arg_begin,iType2 arg_end,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)819 ThreadVectorRange(
820 Member const& member, iType1 arg_begin, iType2 arg_end,
821 typename std::enable_if<
822 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
823 using iType = typename std::common_type<iType1, iType2>::type;
824 return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(
825 member, iType(arg_begin), iType(arg_end));
826 }
827
828 //----------------------------------------------------------------------------
829 /** \brief Inter-thread parallel_for.
830 *
831 * Executes lambda(iType i) for each i=[0..N)
832 *
833 * The range [0..N) is mapped to all threads of the the calling thread team.
834 */
835 template <typename iType, class Closure, class Member>
parallel_for(Impl::TeamThreadRangeBoundariesStruct<iType,Member> const & loop_boundaries,Closure const & closure,typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::type const ** =nullptr)836 KOKKOS_INLINE_FUNCTION void parallel_for(
837 Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
838 Closure const& closure,
839 typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
840 type const** = nullptr) {
841 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
842 i += loop_boundaries.increment) {
843 closure(i);
844 }
845 }
846
847 template <typename iType, class Closure, class Member>
parallel_for(Impl::ThreadVectorRangeBoundariesStruct<iType,Member> const & loop_boundaries,Closure const & closure,typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::type const ** =nullptr)848 KOKKOS_INLINE_FUNCTION void parallel_for(
849 Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
850 loop_boundaries,
851 Closure const& closure,
852 typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
853 type const** = nullptr) {
854 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
855 #pragma ivdep
856 #endif
857 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
858 i += loop_boundaries.increment) {
859 closure(i);
860 }
861 }
862
863 //----------------------------------------------------------------------------
864
865 template <typename iType, class Closure, class Reducer, class Member>
866 KOKKOS_INLINE_FUNCTION typename std::enable_if<
867 Kokkos::is_reducer<Reducer>::value &&
868 Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType,Member> const & loop_boundaries,Closure const & closure,Reducer const & reducer)869 parallel_reduce(
870 Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
871 Closure const& closure, Reducer const& reducer) {
872 typename Reducer::value_type value;
873 reducer.init(value);
874
875 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
876 i += loop_boundaries.increment) {
877 closure(i, value);
878 }
879
880 loop_boundaries.thread.team_reduce(reducer, value);
881 }
882
883 template <typename iType, typename Closure, typename ValueType, typename Member>
884 KOKKOS_INLINE_FUNCTION typename std::enable_if<
885 !Kokkos::is_reducer<ValueType>::value &&
886 Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType,Member> const & loop_boundaries,Closure const & closure,ValueType & result)887 parallel_reduce(
888 Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
889 Closure const& closure, ValueType& result) {
890 ValueType val;
891 Sum<ValueType> reducer(val);
892 reducer.init(val);
893
894 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
895 i += loop_boundaries.increment) {
896 closure(i, reducer.reference());
897 }
898
899 loop_boundaries.thread.team_reduce(reducer);
900 result = reducer.reference();
901 }
902
903 /*template< typename iType, class Space
904 , class Closure, class Joiner , typename ValueType >
905 KOKKOS_INLINE_FUNCTION
906 void parallel_reduce
907 (
908 Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
909 const & loop_boundaries
910 , Closure const & closure
911 , Joiner const & joiner
912 , ValueType & result
913 )
914 {
915 Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
916
917 reducer.init( reducer.data() );
918
919 for( iType i = loop_boundaries.start
920 ; i < loop_boundaries.end
921 ; i += loop_boundaries.increment ) {
922 closure( i , reducer.reference() );
923 }
924
925 loop_boundaries.thread.team_reduce( reducer );
926 }*/
927
928 //----------------------------------------------------------------------------
/** \brief Intra-thread vector parallel_reduce.
 *
 *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range [0..N) is mapped to the vector lanes of the
 *  calling thread and a summation of val is
 *  performed and put into result.
 */
937 template <typename iType, class Lambda, typename ValueType, typename Member>
938 KOKKOS_INLINE_FUNCTION typename std::enable_if<
939 !Kokkos::is_reducer<ValueType>::value &&
940 Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Member> & loop_boundaries,const Lambda & lambda,ValueType & result)941 parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
942 loop_boundaries,
943 const Lambda& lambda, ValueType& result) {
944 result = ValueType();
945 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
946 i += loop_boundaries.increment) {
947 lambda(i, result);
948 }
949 }
950
951 template <typename iType, class Lambda, typename ReducerType, typename Member>
952 KOKKOS_INLINE_FUNCTION typename std::enable_if<
953 Kokkos::is_reducer<ReducerType>::value &&
954 Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Member> & loop_boundaries,const Lambda & lambda,const ReducerType & reducer)955 parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
956 loop_boundaries,
957 const Lambda& lambda, const ReducerType& reducer) {
958 reducer.init(reducer.reference());
959 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
960 i += loop_boundaries.increment) {
961 lambda(i, reducer.reference());
962 }
963 }
964
965 //----------------------------------------------------------------------------
966
967 template <typename iType, class Closure, class Member>
968 KOKKOS_INLINE_FUNCTION typename std::enable_if<
969 Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType,Member> const & loop_boundaries,Closure const & closure)970 parallel_scan(
971 Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
972 Closure const& closure) {
973 // Extract ValueType from the closure
974
975 using value_type = typename Kokkos::Impl::FunctorAnalysis<
976 Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
977
978 value_type accum = 0;
979
980 // Intra-member scan
981 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
982 i += loop_boundaries.increment) {
983 closure(i, accum, false);
984 }
985
986 // 'accum' output is the exclusive prefix sum
987 accum = loop_boundaries.thread.team_scan(accum);
988
989 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
990 i += loop_boundaries.increment) {
991 closure(i, accum, true);
992 }
993 }
994
995 template <typename iType, class ClosureType, class Member>
996 KOKKOS_INLINE_FUNCTION typename std::enable_if<
997 Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType,Member> const & loop_boundaries,ClosureType const & closure)998 parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
999 loop_boundaries,
1000 ClosureType const& closure) {
1001 using value_type = typename Kokkos::Impl::FunctorAnalysis<
1002 Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type;
1003
1004 value_type scan_val = value_type();
1005
1006 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1007 #pragma ivdep
1008 #endif
1009 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1010 i += loop_boundaries.increment) {
1011 closure(i, scan_val, true);
1012 }
1013 }
1014
1015 template <typename iType, class Lambda, typename ReducerType, typename Member>
1016 KOKKOS_INLINE_FUNCTION typename std::enable_if<
1017 Kokkos::is_reducer<ReducerType>::value &&
1018 Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Member> & loop_boundaries,const Lambda & lambda,const ReducerType & reducer)1019 parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
1020 loop_boundaries,
1021 const Lambda& lambda, const ReducerType& reducer) {
1022 typename ReducerType::value_type scan_val;
1023 reducer.init(scan_val);
1024
1025 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1026 #pragma ivdep
1027 #endif
1028 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1029 i += loop_boundaries.increment) {
1030 lambda(i, scan_val, true);
1031 }
1032 }
1033
1034 //----------------------------------------------------------------------------
1035
1036 template <class Member>
PerTeam(Member const & member,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)1037 KOKKOS_INLINE_FUNCTION Impl::ThreadSingleStruct<Member> PerTeam(
1038 Member const& member,
1039 typename std::enable_if<
1040 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
1041 return Impl::ThreadSingleStruct<Member>(member);
1042 }
1043
1044 template <class Member>
PerThread(Member const & member,typename std::enable_if<Impl::is_thread_team_member<Member>::value>::type const ** =nullptr)1045 KOKKOS_INLINE_FUNCTION Impl::VectorSingleStruct<Member> PerThread(
1046 Member const& member,
1047 typename std::enable_if<
1048 Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
1049 return Impl::VectorSingleStruct<Member>(member);
1050 }
1051
1052 template <class Member, class FunctorType>
1053 KOKKOS_INLINE_FUNCTION typename std::enable_if<
1054 Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::ThreadSingleStruct<Member> & single,const FunctorType & functor)1055 single(const Impl::ThreadSingleStruct<Member>& single,
1056 const FunctorType& functor) {
1057 // 'single' does not perform a barrier.
1058 if (single.team_member.team_rank() == 0) functor();
1059 }
1060
1061 template <class Member, class FunctorType, typename ValueType>
1062 KOKKOS_INLINE_FUNCTION typename std::enable_if<
1063 Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::ThreadSingleStruct<Member> & single,const FunctorType & functor,ValueType & val)1064 single(const Impl::ThreadSingleStruct<Member>& single,
1065 const FunctorType& functor, ValueType& val) {
1066 single.team_member.team_broadcast(functor, val, 0);
1067 }
1068
1069 template <class Member, class FunctorType>
1070 KOKKOS_INLINE_FUNCTION typename std::enable_if<
1071 Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::VectorSingleStruct<Member> &,const FunctorType & functor)1072 single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor) {
1073 functor();
1074 }
1075
1076 template <class Member, class FunctorType, typename ValueType>
1077 KOKKOS_INLINE_FUNCTION typename std::enable_if<
1078 Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::VectorSingleStruct<Member> &,const FunctorType & functor,ValueType & val)1079 single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor,
1080 ValueType& val) {
1081 functor(val);
1082 }
1083
1084 } /* namespace Kokkos */
1085
1086 //----------------------------------------------------------------------------
1087 //----------------------------------------------------------------------------
1088
1089 #endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
1090