/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP

#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Pair.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_HostBarrier.hpp>

#include <limits>     // std::numeric_limits
#include <algorithm>  // std::max

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class HostExecSpace>
class HostThreadTeamMember;

class HostThreadTeamData {
 public:
  template <class>
  friend class HostThreadTeamMember;

  // Assume upper bounds on number of threads:
  //   pool size       <= 1024 threads
  //   team size       <= 64 threads

  enum : int { max_pool_members = 1024 };
  enum : int { max_team_members = 64 };
  enum : int { max_pool_rendezvous = HostBarrier::required_buffer_size };
  enum : int { max_team_rendezvous = HostBarrier::required_buffer_size };

 private:
  // per-thread scratch memory buffer chunks:
  //
  //   [ pool_members ]     = [ m_pool_members    .. m_pool_rendezvous )
  //   [ pool_rendezvous ]  = [ m_pool_rendezvous .. m_team_rendezvous )
  //   [ team_rendezvous ]  = [ m_team_rendezvous .. m_pool_reduce )
  //   [ pool_reduce ]      = [ m_pool_reduce     .. m_team_reduce )
  //   [ team_reduce ]      = [ m_team_reduce     .. m_team_shared )
  //   [ team_shared ]      = [ m_team_shared     .. m_thread_local )
  //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )

  enum : int { m_pool_members = 0 };
  enum : int { m_pool_rendezvous = m_pool_members + max_pool_members };
  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
  enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous };

  using pair_int_t = Kokkos::pair<int64_t, int64_t>;

  pair_int_t m_work_range;
  int64_t m_work_end;
  int64_t* m_scratch;       // per-thread buffer
  int64_t* m_pool_scratch;  // == pool[0]->m_scratch
  int64_t* m_team_scratch;  // == pool[ 0 + m_team_base ]->m_scratch
  int m_pool_rank;
  int m_pool_size;
  int m_team_reduce;
  int m_team_shared;
  int m_thread_local;
  int m_scratch_size;
  int m_team_base;
  int m_team_rank;
  int m_team_size;
  int m_team_alloc;
  int m_league_rank;
  int m_league_size;
  int m_work_chunk;
  int m_steal_rank;  // work stealing rank
  int mutable m_pool_rendezvous_step;
  int mutable m_team_rendezvous_step;
  HostThreadTeamData* team_member(int r) const noexcept {
    return ((HostThreadTeamData**)(m_pool_scratch +
                                   m_pool_members))[m_team_base + r];
  }

 public:
  inline bool team_rendezvous() const noexcept {
    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
    HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
    if (m_team_rank != 0) {
      HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
    } else {
      HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step);
    }

    return m_team_rank == 0;
  }

  inline bool team_rendezvous(const int source_team_rank) const noexcept {
    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
    HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
    if (m_team_rank != source_team_rank) {
      HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
    } else {
      HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step);
    }

    return (m_team_rank == source_team_rank);
  }

  inline void team_rendezvous_release() const noexcept {
    HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous),
                               m_team_size, m_team_rendezvous_step);
  }

  inline int pool_rendezvous() const noexcept {
    int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous);
    HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step);
    if (m_pool_rank != 0) {
      HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step);
    } else {
      HostBarrier::split_master_wait(ptr, m_pool_size, m_pool_rendezvous_step);
    }

    return m_pool_rank == 0;
  }

  inline void pool_rendezvous_release() const noexcept {
    HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous),
                               m_pool_size, m_pool_rendezvous_step);
  }
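
  // Usage pattern (illustrative sketch, not part of the interface): the split
  // rendezvous above brackets an update performed by exactly one thread,
  // which is the pattern team_barrier() and the collectives below follow:
  //
  //   if (team_rendezvous()) {        // all threads arrive; one returns true
  //     /* ... single-threaded update of team-shared state ... */
  //     team_rendezvous_release();    // release the waiting threads
  //   }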

  //----------------------------------------

  constexpr HostThreadTeamData() noexcept
      : m_work_range(-1, -1),
        m_work_end(0),
        m_scratch(nullptr),
        m_pool_scratch(nullptr),
        m_team_scratch(nullptr),
        m_pool_rank(0),
        m_pool_size(1),
        m_team_reduce(0),
        m_team_shared(0),
        m_thread_local(0),
        m_scratch_size(0),
        m_team_base(0),
        m_team_rank(0),
        m_team_size(1),
        m_team_alloc(1),
        m_league_rank(0),
        m_league_size(1),
        m_work_chunk(0),
        m_steal_rank(0),
        m_pool_rendezvous_step(0),
        m_team_rendezvous_step(0) {}

  //----------------------------------------
  // Organize array of members into a pool.
  // The 0th member is the root of the pool.
  // Requires: members are not already in a pool.
  // Requires: called by one thread.
  // Pool members are ordered as "close" - sorted by NUMA and then CORE
  // Each thread is its own team with team_size == 1.
  static void organize_pool(HostThreadTeamData* members[], const int size);

  // Called by each thread within the pool
  void disband_pool();

  //----------------------------------------
  // Each thread within a pool organizes itself into a team.
  // Must be called by all threads of the pool.
  // Organizing threads into a team performs a barrier across the
  // entire pool to ensure proper initialization of the team
  // rendezvous mechanism before a team rendezvous can be performed.
  //
  // Return true  if a valid member of a team.
  // Return false if not a member and the thread should be idled.
  int organize_team(const int team_size);

  // Each thread within a pool disbands itself from its current team.
  // Each thread becomes its own team with team_size == 1.
  // Must be called by all threads of the pool.
  void disband_team();
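
  // Lifecycle sketch (illustrative, assuming a pool of worker threads that
  // each own one HostThreadTeamData instance; 'members', 'N', 'rank', and
  // 'team_size' are placeholders):
  //
  //   HostThreadTeamData* members[N];           // one per worker thread
  //   HostThreadTeamData::organize_pool(members, N);  // serial, once
  //   /* then, on every thread of the pool: */
  //   members[rank]->organize_team(team_size);
  //   /* ... run team-parallel work ... */
  //   members[rank]->disband_team();
  //   members[rank]->disband_pool();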

  //----------------------------------------

  constexpr int pool_rank() const { return m_pool_rank; }
  constexpr int pool_size() const { return m_pool_size; }

  HostThreadTeamData* pool_member(int r) const noexcept {
    return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r];
  }

  //----------------------------------------

 private:
  enum : int { mask_to_16 = 0x0f };  // align to 16 bytes
  enum : int { shift_to_8 = 3 };     // size to 8 bytes

 public:
  static constexpr int align_to_int64(int n) {
    return ((n + mask_to_16) & ~mask_to_16) >> shift_to_8;
  }
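
  // Worked example (illustrative): align_to_int64 rounds a byte count up to
  // a multiple of 16 bytes and returns it as a count of int64_t words.
  // For n = 20: ((20 + 15) & ~15) = 32 bytes, and 32 >> 3 = 4 int64_t words.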

  constexpr int pool_reduce_bytes() const {
    return m_scratch_size ? sizeof(int64_t) * (m_team_reduce - m_pool_reduce)
                          : 0;
  }

  constexpr int team_reduce_bytes() const {
    return sizeof(int64_t) * (m_team_shared - m_team_reduce);
  }

  constexpr int team_shared_bytes() const {
    return sizeof(int64_t) * (m_thread_local - m_team_shared);
  }

  constexpr int thread_local_bytes() const {
    return sizeof(int64_t) * (m_scratch_size - m_thread_local);
  }

  constexpr int scratch_bytes() const {
    return sizeof(int64_t) * m_scratch_size;
  }

  // Memory chunks:

  int64_t* scratch_buffer() const noexcept { return m_scratch; }

  int64_t* pool_reduce() const noexcept {
    return m_pool_scratch + m_pool_reduce;
  }

  int64_t* pool_reduce_local() const noexcept {
    return m_scratch + m_pool_reduce;
  }

  int64_t* team_reduce() const noexcept {
    return m_team_scratch + m_team_reduce;
  }

  int64_t* team_reduce_local() const noexcept {
    return m_scratch + m_team_reduce;
  }

  int64_t* team_shared() const noexcept {
    return m_team_scratch + m_team_shared;
  }

  int64_t* local_scratch() const noexcept { return m_scratch + m_thread_local; }

  // Given:
  //   pool_reduce_size  = number of bytes for pool reduce
  //   team_reduce_size  = number of bytes for team reduce
  //   team_shared_size  = number of bytes for team shared memory
  //   thread_local_size = number of bytes for thread local memory
  // Return:
  //   total number of bytes that must be allocated
  static size_t scratch_size(int pool_reduce_size, int team_reduce_size,
                             int team_shared_size, int thread_local_size) {
    pool_reduce_size  = align_to_int64(pool_reduce_size);
    team_reduce_size  = align_to_int64(team_reduce_size);
    team_shared_size  = align_to_int64(team_shared_size);
    thread_local_size = align_to_int64(thread_local_size);

    const size_t total_bytes =
        (m_pool_reduce + pool_reduce_size + team_reduce_size +
         team_shared_size + thread_local_size) *
        sizeof(int64_t);

    return total_bytes;
  }
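
  // Usage sketch (illustrative): an execution space backend first sizes the
  // buffer, then hands the allocation to each thread's scratch_assign (below)
  // with the same size arguments so the chunk offsets agree across the pool.
  // Here 'pr', 'tr', 'ts', 'tl', 'ptr', and 'data' are placeholders:
  //
  //   size_t n = HostThreadTeamData::scratch_size(pr, tr, ts, tl);
  //   void* ptr = /* allocate n bytes */;
  //   data.scratch_assign(ptr, n, pr, tr, ts, tl);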

  // Given:
  //   alloc_ptr         = pointer to allocated memory
  //   alloc_size        = number of bytes of allocated memory
  //   pool_reduce_size  = number of bytes for pool reduce/scan operations
  //   team_reduce_size  = number of bytes for team reduce/scan operations
  //   team_shared_size  = number of bytes for team-shared memory
  //   thread_local_size = number of bytes for thread-local memory
  // Effect:
  //   assigns this thread's scratch buffer and the chunk offsets
  //   within the allocated memory
  void scratch_assign(void* const alloc_ptr, size_t const alloc_size,
                      int pool_reduce_size, int team_reduce_size,
                      int team_shared_size, int /* thread_local_size */) {
    pool_reduce_size = align_to_int64(pool_reduce_size);
    team_reduce_size = align_to_int64(team_reduce_size);
    team_shared_size = align_to_int64(team_shared_size);
    // thread_local_size = align_to_int64( thread_local_size );

    m_scratch      = (int64_t*)alloc_ptr;
    m_team_reduce  = m_pool_reduce + pool_reduce_size;
    m_team_shared  = m_team_reduce + team_reduce_size;
    m_thread_local = m_team_shared + team_shared_size;
    m_scratch_size = align_to_int64(alloc_size);

#if 0
fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
       , int(m_pool_members)
       , int(m_pool_rendezvous)
       , int(m_pool_reduce)
       , int(m_team_reduce)
       , int(m_team_shared)
       , int(m_thread_local)
       , int(m_scratch_size)
       );
fflush(stdout);
#endif
  }

  //----------------------------------------
  // Get a work index within the range.
  // First try to steal from the beginning of our own team's partition.
  // If that fails then try to steal from the end of another team's partition.
  int get_work_stealing() noexcept;

  //----------------------------------------
  // Set the initial work partitioning of [ 0 .. length ) among the teams
  // with granularity of chunk

  void set_work_partition(int64_t const length, int const chunk) noexcept {
    // Minimum chunk size to ensure that
    //   m_work_end < std::numeric_limits<int>::max() * m_work_chunk

    int const chunk_min = (length + std::numeric_limits<int>::max()) /
                          std::numeric_limits<int>::max();

    m_work_end   = length;
    m_work_chunk = std::max(chunk, chunk_min);

    // Number of work chunks and partitioning of that number:
    int const num  = (m_work_end + m_work_chunk - 1) / m_work_chunk;
    int const part = (num + m_league_size - 1) / m_league_size;

    m_work_range.first  = part * m_league_rank;
    m_work_range.second = m_work_range.first + part;

    // Steal from the next team, round robin.
    // The next team is offset by m_team_alloc if it fits in the pool.

    m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size
                       ? m_team_base + m_team_alloc
                       : 0;
  }
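
  // Worked example (illustrative): with length = 100, chunk = 10, and
  // league_size = 4 there are num = 10 chunks and part = 3 chunks per team,
  // so league_rank 2 receives chunk range [6 .. 9), i.e. work indices
  // [60 .. 90).  get_work_partition() below clips the last team's range
  // against m_work_end.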

  std::pair<int64_t, int64_t> get_work_partition() noexcept {
    int64_t first  = m_work_range.first;
    int64_t second = m_work_range.second;
    first *= m_work_chunk;
    second *= m_work_chunk;
    return std::pair<int64_t, int64_t>(
        first, second < m_work_end ? second : m_work_end);
  }

  std::pair<int64_t, int64_t> get_work_stealing_chunk() noexcept {
    std::pair<int64_t, int64_t> x(-1, -1);

    const int i = get_work_stealing();

    if (0 <= i) {
      x.first  = m_work_chunk * i;
      x.second = x.first + m_work_chunk < m_work_end ? x.first + m_work_chunk
                                                     : m_work_end;
    }

    return x;
  }
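
  // Usage sketch (illustrative, with 'data' a placeholder instance): a worker
  // drains chunks until stealing fails, signaled by the (-1,-1) sentinel:
  //
  //   auto w = data.get_work_stealing_chunk();
  //   while (0 <= w.first) {
  //     for (int64_t i = w.first; i < w.second; ++i) { /* ... */ }
  //     w = data.get_work_stealing_chunk();
  //   }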
};

//----------------------------------------------------------------------------

template <class HostExecSpace>
class HostThreadTeamMember {
 public:
  using scratch_memory_space    = typename HostExecSpace::scratch_memory_space;
  using execution_space         = HostExecSpace;
  using thread_team_member      = HostThreadTeamMember;
  using host_thread_team_member = HostThreadTeamMember;

 private:
  scratch_memory_space m_scratch;
  HostThreadTeamData& m_data;
  int const m_league_rank;
  int const m_league_size;

 public:
  constexpr HostThreadTeamMember(HostThreadTeamData& arg_data) noexcept
      : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes()),
        m_data(arg_data),
        m_league_rank(arg_data.m_league_rank),
        m_league_size(arg_data.m_league_size) {}

  constexpr HostThreadTeamMember(HostThreadTeamData& arg_data,
                                 int const arg_league_rank,
                                 int const arg_league_size) noexcept
      : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes(),
                  arg_data.team_shared(), arg_data.team_shared_bytes()),
        m_data(arg_data),
        m_league_rank(arg_league_rank),
        m_league_size(arg_league_size) {}

  ~HostThreadTeamMember()                           = default;
  HostThreadTeamMember()                            = delete;
  HostThreadTeamMember(HostThreadTeamMember&&)      = default;
  HostThreadTeamMember(HostThreadTeamMember const&) = default;
  HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default;
  HostThreadTeamMember& operator=(HostThreadTeamMember const&) = default;

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION
  int team_rank() const noexcept { return m_data.m_team_rank; }

  KOKKOS_INLINE_FUNCTION
  int team_size() const noexcept { return m_data.m_team_size; }

  KOKKOS_INLINE_FUNCTION
  int league_rank() const noexcept { return m_league_rank; }

  KOKKOS_INLINE_FUNCTION
  int league_size() const noexcept { return m_league_size; }

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space& team_shmem() const {
    return m_scratch.set_team_thread_mode(0, 1, 0);
  }

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space& team_scratch(int) const {
    return m_scratch.set_team_thread_mode(0, 1, 0);
  }

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space& thread_scratch(int) const {
    return m_scratch.set_team_thread_mode(0, m_data.m_team_size,
                                          m_data.m_team_rank);
  }

  //--------------------------------------------------------------------------
  // Team collectives
  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  {
    if (m_data.team_rendezvous()) {
      m_data.team_rendezvous_release();
    }
  }
#else
  {
  }
#endif

  //--------------------------------------------------------------------------

  template <typename T>
  KOKKOS_INLINE_FUNCTION void team_broadcast(T& value,
                                             const int source_team_rank) const
      noexcept
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  {
    if (1 < m_data.m_team_size) {
      T volatile* const shared_value = (T*)m_data.team_reduce();

      // Don't overwrite shared memory until all threads arrive

      if (m_data.team_rendezvous(source_team_rank)) {
        // All threads have entered 'team_rendezvous'
        // only this thread returned from 'team_rendezvous'
        // with a return value of 'true'

        *shared_value = value;

        m_data.team_rendezvous_release();
        // This thread released all other threads from 'team_rendezvous'
        // with a return value of 'false'
      } else {
        value = *shared_value;
      }
    }
  }
#else
  {
    (void)value;
    (void)source_team_rank;
    Kokkos::abort("HostThreadTeamMember team_broadcast\n");
  }
#endif
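
  // Usage sketch (illustrative): every team member calls the broadcast with
  // the same source rank; afterwards each member's 'value' holds the source
  // rank's copy.  'member' and 'compute' are placeholders:
  //
  //   int value = (member.team_rank() == 0) ? compute() : 0;
  //   member.team_broadcast(value, 0);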

  //--------------------------------------------------------------------------

  template <class Closure, typename T>
  KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, T& value,
                                             const int source_team_rank) const
      noexcept
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  {
    T volatile* const shared_value = (T*)m_data.team_reduce();

    // Don't overwrite shared memory until all threads arrive

    if (m_data.team_rendezvous(source_team_rank)) {
      // All threads have entered 'team_rendezvous'
      // only this thread returned from 'team_rendezvous'
      // with a return value of 'true'

      f(value);

      if (1 < m_data.m_team_size) {
        *shared_value = value;
      }

      m_data.team_rendezvous_release();
      // This thread released all other threads from 'team_rendezvous'
      // with a return value of 'false'
    } else {
      value = *shared_value;
    }
  }
#else
  {
    (void)f;
    (void)value;
    (void)source_team_rank;
    Kokkos::abort("HostThreadTeamMember team_broadcast\n");
  }
#endif

  //--------------------------------------------------------------------------
  // team_reduce( Sum(result) );
  // team_reduce( Min(result) );
  // team_reduce( Max(result) );

  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      team_reduce(ReducerType const& reducer) const noexcept {
    team_reduce(reducer, reducer.reference());
  }
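
  // Usage sketch (illustrative, with 'member' and 'my_contribution' as
  // placeholders): every member contributes, and all members observe the
  // reduced result through the reducer's referenced variable:
  //
  //   double result = my_contribution;
  //   member.team_reduce(Kokkos::Sum<double>(result));
  //   // 'result' now holds the team-wide sum on every member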

  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      team_reduce(ReducerType const& reducer,
                  typename ReducerType::value_type contribution) const noexcept
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  {
    if (1 < m_data.m_team_size) {
      using value_type = typename ReducerType::value_type;

      if (0 != m_data.m_team_rank) {
        // Non-root copies to their local buffer:
        /*reducer.copy( (value_type*) m_data.team_reduce_local()
                    , reducer.data() );*/
        *((value_type*)m_data.team_reduce_local()) = contribution;
      }

      // Root does not overwrite shared memory until all threads arrive
      // and copy to their local buffer.

      if (m_data.team_rendezvous()) {
        // All threads have entered 'team_rendezvous'
        // only this thread returned from 'team_rendezvous'
        // with a return value of 'true'
        //
        // This thread sums contributed values
        for (int i = 1; i < m_data.m_team_size; ++i) {
          value_type* const src =
              (value_type*)m_data.team_member(i)->team_reduce_local();

          reducer.join(contribution, *src);
        }

        // Copy result to root member's buffer:
        // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
        *((value_type*)m_data.team_reduce()) = contribution;
        reducer.reference()                  = contribution;
        m_data.team_rendezvous_release();
        // This thread released all other threads from 'team_rendezvous'
        // with a return value of 'false'
      } else {
        // Copy from root member's buffer:
        reducer.reference() = *((value_type*)m_data.team_reduce());
      }
    } else {
      reducer.reference() = contribution;
    }
  }
#else
  {
    (void)reducer;
    (void)contribution;
    Kokkos::abort("HostThreadTeamMember team_reduce\n");
  }
#endif

  //--------------------------------------------------------------------------

  /*template< typename ValueType , class JoinOp >
  KOKKOS_INLINE_FUNCTION
  ValueType
  team_reduce( ValueType const & value
             , JoinOp    const & join ) const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {
      if ( 0 != m_data.m_team_rank ) {
        // Non-root copies to their local buffer:
        *((ValueType*) m_data.team_reduce_local()) = value ;
      }

      // Root does not overwrite shared memory until all threads arrive
      // and copy to their local buffer.

      if ( m_data.team_rendezvous() ) {
        const Impl::Reducer< ValueType , JoinOp > reducer( join );

        // All threads have entered 'team_rendezvous'
        // only this thread returned from 'team_rendezvous'
        // with a return value of 'true'
        //
        // This thread sums contributed values

        ValueType * const dst = (ValueType*) m_data.team_reduce_local();

        *dst = value ;

        for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
          ValueType * const src =
            (ValueType*) m_data.team_member(i)->team_reduce_local();

          reducer.join( dst , src );
        }

        m_data.team_rendezvous_release();
        // This thread released all other threads from 'team_rendezvous'
        // with a return value of 'false'
      }

      return *((ValueType*) m_data.team_reduce());
    }
#else
    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
#endif*/

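  // team_scan: exclusive prefix sum across the team, returned per member.
  // Worked example (illustrative): with team contributions {3, 1, 4} from
  // ranks {0, 1, 2}, the ranks receive {0, 3, 4} respectively.  If 'global'
  // is non-null the grand total (8) is atomically added to *global and the
  // previous value of *global is added to every member's result.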
  template <typename T>
  KOKKOS_INLINE_FUNCTION T team_scan(T const& value,
                                     T* const global = nullptr) const noexcept
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  {
    if (0 != m_data.m_team_rank) {
      // Non-root copies to their local buffer:
      ((T*)m_data.team_reduce_local())[1] = value;
    }

    // Root does not overwrite shared memory until all threads arrive
    // and copy to their local buffer.

    if (m_data.team_rendezvous()) {
      // All threads have entered 'team_rendezvous'
      // only this thread returned from 'team_rendezvous'
      // with a return value of 'true'
      //
      // This thread scans contributed values

      {
        T* prev = (T*)m_data.team_reduce_local();

        prev[0] = 0;
        prev[1] = value;

        for (int i = 1; i < m_data.m_team_size; ++i) {
          T* const ptr = (T*)m_data.team_member(i)->team_reduce_local();

          ptr[0] = prev[0] + prev[1];

          prev = ptr;
        }
      }

      // If adding to a global value then atomic_fetch_add to that value
      // and add the previous value to every entry of the scan.
      if (global) {
        T* prev = (T*)m_data.team_reduce_local();

        {
          T* ptr = (T*)m_data.team_member(m_data.m_team_size - 1)
                       ->team_reduce_local();
          prev[0] = Kokkos::atomic_fetch_add(global, ptr[0] + ptr[1]);
        }

        for (int i = 1; i < m_data.m_team_size; ++i) {
          T* ptr = (T*)m_data.team_member(i)->team_reduce_local();
          ptr[0] += prev[0];
        }
      }

      m_data.team_rendezvous_release();
    }

    return ((T*)m_data.team_reduce_local())[0];
  }
#else
  {
    (void)value;
    (void)global;
    Kokkos::abort("HostThreadTeamMember team_scan\n");
    return T();
  }
#endif
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

template <typename iType, typename Member>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
TeamThreadRange(
    Member const& member, iType count,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
}

template <typename iType1, typename iType2, typename Member>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Member>
TeamThreadRange(
    Member const& member, iType1 begin, iType2 end,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::TeamThreadRangeBoundariesStruct<
      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
                                                               end);
}

template <typename iType, typename Member>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
TeamVectorRange(
    Member const& member, iType count,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
}

template <typename iType1, typename iType2, typename Member>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Member>
TeamVectorRange(
    Member const& member, iType1 begin, iType2 end,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::TeamThreadRangeBoundariesStruct<
      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
                                                               end);
}

template <typename iType, typename Member>
KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType, Member>
ThreadVectorRange(
    Member const& member, iType count,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(member, count);
}

template <typename iType1, typename iType2, typename Member>
KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Member>
ThreadVectorRange(
    Member const& member, iType1 arg_begin, iType2 arg_end,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(
      member, iType(arg_begin), iType(arg_end));
}

//----------------------------------------------------------------------------
/** \brief  Inter-thread parallel_for.
 *
 * Executes lambda(iType i) for each i=[0..N)
 *
 * The range [0..N) is mapped to all threads of the calling thread team.
 */
template <typename iType, class Closure, class Member>
KOKKOS_INLINE_FUNCTION void parallel_for(
    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
    Closure const& closure,
    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
        type const** = nullptr) {
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i);
  }
}
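
// Usage sketch (illustrative, inside a team-level functor body where
// 'member' is the team handle and 'n' is a placeholder extent):
//
//   Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
//                        [&](int i) { /* ... per-index work ... */ });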

template <typename iType, class Closure, class Member>
KOKKOS_INLINE_FUNCTION void parallel_for(
    Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
        loop_boundaries,
    Closure const& closure,
    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
        type const** = nullptr) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i);
  }
}

//----------------------------------------------------------------------------

template <typename iType, class Closure, class Reducer, class Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Kokkos::is_reducer<Reducer>::value &&
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(
    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
    Closure const& closure, Reducer const& reducer) {
  typename Reducer::value_type value;
  reducer.init(value);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i, value);
  }

  loop_boundaries.thread.team_reduce(reducer, value);
}

template <typename iType, typename Closure, typename ValueType, typename Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    !Kokkos::is_reducer<ValueType>::value &&
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(
    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
    Closure const& closure, ValueType& result) {
  ValueType val;
  Sum<ValueType> reducer(val);
  reducer.init(val);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i, reducer.reference());
  }

  loop_boundaries.thread.team_reduce(reducer);
  result = reducer.reference();
}
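
// Usage sketch (illustrative, with 'member' and 'n' as placeholders): the
// scalar overload above performs a team-wide sum of the per-thread partials:
//
//   double total = 0;
//   Kokkos::parallel_reduce(Kokkos::TeamThreadRange(member, n),
//                           [&](int i, double& sum) { sum += 1.0; }, total);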

/*template< typename iType, class Space
         , class Closure, class Joiner , typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
             const & loop_boundaries
  , Closure  const & closure
  , Joiner   const & joiner
  , ValueType      & result
  )
{
  Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );

  reducer.init( reducer.data() );

  for( iType i = loop_boundaries.start
     ; i <  loop_boundaries.end
     ; i += loop_boundaries.increment ) {
    closure( i , reducer.reference() );
  }

  loop_boundaries.thread.team_reduce( reducer );
}*/

//----------------------------------------------------------------------------
/** \brief  Inter-thread vector parallel_reduce.
 *
 *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all threads of the
 *  calling thread team and a summation of val is
 *  performed and put into result.
 */
template <typename iType, class Lambda, typename ValueType, typename Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    !Kokkos::is_reducer<ValueType>::value &&
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
                    loop_boundaries,
                const Lambda& lambda, ValueType& result) {
  result = ValueType();
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }
}

template <typename iType, class Lambda, typename ReducerType, typename Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Kokkos::is_reducer<ReducerType>::value &&
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
                    loop_boundaries,
                const Lambda& lambda, const ReducerType& reducer) {
  reducer.init(reducer.reference());
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, reducer.reference());
  }
}

//----------------------------------------------------------------------------

template <typename iType, class Closure, class Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(
    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
    Closure const& closure) {
  // Extract ValueType from the closure

  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  value_type accum = 0;

  // Intra-member scan
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i, accum, false);
  }

  // 'accum' output is the exclusive prefix sum
  accum = loop_boundaries.thread.team_scan(accum);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i, accum, true);
  }
}
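
// Usage sketch (illustrative, with 'member', 'n', and 'out' as placeholders):
// the closure is invoked twice per index, first to accumulate and then, with
// final == true, to consume the exclusive prefix sum:
//
//   Kokkos::parallel_scan(Kokkos::TeamThreadRange(member, n),
//                         [&](int i, int& partial, bool final) {
//                           if (final) out(i) = partial;
//                           partial += 1;
//                         });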

template <typename iType, class ClosureType, class Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
                  loop_boundaries,
              ClosureType const& closure) {
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type;

  value_type scan_val = value_type();

#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    closure(i, scan_val, true);
  }
}

template <typename iType, class Lambda, typename ReducerType, typename Member>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Kokkos::is_reducer<ReducerType>::value &&
    Impl::is_host_thread_team_member<Member>::value>::type
parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
                  loop_boundaries,
              const Lambda& lambda, const ReducerType& reducer) {
  typename ReducerType::value_type scan_val;
  reducer.init(scan_val);

#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, scan_val, true);
  }
}

//----------------------------------------------------------------------------

template <class Member>
KOKKOS_INLINE_FUNCTION Impl::ThreadSingleStruct<Member> PerTeam(
    Member const& member,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::ThreadSingleStruct<Member>(member);
}

template <class Member>
KOKKOS_INLINE_FUNCTION Impl::VectorSingleStruct<Member> PerThread(
    Member const& member,
    typename std::enable_if<
        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
  return Impl::VectorSingleStruct<Member>(member);
}

template <class Member, class FunctorType>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::ThreadSingleStruct<Member>& single,
       const FunctorType& functor) {
  // 'single' does not perform a barrier.
  if (single.team_member.team_rank() == 0) functor();
}
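
// Usage sketch (illustrative, with 'member' as the team handle): only team
// rank 0 executes the functor; add a team_barrier() afterwards if the other
// members must observe its side effects, since 'single' itself does not
// synchronize:
//
//   Kokkos::single(Kokkos::PerTeam(member), [&]() { /* once per team */ });
//   member.team_barrier();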

template <class Member, class FunctorType, typename ValueType>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::ThreadSingleStruct<Member>& single,
       const FunctorType& functor, ValueType& val) {
  single.team_member.team_broadcast(functor, val, 0);
}

template <class Member, class FunctorType>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor) {
  functor();
}

template <class Member, class FunctorType, typename ValueType>
KOKKOS_INLINE_FUNCTION typename std::enable_if<
    Impl::is_host_thread_team_member<Member>::value>::type
single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor,
       ValueType& val) {
  functor(val);
}

} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */