1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
46 #define KOKKOS_IMPL_OPENMP_TASK_HPP
47 
48 #if defined(KOKKOS_ENABLE_TASKPOLICY)
49 
50 //----------------------------------------------------------------------------
51 //----------------------------------------------------------------------------
52 
53 namespace Kokkos {
54 namespace Impl {
55 
56 template <>
57 class TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget> {
58  public:
59   using execution_space = Kokkos::Experimental::OpenMPTarget;
60   using queue_type      = Kokkos::Impl::TaskQueue<execution_space>;
61   using task_base_type  = Kokkos::Impl::TaskBase<execution_space, void, void>;
62 
63   // Must specify memory space
64   using memory_space = Kokkos::HostSpace;
65 
66   static void iff_single_thread_recursive_execute(queue_type* const);
67 
68   // Must provide task queue execution function
69   static void execute(queue_type* const);
70 
71   // Must provide mechanism to set function pointer in
72   // execution space from the host process.
73   template <typename FunctorType>
proc_set_apply(task_base_type::function_type * ptr)74   static void proc_set_apply(task_base_type::function_type* ptr) {
75     using TaskType = TaskBase<Kokkos::Experimental::OpenMPTarget,
76                               typename FunctorType::value_type, FunctorType>;
77     *ptr           = TaskType::apply;
78   }
79 };
80 
81 extern template class TaskQueue<Kokkos::Experimental::OpenMPTarget>;
82 
83 //----------------------------------------------------------------------------
84 
85 template <>
86 class TaskExec<Kokkos::Experimental::OpenMPTarget> {
87  private:
88   TaskExec(TaskExec&&)      = delete;
89   TaskExec(TaskExec const&) = delete;
90   TaskExec& operator=(TaskExec&&) = delete;
91   TaskExec& operator=(TaskExec const&) = delete;
92 
93   using PoolExec = Kokkos::Impl::OpenMPTargetExec;
94 
95   friend class Kokkos::Impl::TaskQueue<Kokkos::Experimental::OpenMPTarget>;
96   friend class Kokkos::Impl::TaskQueueSpecialization<
97       Kokkos::Experimental::OpenMPTarget>;
98 
99   PoolExec* const m_self_exec;  ///< This thread's thread pool data structure
100   PoolExec* const m_team_exec;  ///< Team thread's thread pool data structure
101   int64_t m_sync_mask;
102   int64_t mutable m_sync_value;
103   int mutable m_sync_step;
104   int m_group_rank;  ///< Which "team" subset of thread pool
105   int m_team_rank;   ///< Which thread within a team
106   int m_team_size;
107 
108   TaskExec();
109   TaskExec(PoolExec& arg_exec, int arg_team_size);
110 
111   void team_barrier_impl() const;
112 
113  public:
114 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
team_shared() const115   void* team_shared() const {
116     return m_team_exec ? m_team_exec->scratch_thread() : nullptr;
117   }
118 
team_shared_size() const119   int team_shared_size() const {
120     return m_team_exec ? m_team_exec->scratch_thread_size() : 0;
121   }
122 
123   /**\brief  Whole team enters this function call
124    *         before any teeam member returns from
125    *         this function call.
126    */
team_barrier() const127   void team_barrier() const {
128     if (1 < m_team_size) team_barrier_impl();
129   }
130 #else
131   KOKKOS_INLINE_FUNCTION void team_barrier() const {}
132   KOKKOS_INLINE_FUNCTION void* team_shared() const { return 0; }
133   KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0; }
134 #endif
135 
136   KOKKOS_INLINE_FUNCTION
team_rank() const137   int team_rank() const { return m_team_rank; }
138 
139   KOKKOS_INLINE_FUNCTION
team_size() const140   int team_size() const { return m_team_size; }
141 };
142 
143 }  // namespace Impl
144 }  // namespace Kokkos
145 
146 //----------------------------------------------------------------------------
147 //----------------------------------------------------------------------------
148 
149 namespace Kokkos {
150 
151 template <typename iType>
152 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
153     iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >
TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> & thread,const iType & count)154 TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread,
155                 const iType& count) {
156   return Impl::TeamThreadRangeBoundariesStruct<
157       iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread,
158                                                                   count);
159 }
160 
161 template <typename iType>
162 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
163     iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >
TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> & thread,const iType & start,const iType & end)164 TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread,
165                 const iType& start, const iType& end) {
166   return Impl::TeamThreadRangeBoundariesStruct<
167       iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, start,
168                                                                   end);
169 }
170 
171 /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
172  * i=0..N-1.
173  *
174  * The range i=0..N-1 is mapped to all threads of the the calling thread team.
175  */
176 template <typename iType, class Lambda>
parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)177 KOKKOS_INLINE_FUNCTION void parallel_for(
178     const Impl::TeamThreadRangeBoundariesStruct<
179         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
180         loop_boundaries,
181     const Lambda& lambda) {
182   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
183        i += loop_boundaries.increment) {
184     lambda(i);
185   }
186 }
187 
188 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,ValueType & initialized_result)189 KOKKOS_INLINE_FUNCTION void parallel_reduce(
190     const Impl::TeamThreadRangeBoundariesStruct<
191         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
192         loop_boundaries,
193     const Lambda& lambda, ValueType& initialized_result) {
194   int team_rank =
195       loop_boundaries.thread.team_rank();  // member num within the team
196   ValueType result = initialized_result;
197 
198   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
199        i += loop_boundaries.increment) {
200     lambda(i, result);
201   }
202 
203   if (1 < loop_boundaries.thread.team_size()) {
204     ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
205 
206     loop_boundaries.thread.team_barrier();
207     shared[team_rank] = result;
208 
209     loop_boundaries.thread.team_barrier();
210 
211     // reduce across threads to thread 0
212     if (team_rank == 0) {
213       for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
214         shared[0] += shared[i];
215       }
216     }
217 
218     loop_boundaries.thread.team_barrier();
219 
220     // broadcast result
221     initialized_result = shared[0];
222   } else {
223     initialized_result = result;
224   }
225 }
226 
227 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & initialized_result)228 KOKKOS_INLINE_FUNCTION void parallel_reduce(
229     const Impl::TeamThreadRangeBoundariesStruct<
230         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
231         loop_boundaries,
232     const Lambda& lambda, const JoinType& join, ValueType& initialized_result) {
233   int team_rank =
234       loop_boundaries.thread.team_rank();  // member num within the team
235   ValueType result = initialized_result;
236 
237   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
238        i += loop_boundaries.increment) {
239     lambda(i, result);
240   }
241 
242   if (1 < loop_boundaries.thread.team_size()) {
243     ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
244 
245     loop_boundaries.thread.team_barrier();
246     shared[team_rank] = result;
247 
248     loop_boundaries.thread.team_barrier();
249 
250     // reduce across threads to thread 0
251     if (team_rank == 0) {
252       for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
253         join(shared[0], shared[i]);
254       }
255     }
256 
257     loop_boundaries.thread.team_barrier();
258 
259     // broadcast result
260     initialized_result = shared[0];
261   } else {
262     initialized_result = result;
263   }
264 }
265 
266 // placeholder for future function
267 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,ValueType & initialized_result)268 KOKKOS_INLINE_FUNCTION void parallel_reduce(
269     const Impl::ThreadVectorRangeBoundariesStruct<
270         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
271         loop_boundaries,
272     const Lambda& lambda, ValueType& initialized_result) {}
273 
274 // placeholder for future function
275 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & initialized_result)276 KOKKOS_INLINE_FUNCTION void parallel_reduce(
277     const Impl::ThreadVectorRangeBoundariesStruct<
278         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
279         loop_boundaries,
280     const Lambda& lambda, const JoinType& join, ValueType& initialized_result) {
281 }
282 
283 template <typename ValueType, typename iType, class Lambda>
parallel_scan(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)284 KOKKOS_INLINE_FUNCTION void parallel_scan(
285     const Impl::TeamThreadRangeBoundariesStruct<
286         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
287         loop_boundaries,
288     const Lambda& lambda) {
289   ValueType accum = 0;
290   ValueType val, local_total;
291   ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
292   int team_size     = loop_boundaries.thread.team_size();
293   int team_rank =
294       loop_boundaries.thread.team_rank();  // member num within the team
295 
296   // Intra-member scan
297   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
298        i += loop_boundaries.increment) {
299     local_total = 0;
300     lambda(i, local_total, false);
301     val = accum;
302     lambda(i, val, true);
303     accum += local_total;
304   }
305 
306   shared[team_rank] = accum;
307   loop_boundaries.thread.team_barrier();
308 
309   // Member 0 do scan on accumulated totals
310   if (team_rank == 0) {
311     for (iType i = 1; i < team_size; i += 1) {
312       shared[i] += shared[i - 1];
313     }
314     accum = 0;  // Member 0 set accum to 0 in preparation for inter-member scan
315   }
316 
317   loop_boundaries.thread.team_barrier();
318 
319   // Inter-member scan adding in accumulated totals
320   if (team_rank != 0) {
321     accum = shared[team_rank - 1];
322   }
323   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
324        i += loop_boundaries.increment) {
325     local_total = 0;
326     lambda(i, local_total, false);
327     val = accum;
328     lambda(i, val, true);
329     accum += local_total;
330   }
331 }
332 
333 // placeholder for future function
334 template <typename iType, class Lambda, typename ValueType>
parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)335 KOKKOS_INLINE_FUNCTION void parallel_scan(
336     const Impl::ThreadVectorRangeBoundariesStruct<
337         iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
338         loop_boundaries,
339     const Lambda& lambda) {}
340 
341 } /* namespace Kokkos */
342 
343 //----------------------------------------------------------------------------
344 //----------------------------------------------------------------------------
345 
346 #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
347 #endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
348