1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
46 #define KOKKOS_IMPL_OPENMP_TASK_HPP
47
48 #if defined(KOKKOS_ENABLE_TASKPOLICY)
49
50 //----------------------------------------------------------------------------
51 //----------------------------------------------------------------------------
52
53 namespace Kokkos {
54 namespace Impl {
55
56 template <>
57 class TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget> {
58 public:
59 using execution_space = Kokkos::Experimental::OpenMPTarget;
60 using queue_type = Kokkos::Impl::TaskQueue<execution_space>;
61 using task_base_type = Kokkos::Impl::TaskBase<execution_space, void, void>;
62
63 // Must specify memory space
64 using memory_space = Kokkos::HostSpace;
65
66 static void iff_single_thread_recursive_execute(queue_type* const);
67
68 // Must provide task queue execution function
69 static void execute(queue_type* const);
70
71 // Must provide mechanism to set function pointer in
72 // execution space from the host process.
73 template <typename FunctorType>
proc_set_apply(task_base_type::function_type * ptr)74 static void proc_set_apply(task_base_type::function_type* ptr) {
75 using TaskType = TaskBase<Kokkos::Experimental::OpenMPTarget,
76 typename FunctorType::value_type, FunctorType>;
77 *ptr = TaskType::apply;
78 }
79 };
80
81 extern template class TaskQueue<Kokkos::Experimental::OpenMPTarget>;
82
83 //----------------------------------------------------------------------------
84
85 template <>
86 class TaskExec<Kokkos::Experimental::OpenMPTarget> {
87 private:
88 TaskExec(TaskExec&&) = delete;
89 TaskExec(TaskExec const&) = delete;
90 TaskExec& operator=(TaskExec&&) = delete;
91 TaskExec& operator=(TaskExec const&) = delete;
92
93 using PoolExec = Kokkos::Impl::OpenMPTargetExec;
94
95 friend class Kokkos::Impl::TaskQueue<Kokkos::Experimental::OpenMPTarget>;
96 friend class Kokkos::Impl::TaskQueueSpecialization<
97 Kokkos::Experimental::OpenMPTarget>;
98
99 PoolExec* const m_self_exec; ///< This thread's thread pool data structure
100 PoolExec* const m_team_exec; ///< Team thread's thread pool data structure
101 int64_t m_sync_mask;
102 int64_t mutable m_sync_value;
103 int mutable m_sync_step;
104 int m_group_rank; ///< Which "team" subset of thread pool
105 int m_team_rank; ///< Which thread within a team
106 int m_team_size;
107
108 TaskExec();
109 TaskExec(PoolExec& arg_exec, int arg_team_size);
110
111 void team_barrier_impl() const;
112
113 public:
114 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
team_shared() const115 void* team_shared() const {
116 return m_team_exec ? m_team_exec->scratch_thread() : nullptr;
117 }
118
team_shared_size() const119 int team_shared_size() const {
120 return m_team_exec ? m_team_exec->scratch_thread_size() : 0;
121 }
122
123 /**\brief Whole team enters this function call
124 * before any teeam member returns from
125 * this function call.
126 */
team_barrier() const127 void team_barrier() const {
128 if (1 < m_team_size) team_barrier_impl();
129 }
130 #else
131 KOKKOS_INLINE_FUNCTION void team_barrier() const {}
132 KOKKOS_INLINE_FUNCTION void* team_shared() const { return 0; }
133 KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0; }
134 #endif
135
136 KOKKOS_INLINE_FUNCTION
team_rank() const137 int team_rank() const { return m_team_rank; }
138
139 KOKKOS_INLINE_FUNCTION
team_size() const140 int team_size() const { return m_team_size; }
141 };
142
143 } // namespace Impl
144 } // namespace Kokkos
145
146 //----------------------------------------------------------------------------
147 //----------------------------------------------------------------------------
148
149 namespace Kokkos {
150
151 template <typename iType>
152 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
153 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >
TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> & thread,const iType & count)154 TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread,
155 const iType& count) {
156 return Impl::TeamThreadRangeBoundariesStruct<
157 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread,
158 count);
159 }
160
161 template <typename iType>
162 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
163 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >
TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> & thread,const iType & start,const iType & end)164 TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread,
165 const iType& start, const iType& end) {
166 return Impl::TeamThreadRangeBoundariesStruct<
167 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, start,
168 end);
169 }
170
171 /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each
172 * i=0..N-1.
173 *
174 * The range i=0..N-1 is mapped to all threads of the the calling thread team.
175 */
176 template <typename iType, class Lambda>
parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)177 KOKKOS_INLINE_FUNCTION void parallel_for(
178 const Impl::TeamThreadRangeBoundariesStruct<
179 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
180 loop_boundaries,
181 const Lambda& lambda) {
182 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
183 i += loop_boundaries.increment) {
184 lambda(i);
185 }
186 }
187
188 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,ValueType & initialized_result)189 KOKKOS_INLINE_FUNCTION void parallel_reduce(
190 const Impl::TeamThreadRangeBoundariesStruct<
191 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
192 loop_boundaries,
193 const Lambda& lambda, ValueType& initialized_result) {
194 int team_rank =
195 loop_boundaries.thread.team_rank(); // member num within the team
196 ValueType result = initialized_result;
197
198 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
199 i += loop_boundaries.increment) {
200 lambda(i, result);
201 }
202
203 if (1 < loop_boundaries.thread.team_size()) {
204 ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
205
206 loop_boundaries.thread.team_barrier();
207 shared[team_rank] = result;
208
209 loop_boundaries.thread.team_barrier();
210
211 // reduce across threads to thread 0
212 if (team_rank == 0) {
213 for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
214 shared[0] += shared[i];
215 }
216 }
217
218 loop_boundaries.thread.team_barrier();
219
220 // broadcast result
221 initialized_result = shared[0];
222 } else {
223 initialized_result = result;
224 }
225 }
226
227 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & initialized_result)228 KOKKOS_INLINE_FUNCTION void parallel_reduce(
229 const Impl::TeamThreadRangeBoundariesStruct<
230 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
231 loop_boundaries,
232 const Lambda& lambda, const JoinType& join, ValueType& initialized_result) {
233 int team_rank =
234 loop_boundaries.thread.team_rank(); // member num within the team
235 ValueType result = initialized_result;
236
237 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
238 i += loop_boundaries.increment) {
239 lambda(i, result);
240 }
241
242 if (1 < loop_boundaries.thread.team_size()) {
243 ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
244
245 loop_boundaries.thread.team_barrier();
246 shared[team_rank] = result;
247
248 loop_boundaries.thread.team_barrier();
249
250 // reduce across threads to thread 0
251 if (team_rank == 0) {
252 for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
253 join(shared[0], shared[i]);
254 }
255 }
256
257 loop_boundaries.thread.team_barrier();
258
259 // broadcast result
260 initialized_result = shared[0];
261 } else {
262 initialized_result = result;
263 }
264 }
265
266 // placeholder for future function
267 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,ValueType & initialized_result)268 KOKKOS_INLINE_FUNCTION void parallel_reduce(
269 const Impl::ThreadVectorRangeBoundariesStruct<
270 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
271 loop_boundaries,
272 const Lambda& lambda, ValueType& initialized_result) {}
273
274 // placeholder for future function
275 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & initialized_result)276 KOKKOS_INLINE_FUNCTION void parallel_reduce(
277 const Impl::ThreadVectorRangeBoundariesStruct<
278 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
279 loop_boundaries,
280 const Lambda& lambda, const JoinType& join, ValueType& initialized_result) {
281 }
282
283 template <typename ValueType, typename iType, class Lambda>
parallel_scan(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)284 KOKKOS_INLINE_FUNCTION void parallel_scan(
285 const Impl::TeamThreadRangeBoundariesStruct<
286 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
287 loop_boundaries,
288 const Lambda& lambda) {
289 ValueType accum = 0;
290 ValueType val, local_total;
291 ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared();
292 int team_size = loop_boundaries.thread.team_size();
293 int team_rank =
294 loop_boundaries.thread.team_rank(); // member num within the team
295
296 // Intra-member scan
297 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
298 i += loop_boundaries.increment) {
299 local_total = 0;
300 lambda(i, local_total, false);
301 val = accum;
302 lambda(i, val, true);
303 accum += local_total;
304 }
305
306 shared[team_rank] = accum;
307 loop_boundaries.thread.team_barrier();
308
309 // Member 0 do scan on accumulated totals
310 if (team_rank == 0) {
311 for (iType i = 1; i < team_size; i += 1) {
312 shared[i] += shared[i - 1];
313 }
314 accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
315 }
316
317 loop_boundaries.thread.team_barrier();
318
319 // Inter-member scan adding in accumulated totals
320 if (team_rank != 0) {
321 accum = shared[team_rank - 1];
322 }
323 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
324 i += loop_boundaries.increment) {
325 local_total = 0;
326 lambda(i, local_total, false);
327 val = accum;
328 lambda(i, val, true);
329 accum += local_total;
330 }
331 }
332
333 // placeholder for future function
334 template <typename iType, class Lambda, typename ValueType>
parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>> & loop_boundaries,const Lambda & lambda)335 KOKKOS_INLINE_FUNCTION void parallel_scan(
336 const Impl::ThreadVectorRangeBoundariesStruct<
337 iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >&
338 loop_boundaries,
339 const Lambda& lambda) {}
340
341 } /* namespace Kokkos */
342
343 //----------------------------------------------------------------------------
344 //----------------------------------------------------------------------------
345
346 #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
347 #endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
348