1 /* 2 //@HEADER 3 // ************************************************************************ 4 // 5 // Kokkos v. 3.0 6 // Copyright (2020) National Technology & Engineering 7 // Solutions of Sandia, LLC (NTESS). 8 // 9 // Under the terms of Contract DE-NA0003525 with NTESS, 10 // the U.S. Government retains certain rights in this software. 11 // 12 // Redistribution and use in source and binary forms, with or without 13 // modification, are permitted provided that the following conditions are 14 // met: 15 // 16 // 1. Redistributions of source code must retain the above copyright 17 // notice, this list of conditions and the following disclaimer. 18 // 19 // 2. Redistributions in binary form must reproduce the above copyright 20 // notice, this list of conditions and the following disclaimer in the 21 // documentation and/or other materials provided with the distribution. 22 // 23 // 3. Neither the name of the Corporation nor the names of the 24 // contributors may be used to endorse or promote products derived from 25 // this software without specific prior written permission. 26 // 27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY 28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE 31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 // 39 // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 40 // 41 // ************************************************************************ 42 //@HEADER 43 */ 44 45 #ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP 46 #define KOKKOS_OPENMPTARGET_PARALLEL_HPP 47 48 #include <omp.h> 49 #include <sstream> 50 #include <Kokkos_Parallel.hpp> 51 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp> 52 #include <impl/Kokkos_FunctorAdapter.hpp> 53 54 #define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL 55 56 namespace Kokkos { 57 namespace Impl { 58 59 template <class FunctorType, class... Traits> 60 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, 61 Kokkos::Experimental::OpenMPTarget> { 62 private: 63 using Policy = Kokkos::RangePolicy<Traits...>; 64 using WorkTag = typename Policy::work_tag; 65 using WorkRange = typename Policy::WorkRange; 66 using Member = typename Policy::member_type; 67 68 const FunctorType m_functor; 69 const Policy m_policy; 70 71 public: execute() const72 inline void execute() const { execute_impl<WorkTag>(); } 73 /* 74 template <class TagType> 75 inline typename std::enable_if<std::is_same<TagType, void>::value>::type 76 execute_impl() const { 77 OpenMPTargetExec::verify_is_process( 78 "Kokkos::Experimental::OpenMPTarget parallel_for"); 79 OpenMPTargetExec::verify_initialized( 80 "Kokkos::Experimental::OpenMPTarget parallel_for"); 81 const typename Policy::member_type begin = m_policy.begin(); 82 const typename Policy::member_type end = m_policy.end(); 83 84 #pragma omp target teams distribute parallel for map(to: this->m_functor) 85 for (int i = begin; i < end; i++) m_functor(i); 86 } 87 */ 88 template <class TagType> execute_impl() const89 inline void execute_impl() const { 90 OpenMPTargetExec::verify_is_process( 91 "Kokkos::Experimental::OpenMPTarget parallel_for"); 92 OpenMPTargetExec::verify_initialized( 93 "Kokkos::Experimental::OpenMPTarget parallel_for"); 94 const auto begin = m_policy.begin(); 95 const auto end = m_policy.end(); 96 97 if (end <= begin) return; 98 99 
FunctorType a_functor(m_functor); 100 101 if constexpr (std::is_same<TagType, void>::value) { 102 #pragma omp target teams distribute parallel for map(to : a_functor) 103 for (auto i = begin; i < end; i++) a_functor(i); 104 } else { 105 #pragma omp target teams distribute parallel for map(to : a_functor) 106 for (auto i = begin; i < end; i++) a_functor(TagType(), i); 107 } 108 } 109 ParallelFor(const FunctorType & arg_functor,Policy arg_policy)110 inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) 111 : m_functor(arg_functor), m_policy(arg_policy) {} 112 }; 113 114 } // namespace Impl 115 } // namespace Kokkos 116 117 //---------------------------------------------------------------------------- 118 //---------------------------------------------------------------------------- 119 120 namespace Kokkos { 121 namespace Impl { 122 123 template <class FunctorType, class PolicyType, class ReducerType, 124 class PointerType, class ValueType, bool FunctorHasJoin, 125 bool UseReducerType> 126 struct ParallelReduceSpecialize { executeKokkos::Impl::ParallelReduceSpecialize127 static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, 128 PointerType /*result_ptr*/) { 129 std::stringstream error_message; 130 error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' 131 << UseReducerType << '\n'; 132 // FIXME_OPENMPTARGET 133 OpenMPTarget_abort(error_message.str().c_str()); 134 } 135 }; 136 137 template <class FunctorType, class ReducerType, class PointerType, 138 class ValueType, class... 
PolicyArgs> 139 struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, 140 ReducerType, PointerType, ValueType, false, 141 false> { 142 using PolicyType = Kokkos::RangePolicy<PolicyArgs...>; 143 template <class TagType> execute_implKokkos::Impl::ParallelReduceSpecialize144 inline static void execute_impl(const FunctorType& f, const PolicyType& p, 145 PointerType result_ptr) { 146 OpenMPTargetExec::verify_is_process( 147 "Kokkos::Experimental::OpenMPTarget parallel_for"); 148 OpenMPTargetExec::verify_initialized( 149 "Kokkos::Experimental::OpenMPTarget parallel_for"); 150 const auto begin = p.begin(); 151 const auto end = p.end(); 152 153 if (end <= begin) return; 154 155 ValueType result = ValueType(); 156 if constexpr (std::is_same<TagType, void>::value) { 157 #pragma omp target teams distribute parallel for num_teams(512) \ 158 map(to:f) map(tofrom:result) reduction(+: result) 159 for (auto i = begin; i < end; i++) f(i, result); 160 } else { 161 #pragma omp target teams distribute parallel for num_teams(512) \ 162 map(to:f) map(tofrom:result) reduction(+: result) 163 for (auto i = begin; i < end; i++) f(TagType(), i, result); 164 } 165 166 *result_ptr = result; 167 } 168 executeKokkos::Impl::ParallelReduceSpecialize169 inline static void execute(const FunctorType& f, const PolicyType& p, 170 PointerType ptr) { 171 execute_impl<typename PolicyType::work_tag>(f, p, ptr); 172 } 173 }; 174 175 template <class FunctorType, class PolicyType, class ReducerType, 176 class PointerType, class ValueType> 177 struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, 178 PointerType, ValueType, false, true> { 179 #pragma omp declare reduction( \ 180 custom:ValueType \ 181 : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ 182 initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) 183 184 template <class TagType> execute_implKokkos::Impl::ParallelReduceSpecialize185 inline static void execute_impl(const 
FunctorType& f, const PolicyType& p, 186 PointerType result_ptr) { 187 OpenMPTargetExec::verify_is_process( 188 "Kokkos::Experimental::OpenMPTarget parallel_for"); 189 OpenMPTargetExec::verify_initialized( 190 "Kokkos::Experimental::OpenMPTarget parallel_for"); 191 const typename PolicyType::member_type begin = p.begin(); 192 const typename PolicyType::member_type end = p.end(); 193 194 if (end <= begin) return; 195 196 ValueType result = ValueType(); 197 OpenMPTargetReducerWrapper<ReducerType>::init(result); 198 199 if constexpr (std::is_same<TagType, void>::value) { 200 #pragma omp target teams distribute parallel for num_teams(512) map(to \ 201 : f) \ 202 reduction(custom \ 203 : result) 204 for (auto i = begin; i < end; i++) f(i, result); 205 *result_ptr = result; 206 } else { 207 #pragma omp target teams distribute parallel for num_teams(512) map(to \ 208 : f) \ 209 reduction(custom \ 210 : result) 211 for (auto i = begin; i < end; i++) f(TagType(), i, result); 212 *result_ptr = result; 213 } 214 } 215 executeKokkos::Impl::ParallelReduceSpecialize216 inline static void execute(const FunctorType& f, const PolicyType& p, 217 PointerType ptr) { 218 execute_impl<typename PolicyType::work_tag>(f, p, ptr); 219 } 220 }; 221 222 template <class FunctorType, class ReducerType, class... 
Traits>
// Front-end ParallelReduce for RangePolicy: computes the compile-time
// properties of the functor/reducer pair and forwards all execution to the
// matching ParallelReduceSpecialize.
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy = Kokkos::RangePolicy<Traits...>;

  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;

  // When no explicit reducer is given (ReducerType == InvalidType), the
  // functor itself provides the reduction; otherwise the reducer does.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  // With an explicit reducer the work tag is not forwarded to the value
  // traits (the reducer defines the value semantics).
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  // Static Assert WorkTag void if ReducerType not InvalidType

  using ValueTraits =
      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

  // Compile-time dispatch knobs selecting the specialization below.
  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
  enum { UseReducer = is_reducer_type<ReducerType>::value };

  using pointer_type   = typename ValueTraits::pointer_type;
  using reference_type = typename ValueTraits::reference_type;

  using ParReduceSpecialize =
      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
                               typename ValueTraits::value_type, HasJoin,
                               UseReducer>;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  // Destination for the reduced value. NOTE(review): taken directly from the
  // result view / reducer view; presumably host-accessible — confirm for
  // device-resident result views.
  const pointer_type m_result_ptr;

 public:
  inline void execute() const {
    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
  }

  // Constructor for the "result into a View" form (no explicit reducer).
  template <class ViewType>
  inline ParallelReduce(
      const FunctorType& arg_functor, Policy arg_policy,
      const ViewType& arg_result_view,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result_view.data()) {}

  // Constructor for the explicit-reducer form; the result lives in the
  // reducer's view.
  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()) {}
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class...
Traits> 298 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, 299 Kokkos::Experimental::OpenMPTarget> { 300 protected: 301 using Policy = Kokkos::RangePolicy<Traits...>; 302 303 using WorkTag = typename Policy::work_tag; 304 using WorkRange = typename Policy::WorkRange; 305 using Member = typename Policy::member_type; 306 using idx_type = typename Policy::index_type; 307 308 using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; 309 using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; 310 using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>; 311 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; 312 313 using value_type = typename ValueTraits::value_type; 314 using pointer_type = typename ValueTraits::pointer_type; 315 using reference_type = typename ValueTraits::reference_type; 316 317 const FunctorType m_functor; 318 const Policy m_policy; 319 320 template <class TagType> 321 inline typename std::enable_if<std::is_same<TagType, void>::value>::type call_with_tag(const FunctorType & f,const idx_type & idx,value_type & val,const bool & is_final) const322 call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val, 323 const bool& is_final) const { 324 f(idx, val, is_final); 325 } 326 template <class TagType> 327 inline typename std::enable_if<!std::is_same<TagType, void>::value>::type call_with_tag(const FunctorType & f,const idx_type & idx,value_type & val,const bool & is_final) const328 call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val, 329 const bool& is_final) const { 330 f(WorkTag(), idx, val, is_final); 331 } 332 333 public: impl_execute(Kokkos::View<value_type **,Kokkos::LayoutRight,Kokkos::Experimental::OpenMPTargetSpace> element_values,Kokkos::View<value_type *,Kokkos::Experimental::OpenMPTargetSpace> chunk_values,Kokkos::View<int64_t,Kokkos::Experimental::OpenMPTargetSpace> count) const334 inline void impl_execute( 335 
Kokkos::View<value_type**, Kokkos::LayoutRight, 336 Kokkos::Experimental::OpenMPTargetSpace> 337 element_values, 338 Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> 339 chunk_values, 340 Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count) 341 const { 342 const idx_type N = m_policy.end() - m_policy.begin(); 343 const idx_type chunk_size = 128; 344 const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; 345 idx_type nteams = n_chunks > 512 ? 512 : n_chunks; 346 idx_type team_size = 128; 347 348 FunctorType a_functor(m_functor); 349 #pragma omp target teams distribute map(to \ 350 : a_functor) num_teams(nteams) \ 351 thread_limit(team_size) 352 for (idx_type team_id = 0; team_id < n_chunks; team_id++) { 353 #pragma omp parallel num_threads(team_size) 354 { 355 const idx_type local_offset = team_id * chunk_size; 356 357 #pragma omp for 358 for (idx_type i = 0; i < chunk_size; i++) { 359 const idx_type idx = local_offset + i; 360 value_type val; 361 ValueInit::init(a_functor, &val); 362 if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false); 363 element_values(team_id, i) = val; 364 } 365 #pragma omp barrier 366 if (omp_get_thread_num() == 0) { 367 value_type sum; 368 ValueInit::init(a_functor, &sum); 369 for (idx_type i = 0; i < chunk_size; i++) { 370 ValueJoin::join(a_functor, &sum, &element_values(team_id, i)); 371 element_values(team_id, i) = sum; 372 } 373 chunk_values(team_id) = sum; 374 } 375 #pragma omp barrier 376 if (omp_get_thread_num() == 0) { 377 if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { 378 value_type sum; 379 ValueInit::init(a_functor, &sum); 380 for (idx_type i = 0; i < n_chunks; i++) { 381 ValueJoin::join(a_functor, &sum, &chunk_values(i)); 382 chunk_values(i) = sum; 383 } 384 } 385 } 386 } 387 } 388 389 #pragma omp target teams distribute map(to \ 390 : a_functor) num_teams(nteams) \ 391 thread_limit(team_size) 392 for (idx_type team_id = 0; team_id < n_chunks; team_id++) { 393 
#pragma omp parallel num_threads(team_size) 394 { 395 const idx_type local_offset = team_id * chunk_size; 396 value_type offset_value; 397 if (team_id > 0) 398 offset_value = chunk_values(team_id - 1); 399 else 400 ValueInit::init(a_functor, &offset_value); 401 402 #pragma omp for 403 for (idx_type i = 0; i < chunk_size; i++) { 404 const idx_type idx = local_offset + i; 405 value_type local_offset_value; 406 if (i > 0) { 407 local_offset_value = element_values(team_id, i - 1); 408 ValueJoin::join(a_functor, &local_offset_value, &offset_value); 409 } else 410 local_offset_value = offset_value; 411 if (idx < N) 412 call_with_tag<WorkTag>(a_functor, idx, local_offset_value, true); 413 } 414 } 415 } 416 } 417 execute() const418 inline void execute() const { 419 OpenMPTargetExec::verify_is_process( 420 "Kokkos::Experimental::OpenMPTarget parallel_for"); 421 OpenMPTargetExec::verify_initialized( 422 "Kokkos::Experimental::OpenMPTarget parallel_for"); 423 const idx_type N = m_policy.end() - m_policy.begin(); 424 const idx_type chunk_size = 128; 425 const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; 426 427 // This could be scratch memory per team 428 Kokkos::View<value_type**, Kokkos::LayoutRight, 429 Kokkos::Experimental::OpenMPTargetSpace> 430 element_values("element_values", n_chunks, chunk_size); 431 Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> 432 chunk_values("chunk_values", n_chunks); 433 Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count( 434 "Count"); 435 436 impl_execute(element_values, chunk_values, count); 437 } 438 439 //---------------------------------------- 440 ParallelScan(const FunctorType & arg_functor,const Policy & arg_policy)441 inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) 442 : m_functor(arg_functor), m_policy(arg_policy) {} 443 444 //---------------------------------------- 445 }; 446 447 template <class FunctorType, class ReturnType, class... 
Traits> 448 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, 449 ReturnType, Kokkos::Experimental::OpenMPTarget> 450 : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, 451 Kokkos::Experimental::OpenMPTarget> { 452 using base_t = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, 453 Kokkos::Experimental::OpenMPTarget>; 454 using value_type = typename base_t::value_type; 455 value_type& m_returnvalue; 456 457 public: execute() const458 inline void execute() const { 459 OpenMPTargetExec::verify_is_process( 460 "Kokkos::Experimental::OpenMPTarget parallel_for"); 461 OpenMPTargetExec::verify_initialized( 462 "Kokkos::Experimental::OpenMPTarget parallel_for"); 463 const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); 464 const int chunk_size = 128; 465 const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; 466 467 if (N > 0) { 468 // This could be scratch memory per team 469 Kokkos::View<value_type**, Kokkos::LayoutRight, 470 Kokkos::Experimental::OpenMPTargetSpace> 471 element_values("element_values", n_chunks, chunk_size); 472 Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> 473 chunk_values("chunk_values", n_chunks); 474 Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count( 475 "Count"); 476 477 base_t::impl_execute(element_values, chunk_values, count); 478 479 const int size = base_t::ValueTraits::value_size(base_t::m_functor); 480 DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>( 481 &m_returnvalue, chunk_values.data() + (n_chunks - 1), size); 482 } else { 483 m_returnvalue = 0; 484 } 485 } 486 ParallelScanWithTotal(const FunctorType & arg_functor,const typename base_t::Policy & arg_policy,ReturnType & arg_returnvalue)487 ParallelScanWithTotal(const FunctorType& arg_functor, 488 const typename base_t::Policy& arg_policy, 489 ReturnType& arg_returnvalue) 490 : base_t(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} 491 }; 492 } // namespace 
Impl 493 } // namespace Kokkos 494 495 //---------------------------------------------------------------------------- 496 //---------------------------------------------------------------------------- 497 498 namespace Kokkos { 499 namespace Impl { 500 501 template <class FunctorType, class... Properties> 502 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, 503 Kokkos::Experimental::OpenMPTarget> { 504 private: 505 using Policy = 506 Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, 507 Properties...>; 508 using WorkTag = typename Policy::work_tag; 509 using Member = typename Policy::member_type; 510 511 const FunctorType m_functor; 512 const Policy m_policy; 513 const int m_shmem_size; 514 515 public: execute() const516 inline void execute() const { 517 OpenMPTargetExec::verify_is_process( 518 "Kokkos::Experimental::OpenMPTarget parallel_for"); 519 OpenMPTargetExec::verify_initialized( 520 "Kokkos::Experimental::OpenMPTarget parallel_for"); 521 execute_impl<WorkTag>(); 522 } 523 524 private: 525 template <class TagType> execute_impl() const526 inline void execute_impl() const { 527 OpenMPTargetExec::verify_is_process( 528 "Kokkos::Experimental::OpenMPTarget parallel_for"); 529 OpenMPTargetExec::verify_initialized( 530 "Kokkos::Experimental::OpenMPTarget parallel_for"); 531 const auto league_size = m_policy.league_size(); 532 const auto team_size = m_policy.team_size(); 533 const auto vector_length = m_policy.impl_vector_length(); 534 535 const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); 536 const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); 537 OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1); 538 539 void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); 540 FunctorType a_functor(m_functor); 541 542 // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the 543 // scratch implementation does not work in the Release or RelWithDebugInfo 544 // mode but works in 
the Debug mode. 545 546 // Maximum active teams possible. 547 int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; 548 // nteams should not exceed the maximum in-flight teams possible. 549 const auto nteams = 550 league_size < max_active_teams ? league_size : max_active_teams; 551 552 #ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL 553 // Performing our own scheduling of teams to avoid separation of code between 554 // teams-distribute and parallel. Gave a 2x performance boost in test cases with 555 // the clang compiler. atomic_compare_exchange can be avoided since the standard 556 // guarantees that the number of teams specified in the `num_teams` clause is 557 // always less than or equal to the maximum concurrently running teams. 558 #pragma omp target teams num_teams(nteams) thread_limit(team_size) \ 559 map(to \ 560 : a_functor) is_device_ptr(scratch_ptr) 561 #pragma omp parallel 562 { 563 const int blockIdx = omp_get_team_num(); 564 const int gridDim = omp_get_num_teams(); 565 566 // Iterate through the number of teams until league_size and assign the 567 // league_id accordingly 568 // Guarantee that the compilers respect the `num_teams` clause 569 if (gridDim <= nteams) { 570 for (int league_id = blockIdx; league_id < league_size; 571 league_id += gridDim) { 572 typename Policy::member_type team( 573 league_id, league_size, team_size, vector_length, scratch_ptr, 574 blockIdx, shmem_size_L0, shmem_size_L1); 575 if constexpr (std::is_same<TagType, void>::value) 576 m_functor(team); 577 else 578 m_functor(TagType(), team); 579 } 580 } else 581 Kokkos::abort("`num_teams` clause was not respected.\n"); 582 } 583 584 #else 585 // Saving the older implementation that uses `atomic_compare_exchange` to 586 // calculate the shared memory block index and `distribute` clause to distribute 587 // teams. 
588 #pragma omp target teams distribute map(to \ 589 : a_functor) \ 590 is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \ 591 thread_limit(team_size) 592 for (int i = 0; i < league_size; i++) { 593 int shmem_block_index = -1, lock_team = 99999, iter = -1; 594 iter = (omp_get_team_num() % max_active_teams); 595 596 // Loop as long as a shmem_block_index is not found. 597 while (shmem_block_index == -1) { 598 // Try and acquire a lock on the index. 599 lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1); 600 601 // If lock is acquired assign it to the block index. 602 // lock_team = 0, implies atomic_compare_exchange is successfull. 603 if (lock_team == 0) 604 shmem_block_index = iter; 605 else 606 iter = ++iter % max_active_teams; 607 } 608 609 #pragma omp parallel num_threads(team_size) 610 { 611 typename Policy::member_type team( 612 i, league_size, team_size, vector_length, scratch_ptr, 613 shmem_block_index, shmem_size_L0, shmem_size_L1); 614 m_functor(team); 615 } 616 617 // Free the locked block and increment the number of available free 618 // blocks. 619 lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0); 620 } 621 #endif 622 } 623 624 public: ParallelFor(const FunctorType & arg_functor,const Policy & arg_policy)625 inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) 626 : m_functor(arg_functor), 627 m_policy(arg_policy), 628 m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + 629 FunctorTeamShmemSize<FunctorType>::value( 630 arg_functor, arg_policy.team_size())) {} 631 }; 632 633 template <class FunctorType, class ReducerType, class PointerType, 634 class ValueType, class... 
PolicyArgs> 635 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, 636 ReducerType, PointerType, ValueType, false, 637 false> { 638 using PolicyType = TeamPolicyInternal<PolicyArgs...>; 639 640 template <class TagType> execute_implKokkos::Impl::ParallelReduceSpecialize641 inline static void execute_impl(const FunctorType& f, const PolicyType& p, 642 PointerType result_ptr) { 643 OpenMPTargetExec::verify_is_process( 644 "Kokkos::Experimental::OpenMPTarget parallel_for"); 645 OpenMPTargetExec::verify_initialized( 646 "Kokkos::Experimental::OpenMPTarget parallel_for"); 647 648 const int league_size = p.league_size(); 649 const int team_size = p.team_size(); 650 const int vector_length = p.impl_vector_length(); 651 652 const size_t shmem_size_L0 = p.scratch_size(0, team_size); 653 const size_t shmem_size_L1 = p.scratch_size(1, team_size); 654 OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, 655 shmem_size_L0, shmem_size_L1); 656 void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); 657 658 ValueType result = ValueType(); 659 660 // Maximum active teams possible. 661 int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; 662 const auto nteams = 663 league_size < max_active_teams ? 
league_size : max_active_teams; 664 665 #ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL 666 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ 667 : f) \ 668 is_device_ptr(scratch_ptr) reduction(+: result) 669 #pragma omp parallel reduction(+ : result) 670 { 671 const int blockIdx = omp_get_team_num(); 672 const int gridDim = omp_get_num_teams(); 673 674 // Guarantee that the compilers respect the `num_teams` clause 675 if (gridDim <= nteams) { 676 for (int league_id = blockIdx; league_id < league_size; 677 league_id += gridDim) { 678 typename PolicyType::member_type team( 679 league_id, league_size, team_size, vector_length, scratch_ptr, 680 blockIdx, shmem_size_L0, shmem_size_L1); 681 if constexpr (std::is_same<TagType, void>::value) 682 f(team, result); 683 else 684 f(TagType(), team, result); 685 } 686 } else 687 Kokkos::abort("`num_teams` clause was not respected.\n"); 688 } 689 690 *result_ptr = result; 691 #else 692 // Saving the older implementation that uses `atomic_compare_exchange` to 693 // calculate the shared memory block index and `distribute` clause to distribute 694 // teams. 695 #pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \ 696 map(to:f) map(tofrom:result) reduction(+: result) \ 697 is_device_ptr(scratch_ptr, lock_array) 698 for (int i = 0; i < league_size; i++) { 699 ValueType inner_result = ValueType(); 700 int shmem_block_index = -1, lock_team = 99999, iter = -1; 701 iter = (omp_get_team_num() % max_active_teams); 702 703 // Loop as long as a shmem_block_index is not found. 704 while (shmem_block_index == -1) { 705 // Try and acquire a lock on the index. 706 lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1); 707 708 // If lock is acquired assign it to the block index. 709 // lock_team = 0, implies atomic_compare_exchange is successfull. 
710 if (lock_team == 0) 711 shmem_block_index = iter; 712 else 713 iter = ++iter % max_active_teams; 714 } 715 #pragma omp parallel num_threads(team_size) reduction(+ : inner_result) 716 { 717 typename PolicyType::member_type team( 718 i, league_size, team_size, vector_length, scratch_ptr, 719 shmem_block_index, shmem_size_L0, shmem_size_L1); 720 f(team, inner_result); 721 } 722 result = inner_result; 723 724 // Free the locked block and increment the number of available free 725 // blocks. 726 lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0); 727 } 728 729 *result_ptr = result; 730 #endif 731 } 732 executeKokkos::Impl::ParallelReduceSpecialize733 inline static void execute(const FunctorType& f, const PolicyType& p, 734 PointerType ptr) { 735 execute_impl<typename PolicyType::work_tag>(f, p, ptr); 736 } 737 }; 738 739 template <class FunctorType, class ReducerType, class PointerType, 740 class ValueType, class... PolicyArgs> 741 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, 742 ReducerType, PointerType, ValueType, false, 743 true> { 744 using PolicyType = TeamPolicyInternal<PolicyArgs...>; 745 template <class TagType> execute_implKokkos::Impl::ParallelReduceSpecialize746 inline static void execute_impl(const FunctorType& f, const PolicyType& p, 747 PointerType result_ptr) { 748 OpenMPTargetExec::verify_is_process( 749 "Kokkos::Experimental::OpenMPTarget parallel_for"); 750 OpenMPTargetExec::verify_initialized( 751 "Kokkos::Experimental::OpenMPTarget parallel_for"); 752 753 #pragma omp declare reduction( \ 754 custom:ValueType \ 755 : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ 756 initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) 757 const int league_size = p.league_size(); 758 const int team_size = p.team_size(); 759 const int vector_length = p.impl_vector_length(); 760 const size_t shmem_size_L0 = p.scratch_size(0, team_size); 761 const size_t shmem_size_L1 = 
p.scratch_size(1, team_size); 762 OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1); 763 void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); 764 765 ValueType result = ValueType(); 766 767 // Maximum active teams possible. 768 int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; 769 const auto nteams = 770 league_size < max_active_teams ? league_size : max_active_teams; 771 772 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ 773 : f) \ 774 is_device_ptr(scratch_ptr) reduction(custom \ 775 : result) 776 #pragma omp parallel reduction(custom : result) 777 { 778 const int blockIdx = omp_get_team_num(); 779 const int gridDim = omp_get_num_teams(); 780 781 // Guarantee that the compilers respect the `num_teams` clause 782 if (gridDim <= nteams) { 783 for (int league_id = blockIdx; league_id < league_size; 784 league_id += gridDim) { 785 typename PolicyType::member_type team( 786 league_id, league_size, team_size, vector_length, scratch_ptr, 787 blockIdx, shmem_size_L0, shmem_size_L1); 788 if constexpr (std::is_same<TagType, void>::value) 789 f(team, result); 790 else 791 f(TagType(), team, result); 792 } 793 } else 794 Kokkos::abort("`num_teams` clause was not respected.\n"); 795 } 796 797 *result_ptr = result; 798 } 799 executeKokkos::Impl::ParallelReduceSpecialize800 inline static void execute(const FunctorType& f, const PolicyType& p, 801 PointerType ptr) { 802 execute_impl<typename PolicyType::work_tag>(f, p, ptr); 803 } 804 }; 805 806 template <class FunctorType, class ReducerType, class... 
Properties> 807 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, 808 ReducerType, Kokkos::Experimental::OpenMPTarget> { 809 private: 810 using Policy = 811 Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, 812 Properties...>; 813 814 using WorkTag = typename Policy::work_tag; 815 using Member = typename Policy::member_type; 816 817 using ReducerConditional = 818 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 819 FunctorType, ReducerType>; 820 using ReducerTypeFwd = typename ReducerConditional::type; 821 using WorkTagFwd = 822 std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag, 823 void>; 824 825 using ValueTraits = 826 Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; 827 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; 828 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; 829 830 using pointer_type = typename ValueTraits::pointer_type; 831 using reference_type = typename ValueTraits::reference_type; 832 using value_type = typename ValueTraits::value_type; 833 834 enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value }; 835 enum { UseReducer = is_reducer_type<ReducerType>::value }; 836 837 using ParForSpecialize = 838 ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type, 839 typename ValueTraits::value_type, HasJoin, 840 UseReducer>; 841 842 const FunctorType m_functor; 843 const Policy m_policy; 844 const ReducerType m_reducer; 845 const pointer_type m_result_ptr; 846 const int m_shmem_size; 847 848 public: execute() const849 inline void execute() const { 850 ParForSpecialize::execute(m_functor, m_policy, m_result_ptr); 851 } 852 853 template <class ViewType> ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ViewType & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value &&!Kokkos::is_reducer_type<ReducerType>::value,void * >::type=nullptr)854 inline 
ParallelReduce( 855 const FunctorType& arg_functor, const Policy& arg_policy, 856 const ViewType& arg_result, 857 typename std::enable_if<Kokkos::is_view<ViewType>::value && 858 !Kokkos::is_reducer_type<ReducerType>::value, 859 void*>::type = nullptr) 860 : m_functor(arg_functor), 861 m_policy(arg_policy), 862 m_reducer(InvalidType()), 863 m_result_ptr(arg_result.data()), 864 m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + 865 FunctorTeamShmemSize<FunctorType>::value( 866 arg_functor, arg_policy.team_size())) {} 867 ParallelReduce(const FunctorType & arg_functor,Policy arg_policy,const ReducerType & reducer)868 inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, 869 const ReducerType& reducer) 870 : m_functor(arg_functor), 871 m_policy(arg_policy), 872 m_reducer(reducer), 873 m_result_ptr(reducer.view().data()), 874 m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + 875 FunctorTeamShmemSize<FunctorType>::value( 876 arg_functor, arg_policy.team_size())) {} 877 }; 878 879 } // namespace Impl 880 } // namespace Kokkos 881 882 namespace Kokkos { 883 namespace Impl { 884 885 template <typename iType> 886 struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> { 887 using index_type = iType; 888 const iType start; 889 const iType end; 890 const OpenMPTargetExecTeamMember& team; 891 TeamThreadRangeBoundariesStructKokkos::Impl::TeamThreadRangeBoundariesStruct892 inline TeamThreadRangeBoundariesStruct( 893 const OpenMPTargetExecTeamMember& thread_, iType count) 894 : start(0), end(count), team(thread_) {} TeamThreadRangeBoundariesStructKokkos::Impl::TeamThreadRangeBoundariesStruct895 inline TeamThreadRangeBoundariesStruct( 896 const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_) 897 : start(begin_), end(end_), team(thread_) {} 898 }; 899 900 template <typename iType> 901 struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> { 902 using index_type = iType; 
903 const index_type start; 904 const index_type end; 905 const OpenMPTargetExecTeamMember& team; 906 ThreadVectorRangeBoundariesStructKokkos::Impl::ThreadVectorRangeBoundariesStruct907 inline ThreadVectorRangeBoundariesStruct( 908 const OpenMPTargetExecTeamMember& thread_, index_type count) 909 : start(0), end(count), team(thread_) {} ThreadVectorRangeBoundariesStructKokkos::Impl::ThreadVectorRangeBoundariesStruct910 inline ThreadVectorRangeBoundariesStruct( 911 const OpenMPTargetExecTeamMember& thread_, index_type begin_, 912 index_type end_) 913 : start(begin_), end(end_), team(thread_) {} 914 }; 915 916 template <typename iType> 917 struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> { 918 using index_type = iType; 919 const index_type start; 920 const index_type end; 921 const OpenMPTargetExecTeamMember& team; 922 TeamVectorRangeBoundariesStructKokkos::Impl::TeamVectorRangeBoundariesStruct923 inline TeamVectorRangeBoundariesStruct( 924 const OpenMPTargetExecTeamMember& thread_, index_type count) 925 : start(0), end(count), team(thread_) {} TeamVectorRangeBoundariesStructKokkos::Impl::TeamVectorRangeBoundariesStruct926 inline TeamVectorRangeBoundariesStruct( 927 const OpenMPTargetExecTeamMember& thread_, index_type begin_, 928 index_type end_) 929 : start(begin_), end(end_), team(thread_) {} 930 }; 931 932 } // namespace Impl 933 934 } // namespace Kokkos 935 //---------------------------------------------------------------------------- 936 //---------------------------------------------------------------------------- 937 938 #undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL 939 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ 940