1 /* 2 //@HEADER 3 // ************************************************************************ 4 // 5 // Kokkos v. 3.0 6 // Copyright (2020) National Technology & Engineering 7 // Solutions of Sandia, LLC (NTESS). 8 // 9 // Under the terms of Contract DE-NA0003525 with NTESS, 10 // the U.S. Government retains certain rights in this software. 11 // 12 // Redistribution and use in source and binary forms, with or without 13 // modification, are permitted provided that the following conditions are 14 // met: 15 // 16 // 1. Redistributions of source code must retain the above copyright 17 // notice, this list of conditions and the following disclaimer. 18 // 19 // 2. Redistributions in binary form must reproduce the above copyright 20 // notice, this list of conditions and the following disclaimer in the 21 // documentation and/or other materials provided with the distribution. 22 // 23 // 3. Neither the name of the Corporation nor the names of the 24 // contributors may be used to endorse or promote products derived from 25 // this software without specific prior written permission. 26 // 27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY 28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE 31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 // 39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 40 // 41 // ************************************************************************ 42 //@HEADER 43 */ 44 45 #ifndef KOKKOS_SYCL_PARALLEL_TEAM_HPP 46 #define KOKKOS_SYCL_PARALLEL_TEAM_HPP 47 48 #include <Kokkos_Parallel.hpp> 49 50 #include <SYCL/Kokkos_SYCL_Team.hpp> 51 52 namespace Kokkos { 53 namespace Impl { 54 template <typename... Properties> 55 class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...> 56 : public PolicyTraits<Properties...> { 57 public: 58 using execution_policy = TeamPolicyInternal; 59 60 using traits = PolicyTraits<Properties...>; 61 62 template <typename ExecSpace, typename... OtherProperties> 63 friend class TeamPolicyInternal; 64 65 private: 66 static int constexpr MAX_WARP = 8; 67 68 typename traits::execution_space m_space; 69 int m_league_size; 70 int m_team_size; 71 int m_vector_length; 72 int m_team_scratch_size[2]; 73 int m_thread_scratch_size[2]; 74 int m_chunk_size; 75 bool m_tune_team_size; 76 bool m_tune_vector_length; 77 78 public: 79 using execution_space = Kokkos::Experimental::SYCL; 80 81 template <class... OtherProperties> TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const & p)82 TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { 83 m_league_size = p.m_league_size; 84 m_team_size = p.m_team_size; 85 m_vector_length = p.m_vector_length; 86 m_team_scratch_size[0] = p.m_team_scratch_size[0]; 87 m_team_scratch_size[1] = p.m_team_scratch_size[1]; 88 m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; 89 m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; 90 m_chunk_size = p.m_chunk_size; 91 m_space = p.m_space; 92 m_tune_team_size = p.m_tune_team_size; 93 m_tune_vector_length = p.m_tune_vector_length; 94 } 95 96 template <typename FunctorType> team_size_max(FunctorType const & f,ParallelForTag const &) const97 int team_size_max(FunctorType const& f, ParallelForTag const&) const { 98 return internal_team_size_max_for(f); 99 } 100 101 template <class FunctorType> team_size_max(const FunctorType & f,const ParallelReduceTag &) const102 inline int team_size_max(const FunctorType& f, 103 const ParallelReduceTag&) const { 104 return internal_team_size_max_reduce(f); 105 } 106 107 template <class FunctorType, class ReducerType> team_size_max(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const108 inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, 109 const ParallelReduceTag&) const { 110 return internal_team_size_max_reduce(f); 111 } 112 113 template <typename FunctorType> team_size_recommended(FunctorType const & f,ParallelForTag const &) const114 int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { 115 return internal_team_size_max_for(f); 116 } 117 118 template <typename FunctorType> team_size_recommended(FunctorType const & f,ParallelReduceTag const &) const119 inline int team_size_recommended(FunctorType const& f, 120 ParallelReduceTag const&) const { 121 return internal_team_size_recommended_reduce(f); 122 } 123 124 template <class FunctorType, class ReducerType> team_size_recommended(FunctorType const & f,ReducerType const &,ParallelReduceTag const &) const125 int team_size_recommended(FunctorType const& f, ReducerType const&, 126 ParallelReduceTag const&) const { 127 return internal_team_size_recommended_reduce(f); 128 } impl_auto_vector_length() const129 inline bool impl_auto_vector_length() const { return m_tune_vector_length; } impl_auto_team_size() const130 inline bool impl_auto_team_size() const { return m_tune_team_size; } vector_length_max()131 static int vector_length_max() { 132 // FIXME_SYCL provide a reasonable value 133 return 1; 134 } 135 verify_requested_vector_length(int requested_vector_length)136 static int verify_requested_vector_length(int requested_vector_length) { 137 int test_vector_length = 138 std::min(requested_vector_length, vector_length_max()); 139 140 // Allow only power-of-two vector_length 141 if (!(is_integral_power_of_two(test_vector_length))) { 142 int test_pow2 = 1; 143 for (int i = 0; i < 5; i++) { 144 test_pow2 = test_pow2 << 1; 145 if (test_pow2 > test_vector_length) { 146 break; 147 } 148 } 149 test_vector_length = test_pow2 >> 1; 150 } 151 152 return test_vector_length; 153 } 154 scratch_size_max(int level)155 static int scratch_size_max(int level) { 156 return level == 0 ? 1024 * 32 157 : // FIXME_SYCL arbitrarily setting this to 32kB 158 20 * 1024 * 1024; // FIXME_SYCL arbitrarily setting this to 20MB 159 } impl_set_vector_length(size_t size)160 inline void impl_set_vector_length(size_t size) { m_vector_length = size; } impl_set_team_size(size_t size)161 inline void impl_set_team_size(size_t size) { m_team_size = size; } impl_vector_length() const162 int impl_vector_length() const { return m_vector_length; } vector_length() const163 KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } 164 team_size() const165 int team_size() const { return m_team_size; } 166 league_size() const167 int league_size() const { return m_league_size; } 168 scratch_size(int level,int team_size_=-1) const169 int scratch_size(int level, int team_size_ = -1) const { 170 if (team_size_ < 0) team_size_ = m_team_size; 171 return m_team_scratch_size[level] + 172 team_size_ * m_thread_scratch_size[level]; 173 } 174 team_scratch_size(int level) const175 int team_scratch_size(int level) const { return m_team_scratch_size[level]; } 176 thread_scratch_size(int level) const177 int thread_scratch_size(int level) const { 178 return m_thread_scratch_size[level]; 179 } 180 space() const181 typename traits::execution_space space() const { return m_space; } 182 TeamPolicyInternal()183 TeamPolicyInternal() 184 : m_space(typename traits::execution_space()), 185 m_league_size(0), 186 m_team_size(-1), 187 m_vector_length(0), 188 m_team_scratch_size{0, 0}, 189 m_thread_scratch_size{0, 0}, 190 m_chunk_size(0), 191 m_tune_team_size(false), 192 m_tune_vector_length(false) {} 193 194 /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,int vector_length_request=1)195 TeamPolicyInternal(const execution_space space_, int league_size_, 196 int team_size_request, int vector_length_request = 1) 197 : m_space(space_), 198 m_league_size(league_size_), 199 m_team_size(team_size_request), 200 m_vector_length( 201 (vector_length_request > 0) 202 ? verify_requested_vector_length(vector_length_request) 203 : (verify_requested_vector_length(1))), 204 m_team_scratch_size{0, 0}, 205 m_thread_scratch_size{0, 0}, 206 m_chunk_size(0), 207 m_tune_team_size(bool(team_size_request <= 0)), 208 m_tune_vector_length(bool(vector_length_request <= 0)) { 209 // FIXME_SYCL check paramters 210 } 211 212 /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)213 TeamPolicyInternal(const execution_space space_, int league_size_, 214 const Kokkos::AUTO_t& /* team_size_request */, 215 int vector_length_request = 1) 216 : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} 217 // FLAG 218 /** \brief Specify league size and team size, request vector length*/ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,const Kokkos::AUTO_t &)219 TeamPolicyInternal(const execution_space space_, int league_size_, 220 int team_size_request, 221 const Kokkos::AUTO_t& /* vector_length_request */ 222 ) 223 : TeamPolicyInternal(space_, league_size_, team_size_request, -1) 224 225 {} 226 227 /** \brief Specify league size, request team size and vector length*/ TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)228 TeamPolicyInternal(const execution_space space_, int league_size_, 229 const Kokkos::AUTO_t& /* team_size_request */, 230 const Kokkos::AUTO_t& /* vector_length_request */ 231 232 ) 233 : TeamPolicyInternal(space_, league_size_, -1, -1) 234 235 {} 236 TeamPolicyInternal(int league_size_,int team_size_request,int vector_length_request=1)237 TeamPolicyInternal(int league_size_, int team_size_request, 238 int vector_length_request = 1) 239 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 240 team_size_request, vector_length_request) {} 241 TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)242 TeamPolicyInternal(int league_size_, 243 const Kokkos::AUTO_t& /* team_size_request */, 244 int vector_length_request = 1) 245 : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, 246 vector_length_request) {} 247 248 /** \brief Specify league size and team size, request vector length*/ TeamPolicyInternal(int league_size_,int team_size_request,const Kokkos::AUTO_t &)249 TeamPolicyInternal(int league_size_, int team_size_request, 250 const Kokkos::AUTO_t& /* vector_length_request */ 251 252 ) 253 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 254 team_size_request, -1) 255 256 {} 257 258 /** \brief Specify league size, request team size and vector length*/ TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)259 TeamPolicyInternal(int league_size_, 260 const Kokkos::AUTO_t& /* team_size_request */, 261 const Kokkos::AUTO_t& /* vector_length_request */ 262 263 ) 264 : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, 265 -1) {} 266 chunk_size() const267 int chunk_size() const { return m_chunk_size; } 268 set_chunk_size(typename traits::index_type chunk_size_)269 TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { 270 m_chunk_size = chunk_size_; 271 return *this; 272 } 273 274 /** \brief set per team scratch size for a specific level of the scratch 275 * hierarchy */ set_scratch_size(int level,PerTeamValue const & per_team)276 TeamPolicyInternal& set_scratch_size(int level, 277 PerTeamValue const& per_team) { 278 m_team_scratch_size[level] = per_team.value; 279 return *this; 280 } 281 282 /** \brief set per thread scratch size for a specific level of the scratch 283 * hierarchy */ set_scratch_size(int level,PerThreadValue const & per_thread)284 TeamPolicyInternal& set_scratch_size(int level, 285 PerThreadValue const& per_thread) { 286 m_thread_scratch_size[level] = per_thread.value; 287 return *this; 288 } 289 290 /** \brief set per thread and per team scratch size for a specific level of 291 * the scratch hierarchy */ set_scratch_size(int level,PerTeamValue const & per_team,PerThreadValue const & per_thread)292 TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, 293 PerThreadValue const& per_thread) { 294 m_team_scratch_size[level] = per_team.value; 295 m_thread_scratch_size[level] = per_thread.value; 296 return *this; 297 } 298 299 using member_type = Kokkos::Impl::SYCLTeamMember; 300 301 protected: 302 template <class FunctorType> internal_team_size_max_for(const FunctorType &) const303 int internal_team_size_max_for(const FunctorType& /*f*/) const { 304 // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2) 305 // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size 306 // total: 307 // 2*sizeof(double)+m_team_scratch_size[0] 308 // + m_team_size(sizeof(double)+m_thread_scratch_size[0]) 309 const int max_threads_for_memory = 310 (space().impl_internal_space_instance()->m_maxShmemPerBlock - 311 2 * sizeof(double) - m_team_scratch_size[0]) / 312 (sizeof(double) + m_thread_scratch_size[0]); 313 return std::min<int>( 314 m_space.impl_internal_space_instance()->m_maxWorkgroupSize, 315 max_threads_for_memory); 316 } 317 318 template <class FunctorType> internal_team_size_max_reduce(const FunctorType & f) const319 int internal_team_size_max_reduce(const FunctorType& f) const { 320 using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE, 321 TeamPolicyInternal, FunctorType>; 322 using value_type = typename Analysis::value_type; 323 const int value_count = Analysis::value_count(f); 324 325 // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2) 326 // reducer_memsize = sizeof(value_type) * m_team_size * value_count 327 // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size 328 // total: 329 // 2*sizeof(double)+m_team_scratch_size[0] 330 // + m_team_size(sizeof(double)+sizeof(value_type)*value_count 331 // +m_thread_scratch_size[0]) 332 const int max_threads_for_memory = 333 (space().impl_internal_space_instance()->m_maxShmemPerBlock - 334 2 * sizeof(double) - m_team_scratch_size[0]) / 335 (sizeof(double) + sizeof(value_type) * value_count + 336 m_thread_scratch_size[0]); 337 return std::min<int>( 338 m_space.impl_internal_space_instance()->m_maxWorkgroupSize, 339 max_threads_for_memory); 340 } 341 342 template <class FunctorType> internal_team_size_recommended_for(const FunctorType & f) const343 int internal_team_size_recommended_for(const FunctorType& f) const { 344 // FIXME_SYCL improve 345 return internal_team_size_max_for(f); 346 } 347 348 template <class FunctorType> internal_team_size_recommended_reduce(const FunctorType & f) const349 int internal_team_size_recommended_reduce(const FunctorType& f) const { 350 // FIXME_SYCL improve 351 return internal_team_size_max_reduce(f); 352 } 353 }; 354 355 template <typename FunctorType, typename... Properties> 356 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, 357 Kokkos::Experimental::SYCL> { 358 public: 359 using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>; 360 using functor_type = FunctorType; 361 using size_type = ::Kokkos::Experimental::SYCL::size_type; 362 363 private: 364 using member_type = typename Policy::member_type; 365 using work_tag = typename Policy::work_tag; 366 using launch_bounds = typename Policy::launch_bounds; 367 368 FunctorType const m_functor; 369 Policy const m_policy; 370 size_type const m_league_size; 371 int m_team_size; 372 size_type const m_vector_size; 373 int m_shmem_begin; 374 int m_shmem_size; 375 void* m_scratch_ptr[2]; 376 int m_scratch_size[2]; 377 378 template <typename Functor> sycl_direct_launch(const Policy & policy,const Functor & functor) const379 void sycl_direct_launch(const Policy& policy, const Functor& functor) const { 380 // Convenience references 381 const Kokkos::Experimental::SYCL& space = policy.space(); 382 Kokkos::Experimental::Impl::SYCLInternal& instance = 383 *space.impl_internal_space_instance(); 384 sycl::queue& q = *instance.m_queue; 385 386 q.submit([&](sycl::handler& cgh) { 387 // FIXME_SYCL accessors seem to need a size greater than zero at least for 388 // host queues 389 sycl::accessor<char, 1, sycl::access::mode::read_write, 390 sycl::access::target::local> 391 team_scratch_memory_L0( 392 sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)), 393 cgh); 394 395 // Avoid capturing *this since it might not be trivially copyable 396 const auto shmem_begin = m_shmem_begin; 397 const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; 398 void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]}; 399 400 cgh.parallel_for( 401 sycl::nd_range<2>( 402 sycl::range<2>(m_league_size * m_team_size, m_vector_size), 403 sycl::range<2>(m_team_size, m_vector_size)), 404 [=](sycl::nd_item<2> item) { 405 const member_type team_member( 406 team_scratch_memory_L0.get_pointer(), shmem_begin, 407 scratch_size[0], 408 static_cast<char*>(scratch_ptr[1]) + 409 item.get_group(0) * scratch_size[1], 410 scratch_size[1], item); 411 if constexpr (std::is_same<work_tag, void>::value) 412 functor(team_member); 413 else 414 functor(work_tag(), team_member); 415 }); 416 }); 417 space.fence(); 418 } 419 420 public: execute() const421 inline void execute() const { 422 if (m_league_size == 0) return; 423 424 Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& 425 indirectKernelMem = m_policy.space() 426 .impl_internal_space_instance() 427 ->m_indirectKernelMem; 428 429 const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( 430 m_functor, indirectKernelMem); 431 432 sycl_direct_launch(m_policy, functor_wrapper.get_functor()); 433 } 434 ParallelFor(FunctorType const & arg_functor,Policy const & arg_policy)435 ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) 436 : m_functor(arg_functor), 437 m_policy(arg_policy), 438 m_league_size(arg_policy.league_size()), 439 m_team_size(arg_policy.team_size()), 440 m_vector_size(arg_policy.impl_vector_length()) { 441 // FIXME_SYCL optimize 442 if (m_team_size < 0) m_team_size = 32; 443 444 m_shmem_begin = (sizeof(double) * (m_team_size + 2)); 445 m_shmem_size = 446 (m_policy.scratch_size(0, m_team_size) + 447 FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size)); 448 m_scratch_size[0] = m_shmem_size; 449 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 450 451 // FIXME_SYCL so far accessors used instead of these pointers 452 // Functor's reduce memory, team scan memory, and team shared memory depend 453 // upon team size. 454 const auto& space = *m_policy.space().impl_internal_space_instance(); 455 const sycl::queue& q = *space.m_queue; 456 m_scratch_ptr[0] = nullptr; 457 m_scratch_ptr[1] = sycl::malloc_device( 458 sizeof(char) * m_scratch_size[1] * m_league_size, q); 459 460 if (static_cast<int>(space.m_maxShmemPerBlock) < 461 m_shmem_size - m_shmem_begin) { 462 std::stringstream out; 463 out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! " 464 "Requested " 465 << m_shmem_size - m_shmem_begin << " bytes but maximum is " 466 << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock 467 << '\n'; 468 Kokkos::Impl::throw_runtime_exception(out.str()); 469 } 470 471 if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{})) 472 Kokkos::Impl::throw_runtime_exception( 473 "Kokkos::Impl::ParallelFor<SYCL> requested too large team size."); 474 } 475 476 // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space 477 // instance 478 ParallelFor(const ParallelFor&) = delete; 479 ParallelFor& operator=(const ParallelFor&) = delete; 480 ~ParallelFor()481 ~ParallelFor() { 482 const Kokkos::Experimental::SYCL& space = m_policy.space(); 483 Kokkos::Experimental::Impl::SYCLInternal& instance = 484 *space.impl_internal_space_instance(); 485 sycl::queue& q = *instance.m_queue; 486 sycl::free(m_scratch_ptr[1], q); 487 } 488 }; 489 490 //---------------------------------------------------------------------------- 491 //---------------------------------------------------------------------------- 492 493 template <class FunctorType, class ReducerType, class... Properties> 494 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, 495 ReducerType, Kokkos::Experimental::SYCL> { 496 public: 497 using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>; 498 499 private: 500 using Analysis = 501 FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; 502 using member_type = typename Policy::member_type; 503 using WorkTag = typename Policy::work_tag; 504 using launch_bounds = typename Policy::launch_bounds; 505 506 using pointer_type = typename Analysis::pointer_type; 507 using reference_type = typename Analysis::reference_type; 508 using value_type = typename Analysis::value_type; 509 510 public: 511 using functor_type = FunctorType; 512 using size_type = Kokkos::Experimental::SYCL::size_type; 513 514 private: 515 const FunctorType m_functor; 516 const Policy m_policy; 517 const ReducerType m_reducer; 518 const pointer_type m_result_ptr; 519 // FIXME_SYCL avoid reallocating memory for reductions 520 /* size_type* m_scratch_space; 521 size_type* m_scratch_flags; 522 size_type m_team_begin;*/ 523 size_type m_shmem_begin; 524 size_type m_shmem_size; 525 void* m_scratch_ptr[2]; 526 int m_scratch_size[2]; 527 const size_type m_league_size; 528 int m_team_size; 529 const size_type m_vector_size; 530 531 template <typename PolicyType, typename Functor, typename Reducer> sycl_direct_launch(const PolicyType & policy,const Functor & functor,const Reducer & reducer) const532 void sycl_direct_launch(const PolicyType& policy, const Functor& functor, 533 const Reducer& reducer) const { 534 using ReducerConditional = 535 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 536 FunctorType, ReducerType>; 537 using ReducerTypeFwd = typename ReducerConditional::type; 538 using WorkTagFwd = 539 std::conditional_t<std::is_same<InvalidType, ReducerType>::value, 540 WorkTag, void>; 541 using ValueInit = 542 Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; 543 using ValueJoin = 544 Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; 545 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; 546 547 auto selected_reducer = ReducerConditional::select(functor, reducer); 548 549 // Convenience references 550 const Kokkos::Experimental::SYCL& space = policy.space(); 551 Kokkos::Experimental::Impl::SYCLInternal& instance = 552 *space.impl_internal_space_instance(); 553 sycl::queue& q = *instance.m_queue; 554 555 // FIXME_SYCL optimize 556 const size_t wgroup_size = m_team_size; 557 std::size_t size = m_league_size * m_team_size; 558 const auto init_size = 559 std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1); 560 const unsigned int value_count = 561 FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count( 562 selected_reducer); 563 // FIXME_SYCL only use the first half 564 const auto results_ptr = static_cast<pointer_type>(instance.scratch_space( 565 sizeof(value_type) * std::max(value_count, 1u) * init_size * 2)); 566 // FIXME_SYCL without this we are running into a race condition 567 const auto results_ptr2 = 568 results_ptr + std::max(value_count, 1u) * init_size; 569 570 // If size<=1 we only call init(), the functor and possibly final once 571 // working with the global scratch memory but don't copy back to 572 // m_result_ptr yet. 573 if (size <= 1) { 574 q.submit([&](sycl::handler& cgh) { 575 // FIXME_SYCL accessors seem to need a size greater than zero at least 576 // for host queues 577 sycl::accessor<char, 1, sycl::access::mode::read_write, 578 sycl::access::target::local> 579 team_scratch_memory_L0( 580 sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)), 581 cgh); 582 583 // Avoid capturing *this since it might not be trivially copyable 584 const auto shmem_begin = m_shmem_begin; 585 const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; 586 void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]}; 587 588 cgh.parallel_for( 589 sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)), 590 [=](sycl::nd_item<2> item) { 591 const auto& selected_reducer = ReducerConditional::select( 592 static_cast<const FunctorType&>(functor), 593 static_cast<const ReducerType&>(reducer)); 594 reference_type update = 595 ValueInit::init(selected_reducer, results_ptr); 596 if (size == 1) { 597 const member_type team_member( 598 team_scratch_memory_L0.get_pointer(), shmem_begin, 599 scratch_size[0], static_cast<char*>(scratch_ptr[1]), 600 scratch_size[1], item); 601 if constexpr (std::is_same<WorkTag, void>::value) 602 functor(team_member, update); 603 else 604 functor(WorkTag(), team_member, update); 605 } 606 if constexpr (ReduceFunctorHasFinal<FunctorType>::value) 607 FunctorFinal<FunctorType, WorkTag>::final( 608 static_cast<const FunctorType&>(functor), results_ptr); 609 }); 610 }); 611 space.fence(); 612 } 613 614 // Otherwise, we perform a reduction on the values in all workgroups 615 // separately, write the workgroup results back to global memory and recurse 616 // until only one workgroup does the reduction and thus gets the final 617 // value. 618 bool first_run = true; 619 while (size > 1) { 620 auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; 621 q.submit([&](sycl::handler& cgh) { 622 sycl::accessor<value_type, 1, sycl::access::mode::read_write, 623 sycl::access::target::local> 624 local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), 625 cgh); 626 // FIXME_SYCL accessors seem to need a size greater than zero at least 627 // for host queues 628 sycl::accessor<char, 1, sycl::access::mode::read_write, 629 sycl::access::target::local> 630 team_scratch_memory_L0( 631 sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)), 632 cgh); 633 634 // Avoid capturing *this since it might not be trivially copyable 635 const auto shmem_begin = m_shmem_begin; 636 const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; 637 void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]}; 638 639 cgh.parallel_for( 640 sycl::nd_range<2>( 641 sycl::range<2>(m_league_size * m_team_size, m_vector_size), 642 sycl::range<2>(m_team_size, m_vector_size)), 643 [=](sycl::nd_item<2> item) { 644 const auto local_id = item.get_local_linear_id(); 645 const auto global_id = 646 wgroup_size * item.get_group_linear_id() + local_id; 647 const auto& selected_reducer = ReducerConditional::select( 648 static_cast<const FunctorType&>(functor), 649 static_cast<const ReducerType&>(reducer)); 650 651 // In the first iteration, we call functor to initialize the local 652 // memory. Otherwise, the local memory is initialized with the 653 // results from the previous iteration that are stored in global 654 // memory. Note that we load values_per_thread values per thread 655 // and immediately combine them to avoid too many threads being 656 // idle in the actual workgroup reduction. 657 if (first_run) { 658 reference_type update = ValueInit::init( 659 selected_reducer, &local_mem[local_id * value_count]); 660 const member_type team_member( 661 team_scratch_memory_L0.get_pointer(), shmem_begin, 662 scratch_size[0], 663 static_cast<char*>(scratch_ptr[1]) + 664 item.get_group(0) * scratch_size[1], 665 scratch_size[1], item); 666 if constexpr (std::is_same<WorkTag, void>::value) 667 functor(team_member, update); 668 else 669 functor(WorkTag(), team_member, update); 670 } else { 671 if (global_id >= size) 672 ValueInit::init(selected_reducer, 673 &local_mem[local_id * value_count]); 674 else { 675 ValueOps::copy(functor, &local_mem[local_id * value_count], 676 &results_ptr[global_id * value_count]); 677 } 678 } 679 item.barrier(sycl::access::fence_space::local_space); 680 681 // Perform the actual workgroup reduction. To achieve a better 682 // memory access pattern, we use sequential addressing and a 683 // reversed loop. If the workgroup size is 8, the first element 684 // contains all the values with index%4==0, after the second one 685 // the values with index%2==0 and after the third one index%1==0, 686 // i.e., all values. 687 for (unsigned int stride = wgroup_size / 2; stride > 0; 688 stride >>= 1) { 689 const auto idx = local_id; 690 if (idx < stride) { 691 ValueJoin::join(selected_reducer, 692 &local_mem[idx * value_count], 693 &local_mem[(idx + stride) * value_count]); 694 } 695 item.barrier(sycl::access::fence_space::local_space); 696 } 697 698 // Finally, we copy the workgroup results back to global memory to 699 // be used in the next iteration. If this is the last iteration, 700 // i.e., there is only one workgroup also call final() if 701 // necessary. 702 if (local_id == 0) { 703 ValueOps::copy( 704 functor, 705 &results_ptr2[(item.get_group_linear_id()) * value_count], 706 &local_mem[0]); 707 if constexpr (ReduceFunctorHasFinal<FunctorType>::value) 708 if (n_wgroups <= 1 && item.get_group_linear_id() == 0) { 709 FunctorFinal<FunctorType, WorkTag>::final( 710 static_cast<const FunctorType&>(functor), 711 &results_ptr2[(item.get_group_linear_id()) * 712 value_count]); 713 } 714 } 715 }); 716 }); 717 space.fence(); 718 719 // FIXME_SYCL this is likely not necessary, see above 720 Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, 721 Kokkos::Experimental::SYCLDeviceUSMSpace>( 722 space, results_ptr, results_ptr2, 723 sizeof(*m_result_ptr) * value_count * n_wgroups); 724 space.fence(); 725 726 first_run = false; 727 size = n_wgroups; 728 } 729 730 // At this point, the reduced value is written to the entry in results_ptr 731 // and all that is left is to copy it back to the given result pointer if 732 // necessary. 733 if (m_result_ptr) { 734 Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, 735 Kokkos::Experimental::SYCLDeviceUSMSpace>( 736 space, m_result_ptr, results_ptr, 737 sizeof(*m_result_ptr) * value_count); 738 space.fence(); 739 } 740 } 741 742 public: execute()743 inline void execute() { 744 Kokkos::Experimental::Impl::SYCLInternal& instance = 745 *m_policy.space().impl_internal_space_instance(); 746 using IndirectKernelMem = 747 Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; 748 IndirectKernelMem& indirectKernelMem = instance.m_indirectKernelMem; 749 IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem; 750 751 const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( 752 m_functor, indirectKernelMem); 753 const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( 754 m_reducer, indirectReducerMem); 755 756 sycl_direct_launch(m_policy, functor_wrapper.get_functor(), 757 reducer_wrapper.get_functor()); 758 } 759 760 private: initialize()761 void initialize() { 762 // FIXME_SYCL optimize 763 if (m_team_size < 0) m_team_size = 32; 764 // Must be a power of two greater than two, get the one not bigger than the 765 // requested one. 766 if ((m_team_size & m_team_size - 1) || m_team_size < 2) { 767 int temp_team_size = 2; 768 while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1; 769 m_team_size = temp_team_size; 770 } 771 772 m_shmem_begin = (sizeof(double) * (m_team_size + 2)); 773 m_shmem_size = 774 (m_policy.scratch_size(0, m_team_size) + 775 FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size)); 776 m_scratch_size[0] = m_shmem_size; 777 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 778 779 // FIXME_SYCL so far accessors used instead of these pointers 780 // Functor's reduce memory, team scan memory, and team shared memory depend 781 // upon team size. 782 const auto& space = *m_policy.space().impl_internal_space_instance(); 783 const sycl::queue& q = *space.m_queue; 784 m_scratch_ptr[0] = nullptr; 785 m_scratch_ptr[1] = sycl::malloc_device( 786 sizeof(char) * m_scratch_size[1] * m_league_size, q); 787 788 if (static_cast<int>(space.m_maxShmemPerBlock) < 789 m_shmem_size - m_shmem_begin) { 790 std::stringstream out; 791 out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! " 792 "Requested " 793 << m_shmem_size - m_shmem_begin << " bytes but maximum is " 794 << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock 795 << '\n'; 796 Kokkos::Impl::throw_runtime_exception(out.str()); 797 } 798 799 if (m_team_size > m_policy.team_size_max(m_functor, ParallelForTag{})) 800 Kokkos::Impl::throw_runtime_exception( 801 "Kokkos::Impl::ParallelFor<SYCL> requested too large team size."); 802 } 803 804 public: 805 template <class ViewType> ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ViewType const & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value,void * >::type=nullptr)806 ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, 807 ViewType const& arg_result, 808 typename std::enable_if<Kokkos::is_view<ViewType>::value, 809 void*>::type = nullptr) 810 : m_functor(arg_functor), 811 m_policy(arg_policy), 812 m_reducer(InvalidType()), 813 m_result_ptr(arg_result.data()), 814 m_league_size(arg_policy.league_size()), 815 m_team_size(arg_policy.team_size()), 816 m_vector_size(arg_policy.impl_vector_length()) { 817 initialize(); 818 } 819 ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ReducerType const & reducer)820 ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, 821 ReducerType const& reducer) 822 : m_functor(arg_functor), 823 m_policy(arg_policy), 824 m_reducer(reducer), 825 m_result_ptr(reducer.view().data()), 826 m_league_size(arg_policy.league_size()), 827 m_team_size(arg_policy.team_size()), 828 m_vector_size(arg_policy.impl_vector_length()) { 829 initialize(); 830 } 831 }; 832 } // namespace Impl 833 } // namespace Kokkos 834 835 #endif 836