1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_SYCL_INSTANCE_HPP_
46 #define KOKKOS_SYCL_INSTANCE_HPP_
47 
48 #include <optional>
49 #include <CL/sycl.hpp>
50 
51 #include <impl/Kokkos_Error.hpp>
52 
53 namespace Kokkos {
54 namespace Experimental {
55 namespace Impl {
56 
57 class SYCLInternal {
58  public:
59   using size_type = int;
60 
61   SYCLInternal() = default;
62   ~SYCLInternal();
63 
64   SYCLInternal(const SYCLInternal&) = delete;
65   SYCLInternal& operator=(const SYCLInternal&) = delete;
66   SYCLInternal& operator=(SYCLInternal&&) = delete;
67   SYCLInternal(SYCLInternal&&)            = delete;
68 
69   void* scratch_space(const size_type size);
70   void* scratch_flags(const size_type size);
71 
72   int m_syclDev = -1;
73 
74   size_t m_maxWorkgroupSize   = 0;
75   uint32_t m_maxConcurrency   = 0;
76   uint64_t m_maxShmemPerBlock = 0;
77 
78   uint32_t* m_scratchConcurrentBitset = nullptr;
79   size_type m_scratchSpaceCount       = 0;
80   size_type* m_scratchSpace           = nullptr;
81   size_type m_scratchFlagsCount       = 0;
82   size_type* m_scratchFlags           = nullptr;
83 
84   std::optional<sycl::queue> m_queue;
85 
86   // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
87   // compiling for the CUDA backend. Storing pointers instead works around this.
88   static std::vector<std::optional<sycl::queue>*> all_queues;
89   // We need a mutex for thread safety when modifying all_queues.
90   static std::mutex mutex;
91 
92   // USMObjectMem is a reusable buffer for a single object
93   // in USM memory
94   template <sycl::usm::alloc Kind>
95   class USMObjectMem {
96    public:
97     class Deleter {
98      public:
99       Deleter() = default;
Deleter(USMObjectMem * mem)100       explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
101 
102       template <typename T>
operator ()(T * p) const103       void operator()(T* p) const noexcept {
104         assert(m_mem);
105         assert(sizeof(T) == m_mem->size());
106 
107         if constexpr (sycl::usm::alloc::device == kind)
108           // Only skipping the dtor on trivially copyable types
109           static_assert(std::is_trivially_copyable_v<T>);
110         else
111           p->~T();
112 
113         m_mem->m_size = 0;
114       }
115 
116      private:
117       USMObjectMem* m_mem = nullptr;
118     };
119 
120     static constexpr sycl::usm::alloc kind = Kind;
121 
122     void reset();
123 
reset(sycl::queue q)124     void reset(sycl::queue q) {
125       reset();
126       m_q.emplace(std::move(q));
127     }
128 
129     USMObjectMem() = default;
USMObjectMem(sycl::queue q)130     explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
131 
132     USMObjectMem(USMObjectMem const&) = delete;
133     USMObjectMem(USMObjectMem&&)      = delete;
134     USMObjectMem& operator=(USMObjectMem&&) = delete;
135     USMObjectMem& operator=(USMObjectMem const&) = delete;
136 
~USMObjectMem()137     ~USMObjectMem() { reset(); };
138 
data()139     void* data() noexcept { return m_data; }
data() const140     const void* data() const noexcept { return m_data; }
141 
size() const142     size_t size() const noexcept { return m_size; }
capacity() const143     size_t capacity() const noexcept { return m_capacity; }
144 
145     // reserve() allocates space for at least n bytes
146     // returns the new capacity
147     size_t reserve(size_t n);
148 
149    private:
150     using AllocationSpace =
151         std::conditional_t<Kind == sycl::usm::alloc::device,
152                            Kokkos::Experimental::SYCLDeviceUSMSpace,
153                            Kokkos::Experimental::SYCLSharedUSMSpace>;
154 
155     // This will memcpy an object T into memory held by this object
156     // returns: a T* to that object
157     //
158     // Note:  it is UB to dereference this pointer with an object that is
159     // not an implicit-lifetime nor trivially-copyable type, but presumably much
160     // faster because we can use USM device memory
161     template <typename T>
memcpy_from(const T & t)162     std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
163       reserve(sizeof(T));
164       sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
165       fence(memcopied);
166 
167       std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
168                                       Deleter(this));
169       m_size = sizeof(T);
170       return ptr;
171     }
172 
173     // This will copy-constuct an object T into memory held by this object
174     // returns: a unique_ptr<T, destruct_delete> that will call the
175     // destructor on the type when it goes out of scope.
176     //
177     // Note:  This will not work with USM device memory
178     template <typename T>
copy_construct_from(const T & t)179     std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
180       static_assert(kind != sycl::usm::alloc::device,
181                     "Cannot copy construct into USM device memory");
182 
183       reserve(sizeof(T));
184 
185       std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
186       m_size = sizeof(T);
187       return ptr;
188     }
189 
190    public:
191     // Performs either memcpy (for USM device memory) and returns a T*
192     // (but is technically UB when dereferenced on an object that is not
193     // an implicit-lifetime nor trivially-copyable type
194     //
195     // or
196     //
197     // performs copy construction (for other USM memory types) and returns a
198     // unique_ptr<T, ...>
199     template <typename T>
copy_from(const T & t)200     std::unique_ptr<T, Deleter> copy_from(const T& t) {
201       if constexpr (sycl::usm::alloc::device == kind)
202         return memcpy_from(t);
203       else
204         return copy_construct_from(t);
205     }
206 
207    private:
208     // Returns a reference to t (helpful when debugging)
209     template <typename T>
memcpy_to(T & t)210     T& memcpy_to(T& t) {
211       assert(sizeof(T) == m_size);
212 
213       sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
214       fence(memcopied);
215 
216       return t;
217     }
218 
219     // Returns a reference to t (helpful when debugging)
220     template <typename T>
move_assign_to(T & t)221     T& move_assign_to(T& t) {
222       static_assert(kind != sycl::usm::alloc::device,
223                     "Cannot move_assign_to from USM device memory");
224 
225       assert(sizeof(T) == m_size);
226 
227       t = std::move(*static_cast<T*>(m_data));
228 
229       return t;
230     }
231 
232    public:
233     // Returns a reference to t (helpful when debugging)
234     template <typename T>
transfer_to(T & t)235     T& transfer_to(T& t) {
236       if constexpr (sycl::usm::alloc::device == kind)
237         return memcpy_to(t);
238       else
239         return move_assign_to(t);
240     }
241 
242    private:
243     // USMObjectMem class invariants
244     // All four expressions below must evaluate to true:
245     //
246     //  !m_data == !m_capacity
247     //  m_q || !m_data
248     //  m_data || !m_size
249     //  m_size <= m_capacity
250     //
251     //  The above invariants mean that:
252     //  if m_size != 0 then m_data != 0
253     //  if m_data != 0 then m_capacity != 0 && m_q != nullopt
254     //  if m_data == 0 then m_capacity == 0
255 
256     std::optional<sycl::queue> m_q;
257     void* m_data      = nullptr;
258     size_t m_size     = 0;  // sizeof(T) iff m_data points to live T
259     size_t m_capacity = 0;
260   };
261 
262   // An indirect kernel is one where the functor to be executed is explicitly
263   // copied to USM device memory before being executed, to get around the
264   // trivially copyable limitation of SYCL.
265   using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
266   IndirectKernelMem m_indirectKernelMem;
267 
268   using IndirectReducerMem = USMObjectMem<sycl::usm::alloc::shared>;
269   IndirectReducerMem m_indirectReducerMem;
270 
271   bool was_finalized = false;
272 
273   static SYCLInternal& singleton();
274 
275   int verify_is_initialized(const char* const label) const;
276 
277   void initialize(const sycl::device& d);
278 
279   void initialize(const sycl::queue& q);
280 
is_initialized() const281   int is_initialized() const { return m_queue.has_value(); }
282 
283   void finalize();
284 
285  private:
286   // fence(...) takes any type with a .wait_and_throw() method
287   // (sycl::event and sycl::queue)
288   template <typename WAT>
fence_helper(WAT & wat)289   static void fence_helper(WAT& wat) {
290     try {
291       wat.wait_and_throw();
292     } catch (sycl::exception const& e) {
293       Kokkos::Impl::throw_runtime_exception(
294           std::string("There was a synchronous SYCL error:\n") += e.what());
295     }
296   }
297 
298  public:
fence(sycl::queue & q)299   static void fence(sycl::queue& q) { fence_helper(q); }
fence(sycl::event & e)300   static void fence(sycl::event& e) { fence_helper(e); }
301 };
302 
303 template <typename Functor, typename Storage,
304           bool is_memcpyable = std::is_trivially_copyable_v<Functor>>
305 class SYCLFunctionWrapper;
306 
307 template <typename Functor, typename Storage>
308 class SYCLFunctionWrapper<Functor, Storage, true> {
309   const Functor& m_functor;
310 
311  public:
SYCLFunctionWrapper(const Functor & functor,Storage &)312   SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
313 
get_functor() const314   const Functor& get_functor() const { return m_functor; }
315 };
316 
317 template <typename Functor, typename Storage>
318 class SYCLFunctionWrapper<Functor, Storage, false> {
319   std::unique_ptr<Functor,
320                   Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
321       m_kernelFunctorPtr;
322 
323  public:
SYCLFunctionWrapper(const Functor & functor,Storage & storage)324   SYCLFunctionWrapper(const Functor& functor, Storage& storage)
325       : m_kernelFunctorPtr(storage.copy_from(functor)) {}
326 
get_functor() const327   std::reference_wrapper<const Functor> get_functor() const {
328     return {*m_kernelFunctorPtr};
329   }
330 };
331 
332 template <typename Functor, typename Storage>
make_sycl_function_wrapper(const Functor & functor,Storage & storage)333 auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) {
334   return SYCLFunctionWrapper<Functor, Storage>(functor, storage);
335 }
336 }  // namespace Impl
337 }  // namespace Experimental
338 }  // namespace Kokkos
339 #endif
340