1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_SYCL_INSTANCE_HPP_
46 #define KOKKOS_SYCL_INSTANCE_HPP_
47
48 #include <optional>
49 #include <CL/sycl.hpp>
50
51 #include <impl/Kokkos_Error.hpp>
52
53 namespace Kokkos {
54 namespace Experimental {
55 namespace Impl {
56
57 class SYCLInternal {
58 public:
59 using size_type = int;
60
61 SYCLInternal() = default;
62 ~SYCLInternal();
63
64 SYCLInternal(const SYCLInternal&) = delete;
65 SYCLInternal& operator=(const SYCLInternal&) = delete;
66 SYCLInternal& operator=(SYCLInternal&&) = delete;
67 SYCLInternal(SYCLInternal&&) = delete;
68
69 void* scratch_space(const size_type size);
70 void* scratch_flags(const size_type size);
71
72 int m_syclDev = -1;
73
74 size_t m_maxWorkgroupSize = 0;
75 uint32_t m_maxConcurrency = 0;
76 uint64_t m_maxShmemPerBlock = 0;
77
78 uint32_t* m_scratchConcurrentBitset = nullptr;
79 size_type m_scratchSpaceCount = 0;
80 size_type* m_scratchSpace = nullptr;
81 size_type m_scratchFlagsCount = 0;
82 size_type* m_scratchFlags = nullptr;
83
84 std::optional<sycl::queue> m_queue;
85
86 // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
87 // compiling for the CUDA backend. Storing pointers instead works around this.
88 static std::vector<std::optional<sycl::queue>*> all_queues;
89 // We need a mutex for thread safety when modifying all_queues.
90 static std::mutex mutex;
91
92 // USMObjectMem is a reusable buffer for a single object
93 // in USM memory
94 template <sycl::usm::alloc Kind>
95 class USMObjectMem {
96 public:
97 class Deleter {
98 public:
99 Deleter() = default;
Deleter(USMObjectMem * mem)100 explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
101
102 template <typename T>
operator ()(T * p) const103 void operator()(T* p) const noexcept {
104 assert(m_mem);
105 assert(sizeof(T) == m_mem->size());
106
107 if constexpr (sycl::usm::alloc::device == kind)
108 // Only skipping the dtor on trivially copyable types
109 static_assert(std::is_trivially_copyable_v<T>);
110 else
111 p->~T();
112
113 m_mem->m_size = 0;
114 }
115
116 private:
117 USMObjectMem* m_mem = nullptr;
118 };
119
120 static constexpr sycl::usm::alloc kind = Kind;
121
122 void reset();
123
reset(sycl::queue q)124 void reset(sycl::queue q) {
125 reset();
126 m_q.emplace(std::move(q));
127 }
128
129 USMObjectMem() = default;
USMObjectMem(sycl::queue q)130 explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
131
132 USMObjectMem(USMObjectMem const&) = delete;
133 USMObjectMem(USMObjectMem&&) = delete;
134 USMObjectMem& operator=(USMObjectMem&&) = delete;
135 USMObjectMem& operator=(USMObjectMem const&) = delete;
136
~USMObjectMem()137 ~USMObjectMem() { reset(); };
138
data()139 void* data() noexcept { return m_data; }
data() const140 const void* data() const noexcept { return m_data; }
141
size() const142 size_t size() const noexcept { return m_size; }
capacity() const143 size_t capacity() const noexcept { return m_capacity; }
144
145 // reserve() allocates space for at least n bytes
146 // returns the new capacity
147 size_t reserve(size_t n);
148
149 private:
150 using AllocationSpace =
151 std::conditional_t<Kind == sycl::usm::alloc::device,
152 Kokkos::Experimental::SYCLDeviceUSMSpace,
153 Kokkos::Experimental::SYCLSharedUSMSpace>;
154
155 // This will memcpy an object T into memory held by this object
156 // returns: a T* to that object
157 //
158 // Note: it is UB to dereference this pointer with an object that is
159 // not an implicit-lifetime nor trivially-copyable type, but presumably much
160 // faster because we can use USM device memory
161 template <typename T>
memcpy_from(const T & t)162 std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
163 reserve(sizeof(T));
164 sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
165 fence(memcopied);
166
167 std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
168 Deleter(this));
169 m_size = sizeof(T);
170 return ptr;
171 }
172
173 // This will copy-constuct an object T into memory held by this object
174 // returns: a unique_ptr<T, destruct_delete> that will call the
175 // destructor on the type when it goes out of scope.
176 //
177 // Note: This will not work with USM device memory
178 template <typename T>
copy_construct_from(const T & t)179 std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
180 static_assert(kind != sycl::usm::alloc::device,
181 "Cannot copy construct into USM device memory");
182
183 reserve(sizeof(T));
184
185 std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
186 m_size = sizeof(T);
187 return ptr;
188 }
189
190 public:
191 // Performs either memcpy (for USM device memory) and returns a T*
192 // (but is technically UB when dereferenced on an object that is not
193 // an implicit-lifetime nor trivially-copyable type
194 //
195 // or
196 //
197 // performs copy construction (for other USM memory types) and returns a
198 // unique_ptr<T, ...>
199 template <typename T>
copy_from(const T & t)200 std::unique_ptr<T, Deleter> copy_from(const T& t) {
201 if constexpr (sycl::usm::alloc::device == kind)
202 return memcpy_from(t);
203 else
204 return copy_construct_from(t);
205 }
206
207 private:
208 // Returns a reference to t (helpful when debugging)
209 template <typename T>
memcpy_to(T & t)210 T& memcpy_to(T& t) {
211 assert(sizeof(T) == m_size);
212
213 sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
214 fence(memcopied);
215
216 return t;
217 }
218
219 // Returns a reference to t (helpful when debugging)
220 template <typename T>
move_assign_to(T & t)221 T& move_assign_to(T& t) {
222 static_assert(kind != sycl::usm::alloc::device,
223 "Cannot move_assign_to from USM device memory");
224
225 assert(sizeof(T) == m_size);
226
227 t = std::move(*static_cast<T*>(m_data));
228
229 return t;
230 }
231
232 public:
233 // Returns a reference to t (helpful when debugging)
234 template <typename T>
transfer_to(T & t)235 T& transfer_to(T& t) {
236 if constexpr (sycl::usm::alloc::device == kind)
237 return memcpy_to(t);
238 else
239 return move_assign_to(t);
240 }
241
242 private:
243 // USMObjectMem class invariants
244 // All four expressions below must evaluate to true:
245 //
246 // !m_data == !m_capacity
247 // m_q || !m_data
248 // m_data || !m_size
249 // m_size <= m_capacity
250 //
251 // The above invariants mean that:
252 // if m_size != 0 then m_data != 0
253 // if m_data != 0 then m_capacity != 0 && m_q != nullopt
254 // if m_data == 0 then m_capacity == 0
255
256 std::optional<sycl::queue> m_q;
257 void* m_data = nullptr;
258 size_t m_size = 0; // sizeof(T) iff m_data points to live T
259 size_t m_capacity = 0;
260 };
261
262 // An indirect kernel is one where the functor to be executed is explicitly
263 // copied to USM device memory before being executed, to get around the
264 // trivially copyable limitation of SYCL.
265 using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
266 IndirectKernelMem m_indirectKernelMem;
267
268 using IndirectReducerMem = USMObjectMem<sycl::usm::alloc::shared>;
269 IndirectReducerMem m_indirectReducerMem;
270
271 bool was_finalized = false;
272
273 static SYCLInternal& singleton();
274
275 int verify_is_initialized(const char* const label) const;
276
277 void initialize(const sycl::device& d);
278
279 void initialize(const sycl::queue& q);
280
is_initialized() const281 int is_initialized() const { return m_queue.has_value(); }
282
283 void finalize();
284
285 private:
286 // fence(...) takes any type with a .wait_and_throw() method
287 // (sycl::event and sycl::queue)
288 template <typename WAT>
fence_helper(WAT & wat)289 static void fence_helper(WAT& wat) {
290 try {
291 wat.wait_and_throw();
292 } catch (sycl::exception const& e) {
293 Kokkos::Impl::throw_runtime_exception(
294 std::string("There was a synchronous SYCL error:\n") += e.what());
295 }
296 }
297
298 public:
fence(sycl::queue & q)299 static void fence(sycl::queue& q) { fence_helper(q); }
fence(sycl::event & e)300 static void fence(sycl::event& e) { fence_helper(e); }
301 };
302
303 template <typename Functor, typename Storage,
304 bool is_memcpyable = std::is_trivially_copyable_v<Functor>>
305 class SYCLFunctionWrapper;
306
307 template <typename Functor, typename Storage>
308 class SYCLFunctionWrapper<Functor, Storage, true> {
309 const Functor& m_functor;
310
311 public:
SYCLFunctionWrapper(const Functor & functor,Storage &)312 SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
313
get_functor() const314 const Functor& get_functor() const { return m_functor; }
315 };
316
317 template <typename Functor, typename Storage>
318 class SYCLFunctionWrapper<Functor, Storage, false> {
319 std::unique_ptr<Functor,
320 Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
321 m_kernelFunctorPtr;
322
323 public:
SYCLFunctionWrapper(const Functor & functor,Storage & storage)324 SYCLFunctionWrapper(const Functor& functor, Storage& storage)
325 : m_kernelFunctorPtr(storage.copy_from(functor)) {}
326
get_functor() const327 std::reference_wrapper<const Functor> get_functor() const {
328 return {*m_kernelFunctorPtr};
329 }
330 };
331
332 template <typename Functor, typename Storage>
make_sycl_function_wrapper(const Functor & functor,Storage & storage)333 auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) {
334 return SYCLFunctionWrapper<Functor, Storage>(functor, storage);
335 }
336 } // namespace Impl
337 } // namespace Experimental
338 } // namespace Kokkos
339 #endif
340