1 /* 2 //@HEADER 3 // ************************************************************************ 4 // 5 // Kokkos v. 3.0 6 // Copyright (2020) National Technology & Engineering 7 // Solutions of Sandia, LLC (NTESS). 8 // 9 // Under the terms of Contract DE-NA0003525 with NTESS, 10 // the U.S. Government retains certain rights in this software. 11 // 12 // Redistribution and use in source and binary forms, with or without 13 // modification, are permitted provided that the following conditions are 14 // met: 15 // 16 // 1. Redistributions of source code must retain the above copyright 17 // notice, this list of conditions and the following disclaimer. 18 // 19 // 2. Redistributions in binary form must reproduce the above copyright 20 // notice, this list of conditions and the following disclaimer in the 21 // documentation and/or other materials provided with the distribution. 22 // 23 // 3. Neither the name of the Corporation nor the names of the 24 // contributors may be used to endorse or promote products derived from 25 // this software without specific prior written permission. 26 // 27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY 28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE 31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 // 39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 40 // 41 // ************************************************************************ 42 //@HEADER 43 */ 44 45 // Experimental unified task-data parallel manycore LDRD 46 47 #ifndef KOKKOS_IMPL_LOCKFREEDEQUE_HPP 48 #define KOKKOS_IMPL_LOCKFREEDEQUE_HPP 49 50 #include <Kokkos_Macros.hpp> 51 #ifdef KOKKOS_ENABLE_TASKDAG 52 53 #include <Kokkos_Core_fwd.hpp> 54 55 #include <Kokkos_PointerOwnership.hpp> 56 #include <impl/Kokkos_OptionalRef.hpp> 57 #include <impl/Kokkos_Error.hpp> // KOKKOS_EXPECTS 58 #include <impl/Kokkos_LinkedListNode.hpp> // KOKKOS_EXPECTS 59 60 #include <Kokkos_Atomic.hpp> // atomic_compare_exchange, atomic_fence 61 #include "Kokkos_LIFO.hpp" 62 63 //---------------------------------------------------------------------------- 64 //---------------------------------------------------------------------------- 65 66 namespace Kokkos { 67 namespace Impl { 68 69 //---------------------------------------------------------------------------- 70 //---------------------------------------------------------------------------- 71 72 template <class NodeType, size_t CircularBufferSize, class SizeType = size_t> 73 struct fixed_size_circular_buffer { 74 public: 75 using node_type = NodeType; 76 using size_type = SizeType; 77 78 private: 79 node_type* m_buffer[CircularBufferSize] = {nullptr}; 80 81 public: 82 fixed_size_circular_buffer() = default; 83 fixed_size_circular_buffer(fixed_size_circular_buffer const&) = delete; 84 fixed_size_circular_buffer(fixed_size_circular_buffer&&) = default; 85 fixed_size_circular_buffer& operator=(fixed_size_circular_buffer const&) = 86 delete; 87 fixed_size_circular_buffer& operator=(fixed_size_circular_buffer&&) = default; 88 ~fixed_size_circular_buffer() = default; 89 90 KOKKOS_FORCEINLINE_FUNCTION sizeKokkos::Impl::fixed_size_circular_buffer91 static constexpr size_type size() noexcept { 92 return size_type(CircularBufferSize); 93 } 94 95 KOKKOS_FORCEINLINE_FUNCTION operator []Kokkos::Impl::fixed_size_circular_buffer96 node_type* operator[](size_type idx) const noexcept { 97 return m_buffer[idx % size()]; 98 } 99 100 KOKKOS_FORCEINLINE_FUNCTION operator []Kokkos::Impl::fixed_size_circular_buffer101 node_type*& operator[](size_type idx) noexcept { 102 return m_buffer[idx % size()]; 103 } 104 }; 105 106 template <class NodeType, class SizeType = size_t> 107 struct non_owning_variable_size_circular_buffer { 108 public: 109 using node_type = NodeType; 110 using size_type = SizeType; 111 112 private: 113 ObservingRawPtr<node_type*> m_buffer = nullptr; 114 size_type m_size = 0; 115 116 public: 117 KOKKOS_INLINE_FUNCTION non_owning_variable_size_circular_bufferKokkos::Impl::non_owning_variable_size_circular_buffer118 non_owning_variable_size_circular_buffer(ObservingRawPtr<node_type*> buffer, 119 size_type arg_size) noexcept 120 : m_buffer(buffer), m_size(arg_size) {} 121 122 non_owning_variable_size_circular_buffer() = default; 123 non_owning_variable_size_circular_buffer( 124 non_owning_variable_size_circular_buffer const&) = delete; 125 non_owning_variable_size_circular_buffer( 126 non_owning_variable_size_circular_buffer&&) = default; 127 non_owning_variable_size_circular_buffer& operator =( 128 non_owning_variable_size_circular_buffer const&) = delete; 129 non_owning_variable_size_circular_buffer& operator =( 130 non_owning_variable_size_circular_buffer&&) = default; 131 ~non_owning_variable_size_circular_buffer() = default; 132 133 KOKKOS_FORCEINLINE_FUNCTION sizeKokkos::Impl::non_owning_variable_size_circular_buffer134 constexpr size_type size() const noexcept { return m_size; } 135 136 KOKKOS_FORCEINLINE_FUNCTION operator []Kokkos::Impl::non_owning_variable_size_circular_buffer137 node_type* operator[](size_type idx) const noexcept { 138 return m_buffer[idx % size()]; 139 } 140 141 KOKKOS_FORCEINLINE_FUNCTION operator []Kokkos::Impl::non_owning_variable_size_circular_buffer142 node_type*& operator[](size_type idx) noexcept { 143 return m_buffer[idx % size()]; 144 } 145 }; 146 147 /** Based on "Correct and Efficient Work-Stealing for Weak Memory Models," 148 * PPoPP '13, https://www.di.ens.fr/~zappa/readings/ppopp13.pdf 149 * 150 */ 151 template <class T, class CircularBufferT, class SizeType = int32_t> 152 struct ChaseLevDeque { 153 public: 154 using size_type = SizeType; 155 using value_type = T; 156 // Still using intrusive linked list for waiting queue 157 using node_type = SimpleSinglyLinkedListNode<>; 158 159 private: 160 // TODO @tasking @new_feature DSH variable size circular buffer? 161 162 CircularBufferT m_array; 163 size_type m_top = 0; 164 size_type m_bottom = 0; 165 166 public: 167 template <class _ignore = void, 168 class = typename std::enable_if< 169 std::is_default_constructible<CircularBufferT>::value>::type> ChaseLevDequeKokkos::Impl::ChaseLevDeque170 ChaseLevDeque() : m_array() {} 171 ChaseLevDequeKokkos::Impl::ChaseLevDeque172 explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} 173 174 KOKKOS_INLINE_FUNCTION emptyKokkos::Impl::ChaseLevDeque175 bool empty() const { 176 // TODO @tasking @memory_order DSH memory order 177 return m_top > m_bottom - 1; 178 } 179 180 KOKKOS_INLINE_FUNCTION popKokkos::Impl::ChaseLevDeque181 OptionalRef<T> pop() { 182 auto b = m_bottom - 1; // atomic load relaxed 183 auto& a = m_array; // atomic load relaxed 184 m_bottom = b; // atomic store relaxed 185 Kokkos::memory_fence(); // memory order seq_cst 186 auto t = m_top; // atomic load relaxed 187 OptionalRef<T> return_value; 188 if (t <= b) { 189 /* non-empty queue */ 190 return_value = *static_cast<T*>(a[b]); // relaxed load 191 if (t == b) { 192 /* single last element in the queue. */ 193 #ifdef _WIN32 194 Kokkos::memory_fence(); 195 bool const success = 196 Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); 197 Kokkos::memory_fence(); 198 if (!success) { 199 return_value = nullptr; 200 } 201 #else 202 if (!Impl::atomic_compare_exchange_strong( 203 &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) { 204 /* failed race, someone else stole it */ 205 return_value = nullptr; 206 } 207 #endif 208 m_bottom = b + 1; // memory order relaxed 209 } 210 } else { 211 /* empty queue */ 212 m_bottom = b + 1; // memory order relaxed 213 } 214 return return_value; 215 } 216 217 KOKKOS_INLINE_FUNCTION pushKokkos::Impl::ChaseLevDeque218 bool push(node_type&& node) { 219 // Just forward to the lvalue version 220 return push(node); 221 } 222 223 KOKKOS_INLINE_FUNCTION pushKokkos::Impl::ChaseLevDeque224 bool push(node_type& node) { 225 auto b = m_bottom; // memory order relaxed 226 auto t = Impl::atomic_load(&m_top, memory_order_acquire); 227 auto& a = m_array; 228 if (b - t > a.size() - 1) { 229 /* queue is full, resize */ 230 // m_array = a->grow(); 231 // a = m_array; 232 return false; 233 } 234 a[b] = &node; // relaxed 235 Impl::atomic_store(&m_bottom, b + 1, memory_order_release); 236 return true; 237 } 238 239 KOKKOS_INLINE_FUNCTION stealKokkos::Impl::ChaseLevDeque240 OptionalRef<T> steal() { 241 auto t = m_top; // TODO @tasking @memory_order DSH: atomic load acquire 242 Kokkos::memory_fence(); // seq_cst fence, so why does the above need to be 243 // acquire? 244 auto b = Impl::atomic_load(&m_bottom, memory_order_acquire); 245 OptionalRef<T> return_value; 246 if (t < b) { 247 /* Non-empty queue */ 248 auto& a = m_array; // TODO @tasking @memory_order DSH: technically 249 // consume ordered, but acquire should be fine 250 Kokkos::load_fence(); // TODO @tasking @memory_order DSH memory order 251 // instead of fence 252 return_value = *static_cast<T*>(a[t]); // relaxed 253 #ifdef _WIN32 254 Kokkos::memory_fence(); 255 bool const success = 256 Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); 257 Kokkos::memory_fence(); 258 if (!success) { 259 return_value = nullptr; 260 } 261 #else 262 if (!Impl::atomic_compare_exchange_strong( 263 &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) { 264 return_value = nullptr; 265 } 266 #endif 267 } 268 return return_value; 269 } 270 }; 271 272 /* 273 // The atomicity of this load was more important in the paper's version 274 // because that version had a circular buffer that could grow. We're 275 // essentially using the memory order in this version as a fence, which 276 // may be unnecessary 277 auto buffer_ptr = (node_type***)&m_array.buffer; 278 auto a = Impl::atomic_load(buffer_ptr, memory_order_acquire); // 279 technically consume ordered, but acquire should be fine return_value = 280 *static_cast<T*>(a[t % m_array->size]); // relaxed; we'd have to replace the 281 m_array->size if we ever allow growth 282 */ 283 284 //---------------------------------------------------------------------------- 285 //---------------------------------------------------------------------------- 286 287 template <size_t CircularBufferSize> 288 struct TaskQueueTraitsChaseLev { 289 template <class Task> 290 using ready_queue_type = 291 ChaseLevDeque<Task, 292 fixed_size_circular_buffer<SimpleSinglyLinkedListNode<>, 293 CircularBufferSize, int32_t>, 294 int32_t>; 295 296 template <class Task> 297 using waiting_queue_type = SingleConsumeOperationLIFO<Task>; 298 299 template <class Task> 300 using intrusive_task_base_type = typename ready_queue_type<Task>::node_type; 301 302 static constexpr auto ready_queue_insertion_may_fail = true; 303 }; 304 305 } // end namespace Impl 306 } // end namespace Kokkos 307 308 //---------------------------------------------------------------------------- 309 //---------------------------------------------------------------------------- 310 311 #endif /* defined KOKKOS_ENABLE_TASKDAG */ 312 #endif /* #ifndef KOKKOS_IMPL_LOCKFREEDEQUE_HPP */ 313