1// -*- C++ -*-
2//===--------------------------- barrier ----------------------------------===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9
10#ifndef _LIBCPP_BARRIER
11#define _LIBCPP_BARRIER
12
13/*
14    barrier synopsis
15
16namespace std
17{
18
19  template<class CompletionFunction = see below>
20  class barrier
21  {
22  public:
23    using arrival_token = see below;
24
25    static constexpr ptrdiff_t max() noexcept;
26
27    constexpr explicit barrier(ptrdiff_t phase_count,
28                               CompletionFunction f = CompletionFunction());
29    ~barrier();
30
31    barrier(const barrier&) = delete;
32    barrier& operator=(const barrier&) = delete;
33
34    [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1);
35    void wait(arrival_token&& arrival) const;
36
37    void arrive_and_wait();
38    void arrive_and_drop();
39
40  private:
41    CompletionFunction completion; // exposition only
42  };
43
44}
45
46*/
47
48#include <__config>
49#include <__availability>
50#include <atomic>
51#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
52# include <memory>
53#endif
54
55#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
56#pragma GCC system_header
57#endif
58
59#ifdef _LIBCPP_HAS_NO_THREADS
60# error <barrier> is not supported on this single threaded system
61#endif
62
63_LIBCPP_PUSH_MACROS
64#include <__undef_macros>
65
66#if _LIBCPP_STD_VER >= 14
67
68_LIBCPP_BEGIN_NAMESPACE_STD
69
70struct __empty_completion
71{
72    inline _LIBCPP_INLINE_VISIBILITY
73    void operator()() noexcept
74    {
75    }
76};
77
78#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
79
80/*
81
82The default implementation of __barrier_base is a classic tree barrier.
83
84It looks different from literature pseudocode for two main reasons:
85 1. Threads that call into std::barrier functions do not provide indices,
86    so a numbering step is added before the actual barrier algorithm,
87    appearing as an N+1 round to the N rounds of the tree barrier.
88 2. A great deal of attention has been paid to avoid cache line thrashing
89    by flattening the tree structure into cache-line sized arrays, that
90    are indexed in an efficient way.
91
92*/
93
94using __barrier_phase_t = uint8_t;
95
96class __barrier_algorithm_base;
97
98_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
99__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected);
100
101_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
102bool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier,
103                                     __barrier_phase_t __old_phase);
104
105_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
106void __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier);
107
108template<class _CompletionF>
109class __barrier_base {
110
111    ptrdiff_t                                               __expected;
112    unique_ptr<__barrier_algorithm_base,
113               void (*)(__barrier_algorithm_base*)>         __base;
114    __atomic_base<ptrdiff_t>                                __expected_adjustment;
115    _CompletionF                                            __completion;
116    __atomic_base<__barrier_phase_t>                        __phase;
117
118public:
119    using arrival_token = __barrier_phase_t;
120
121    static constexpr ptrdiff_t max() noexcept {
122        return numeric_limits<ptrdiff_t>::max();
123    }
124
125    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
126    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
127            : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected),
128                                             &__destroy_barrier_algorithm_base),
129              __expected_adjustment(0), __completion(move(__completion)), __phase(0)
130    {
131    }
132    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
133    arrival_token arrive(ptrdiff_t update)
134    {
135        auto const __old_phase = __phase.load(memory_order_relaxed);
136        for(; update; --update)
137            if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) {
138                __completion();
139                __expected += __expected_adjustment.load(memory_order_relaxed);
140                __expected_adjustment.store(0, memory_order_relaxed);
141                __phase.store(__old_phase + 2, memory_order_release);
142                __phase.notify_all();
143            }
144        return __old_phase;
145    }
146    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
147    void wait(arrival_token&& __old_phase) const
148    {
149        auto const __test_fn = [=]() -> bool {
150            return __phase.load(memory_order_acquire) != __old_phase;
151        };
152        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
153    }
154    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
155    void arrive_and_drop()
156    {
157        __expected_adjustment.fetch_sub(1, memory_order_relaxed);
158        (void)arrive(1);
159    }
160};
161
162#else
163
164/*
165
166The alternative implementation of __barrier_base is a central barrier.
167
168Two versions of this algorithm are provided:
169 1. A fairly straightforward implementation of the litterature for the
170    general case where the completion function is not empty.
171 2. An optimized implementation that exploits 2's complement arithmetic
172    and well-defined overflow in atomic arithmetic, to handle the phase
173    roll-over for free.
174
175*/
176
177template<class _CompletionF>
178class __barrier_base {
179
180    __atomic_base<ptrdiff_t> __expected;
181    __atomic_base<ptrdiff_t> __arrived;
182    _CompletionF             __completion;
183    __atomic_base<bool>      __phase;
184public:
185    using arrival_token = bool;
186
187    static constexpr ptrdiff_t max() noexcept {
188        return numeric_limits<ptrdiff_t>::max();
189    }
190
191    _LIBCPP_INLINE_VISIBILITY
192    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
193        : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false)
194    {
195    }
196    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
197    arrival_token arrive(ptrdiff_t update)
198    {
199        auto const __old_phase = __phase.load(memory_order_relaxed);
200        auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update;
201        auto const new_expected = __expected.load(memory_order_relaxed);
202        if(0 == __result) {
203            __completion();
204            __arrived.store(new_expected, memory_order_relaxed);
205            __phase.store(!__old_phase, memory_order_release);
206            __phase.notify_all();
207        }
208        return __old_phase;
209    }
210    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
211    void wait(arrival_token&& __old_phase) const
212    {
213        __phase.wait(__old_phase, memory_order_acquire);
214    }
215    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
216    void arrive_and_drop()
217    {
218        __expected.fetch_sub(1, memory_order_relaxed);
219        (void)arrive(1);
220    }
221};
222
223template<>
224class __barrier_base<__empty_completion> {
225
226    static constexpr uint64_t __expected_unit = 1ull;
227    static constexpr uint64_t __arrived_unit = 1ull << 32;
228    static constexpr uint64_t __expected_mask = __arrived_unit - 1;
229    static constexpr uint64_t __phase_bit = 1ull << 63;
230    static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask;
231
232    __atomic_base<uint64_t>   __phase_arrived_expected;
233
234    static _LIBCPP_INLINE_VISIBILITY
235    constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT
236    {
237        return ((uint64_t(1u << 31) - __count) << 32)
238              | (uint64_t(1u << 31) - __count);
239    }
240
241public:
242    using arrival_token = uint64_t;
243
244    static constexpr ptrdiff_t max() noexcept {
245        return ptrdiff_t(1u << 31) - 1;
246    }
247
248    _LIBCPP_INLINE_VISIBILITY
249    explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion())
250        : __phase_arrived_expected(__init(__count))
251    {
252    }
253    [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
254    arrival_token arrive(ptrdiff_t update)
255    {
256        auto const __inc = __arrived_unit * update;
257        auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel);
258        if((__old ^ (__old + __inc)) & __phase_bit) {
259            __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed);
260            __phase_arrived_expected.notify_all();
261        }
262        return __old & __phase_bit;
263    }
264    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
265    void wait(arrival_token&& __phase) const
266    {
267        auto const __test_fn = [=]() -> bool {
268            uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
269            return ((__current & __phase_bit) != __phase);
270        };
271        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
272    }
273    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
274    void arrive_and_drop()
275    {
276        __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
277        (void)arrive(1);
278    }
279};
280
281#endif //_LIBCPP_HAS_NO_TREE_BARRIER
282
283template<class _CompletionF = __empty_completion>
284class barrier {
285
286    __barrier_base<_CompletionF> __b;
287public:
288    using arrival_token = typename __barrier_base<_CompletionF>::arrival_token;
289
290    static constexpr ptrdiff_t max() noexcept {
291        return __barrier_base<_CompletionF>::max();
292    }
293
294    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
295    barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF())
296        : __b(__count, _VSTD::move(__completion)) {
297    }
298
299    barrier(barrier const&) = delete;
300    barrier& operator=(barrier const&) = delete;
301
302    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
303    arrival_token arrive(ptrdiff_t update = 1)
304    {
305        return __b.arrive(update);
306    }
307    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
308    void wait(arrival_token&& __phase) const
309    {
310        __b.wait(_VSTD::move(__phase));
311    }
312	_LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
313    void arrive_and_wait()
314    {
315        wait(arrive());
316	}
317    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
318    void arrive_and_drop()
319    {
320        __b.arrive_and_drop();
321    }
322};
323
324_LIBCPP_END_NAMESPACE_STD
325
326#endif // _LIBCPP_STD_VER >= 14
327
328_LIBCPP_POP_MACROS
329
330#endif //_LIBCPP_BARRIER
331