1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
10 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
11 
12 #include <__algorithm/lower_bound.h>
13 #include <__algorithm/max.h>
14 #include <__algorithm/upper_bound.h>
15 #include <__atomic/atomic.h>
16 #include <__config>
17 #include <__exception/terminate.h>
18 #include <__iterator/iterator_traits.h>
19 #include <__iterator/move_iterator.h>
20 #include <__memory/allocator.h>
21 #include <__memory/construct_at.h>
22 #include <__memory/unique_ptr.h>
23 #include <__numeric/reduce.h>
24 #include <__utility/exception_guard.h>
25 #include <__utility/move.h>
26 #include <__utility/pair.h>
27 #include <__utility/terminate_on_exception.h>
28 #include <cstddef>
29 #include <new>
30 
31 _LIBCPP_PUSH_MACROS
32 #include <__undef_macros>
33 
34 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
35 
36 _LIBCPP_BEGIN_NAMESPACE_STD
37 
38 namespace __par_backend {
39 inline namespace __libdispatch {
40 
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
//
// Invokes __func(__context, __chunk) for every __chunk in [0, __chunk_count). Defined in the
// built library so this header does not need the libdispatch headers.
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
46 
47 template <class _Func>
48 _LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
49   __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
50     (*static_cast<_Func*>(__context))(__chunk);
51   });
52 }
53 
// Describes how a range is split into chunks for parallel dispatch. Returned by the exported
// __partition_chunks, so the layout is part of the dylib interface — do not reorder fields.
struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;  // size of every chunk except possibly the first
  ptrdiff_t __first_chunk_size_; // only the first chunk may have a different size (see the offset
                                 // computation in __parallel_for / __parallel_transform_reduce)
};
59 
// Decide how to split a range of __size elements into chunks. [[__gnu__::__const__]]: the result
// depends only on the argument, allowing the compiler to fold repeated calls.
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size);
61 
62 template <class _RandomAccessIterator, class _Functor>
63 _LIBCPP_HIDE_FROM_ABI void
64 __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
65   auto __partitions = __libdispatch::__partition_chunks(__last - __first);
66 
67   // Perform the chunked execution.
68   __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
69     auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
70     auto __index =
71         __chunk == 0
72             ? 0
73             : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
74     __func(__first + __index, __first + __index + __this_chunk_size);
75   });
76 }
77 
// A triple of positions marking one boundary between adjacent merge sub-ranges: a position in
// each input sequence plus the matching position in the output. The explicit constructor lets
// instances be created through std::__construct_at in C++17, where aggregates cannot be
// initialized with parentheses.
template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
struct __merge_range {
  __merge_range(_RandomAccessIterator1 __m1, _RandomAccessIterator2 __m2, _RandomAccessIteratorOut __out)
      : __mid1_(__m1), __mid2_(__m2), __result_(__out) {}

  _RandomAccessIterator1 __mid1_;
  _RandomAccessIterator2 __mid2_;
  _RandomAccessIteratorOut __result_;
};
87 
// Merge two sorted ranges in parallel: cut the inputs into independent sub-ranges whose merged
// outputs are disjoint, then run one __leaf_merge per sub-range concurrently.
template <typename _RandomAccessIterator1,
          typename _RandomAccessIterator2,
          typename _RandomAccessIterator3,
          typename _Compare,
          typename _LeafMerge>
_LIBCPP_HIDE_FROM_ABI void __parallel_merge(
    _RandomAccessIterator1 __first1,
    _RandomAccessIterator1 __last1,
    _RandomAccessIterator2 __first2,
    _RandomAccessIterator2 __last2,
    _RandomAccessIterator3 __result,
    _Compare __comp,
    _LeafMerge __leaf_merge) {
  // Chunk based on the larger input; the smaller one gets split at matching positions below.
  __chunk_partitions __partitions =
      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

  if (__partitions.__chunk_count_ == 0)
    return; // nothing to merge

  if (__partitions.__chunk_count_ == 1) {
    // A single chunk isn't worth dispatching — merge serially.
    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
    return;
  }

  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
  // Fence-post: N chunks need N + 1 boundaries. __ranges[i] holds the iterators where chunk i
  // starts and __ranges[i + 1] where it ends.
  auto const __n_ranges = __partitions.__chunk_count_ + 1;

  // TODO: use __uninitialized_buffer
  // Raw allocation: the boundaries are constructed one by one below, then destroyed and
  // deallocated together by the unique_ptr's custom deleter.
  auto __destroy = [=](__merge_range_t* __ptr) {
    std::destroy_n(__ptr, __n_ranges);
    std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
  };
  unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
      std::allocator<__merge_range_t>().allocate(__n_ranges), __destroy);

  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
  std::__terminate_on_exception([&] {
    __merge_range_t* __r = __ranges.get();
    // First boundary: the beginning of both inputs and of the output.
    std::__construct_at(__r++, __first1, __first2, __result);

    // Walk the larger input in fixed-size chunks; the split point in the smaller input is
    // found by binary search so the per-chunk merges stay independent.
    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    // Consume one chunk: advance __first1/__first2/__result past it (they are captured by
    // reference and mutated) and return the resulting boundary. __lower_bound on the last
    // element of the chunk finds the matching cut in the other sequence.
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      // Advance the output past everything this chunk will produce, so the stored boundary
      // points at where the *next* chunk's output begins.
      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    // Merge all sub-ranges concurrently; each writes to a disjoint part of the output.
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });
  });
}
172 
// Parallel transform-reduce: every chunk is reduced to a single _Value concurrently, then the
// per-chunk results are combined serially (together with __init) via std::reduce.
template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
_LIBCPP_HIDE_FROM_ABI _Value __parallel_transform_reduce(
    _RandomAccessIterator __first,
    _RandomAccessIterator __last,
    _Transform __transform,
    _Value __init,
    _Combiner __combiner,
    _Reduction __reduction) {
  if (__first == __last)
    return __init;

  auto __partitions = __libdispatch::__partition_chunks(__last - __first);

  // Custom deleter: destroy the per-chunk results, then release the raw allocation.
  auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
    std::destroy_n(__ptr, __count);
    std::allocator<_Value>().deallocate(__ptr, __count);
  };

  // TODO: use __uninitialized_buffer
  // TODO: allocate one element per worker instead of one element per chunk
  unique_ptr<_Value[], decltype(__destroy)> __values(
      std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

  // __dispatch_apply is noexcept
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    // Only the first chunk may have a different size; every other chunk lives at a fixed
    // offset shifted by the difference between the first chunk's size and the common size.
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    if (__this_chunk_size != 1) {
      // __reduction needs a seed of type _Value: build it by transforming and combining the
      // first two elements, then reduce the remainder of the chunk on top of that seed.
      std::__construct_at(
          __values.get() + __chunk,
          __reduction(__first + __index + 2,
                      __first + __index + __this_chunk_size,
                      __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
    } else {
      // Single-element chunk: the transformed element is the chunk's result as-is.
      std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
    }
  });

  // Serial final pass over the per-chunk results. Moving out of them is fine — the buffer is
  // destroyed when __values goes out of scope.
  return std::__terminate_on_exception([&] {
    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  });
}
222 
// TODO: parallelize this
// For now the stable sort runs serially by delegating straight to __leaf_sort.
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
_LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort(
    _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
  __leaf_sort(__first, __last, __comp);
}
229 
// Cancellation hook for the PSTL backend interface — this backend has no cancellation
// support, so it is intentionally a no-op.
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
231 
232 } // namespace __libdispatch
233 } // namespace __par_backend
234 
235 _LIBCPP_END_NAMESPACE_STD
236 
237 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
238 
239 _LIBCPP_POP_MACROS
240 
241 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
242