1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
10 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
11 
12 #include <__algorithm/inplace_merge.h>
13 #include <__algorithm/lower_bound.h>
14 #include <__algorithm/max.h>
15 #include <__algorithm/merge.h>
16 #include <__algorithm/upper_bound.h>
17 #include <__atomic/atomic.h>
18 #include <__config>
19 #include <__exception/terminate.h>
20 #include <__iterator/iterator_traits.h>
21 #include <__iterator/move_iterator.h>
22 #include <__memory/allocator.h>
23 #include <__memory/construct_at.h>
24 #include <__memory/unique_ptr.h>
25 #include <__numeric/reduce.h>
26 #include <__utility/empty.h>
27 #include <__utility/exception_guard.h>
28 #include <__utility/move.h>
29 #include <__utility/pair.h>
30 #include <cstddef>
31 #include <new>
32 #include <optional>
33 
34 _LIBCPP_PUSH_MACROS
35 #include <__undef_macros>
36 
37 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
38 
39 _LIBCPP_BEGIN_NAMESPACE_STD
40 
41 namespace __par_backend {
42 inline namespace __libdispatch {
43 
44 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
45 // we.
46 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
47 _LIBCPP_EXPORTED_FROM_ABI void
48 __dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
49 
50 template <class _Func>
51 _LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
52   __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
53     (*static_cast<_Func*>(__context))(__chunk);
54   });
55 }
56 
// Describes how a range is split into chunks for parallel dispatch: one
// first chunk (which may differ in size from the others) followed by
// equally-sized chunks. Produced by __partition_chunks below.
struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;  // size of every chunk except the first
  ptrdiff_t __first_chunk_size_; // size of chunk 0; presumably >= __chunk_size_ (absorbs the remainder) -- confirm against __partition_chunks
};
62 
63 [[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
64 
65 template <class _RandomAccessIterator, class _Functor>
66 _LIBCPP_HIDE_FROM_ABI optional<__empty>
67 __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
68   // Perform the chunked execution.
69   __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
70     auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
71     auto __index =
72         __chunk == 0
73             ? 0
74             : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
75     __func(__first + __index, __first + __index + __this_chunk_size);
76   });
77 
78   return __empty{};
79 }
80 
81 template <class _RandomAccessIterator, class _Functor>
82 _LIBCPP_HIDE_FROM_ABI optional<__empty>
83 __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
84   return __libdispatch::__dispatch_parallel_for(
85       __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
86 }
87 
// A triple of cursors (one into each input range, one into the output range)
// marking where one parallel merge chunk ends and the next begins. An array
// of these acts as the fence posts between the chunks merged in parallel by
// __parallel_merge below.
template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
struct __merge_range {
  __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
      : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

  _RandomAccessIterator1 __mid1_;   // boundary within the first input range
  _RandomAccessIterator2 __mid2_;   // boundary within the second input range
  _RandomAccessIteratorOut __result_; // output position for the chunk that starts here
};
97 
// Merges the sorted ranges [__first1, __last1) and [__first2, __last2) into
// __result in parallel: the inputs are cut into independent chunk pairs via
// binary search, and __leaf_merge merges each pair concurrently.
// Returns nullopt only if the scratch allocation for the chunk boundaries
// fails; otherwise returns __empty{}. Exceptions from __comp/__leaf_merge do
// not propagate (the function is noexcept, and __dispatch_apply is too).
template <typename _RandomAccessIterator1,
          typename _RandomAccessIterator2,
          typename _RandomAccessIterator3,
          typename _Compare,
          typename _LeafMerge>
_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
    _RandomAccessIterator1 __first1,
    _RandomAccessIterator1 __last1,
    _RandomAccessIterator2 __first2,
    _RandomAccessIterator2 __last2,
    _RandomAccessIterator3 __result,
    _Compare __comp,
    _LeafMerge __leaf_merge) noexcept {
  // The chunk count is driven by the larger of the two inputs.
  __chunk_partitions __partitions =
      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

  // Both inputs empty: nothing to do.
  if (__partitions.__chunk_count_ == 0)
    return __empty{};

  // Too small to be worth parallelizing: single serial leaf merge.
  if (__partitions.__chunk_count_ == 1) {
    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
    return __empty{};
  }

  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
  // N chunks need N + 1 boundaries (fence posts).
  auto const __n_ranges = __partitions.__chunk_count_ + 1;

  // TODO: use __uninitialized_buffer
  // Deleter for the boundary buffer: destroys all __n_ranges elements, then
  // frees the storage. All elements are constructed before the parallel phase
  // below, so destroying the full count is safe on the success path.
  auto __destroy = [=](__merge_range_t* __ptr) {
    std::destroy_n(__ptr, __n_ranges);
    std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
  };

  // Allocate the boundary buffer, translating bad_alloc into a null pointer
  // so the failure can be reported as nullopt instead of an exception.
  unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
      [&]() -> __merge_range_t* {
#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
        try {
#  endif
          return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
        } catch (const std::bad_alloc&) {
          return nullptr;
        }
#  endif
      }(),
      __destroy);

  if (!__ranges)
    return nullopt;

  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
  __merge_range_t* __r = __ranges.get();
  // First fence post: the very beginning of both inputs and of the output.
  std::__construct_at(__r++, __first1, __first2, __result);

  // Take fixed-size chunks out of the longer range and binary-search the
  // matching split point in the shorter one.
  bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

  // Advances __first1/__first2/__result past one chunk and returns the new
  // boundary. Stateful: each call consumes __chunk_size elements of the
  // iterated range, so calls must happen in order.
  auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
    auto [__mid1, __mid2] = [&] {
      if (__iterate_first_range) {
        auto __m1 = __first1 + __chunk_size;
        // Split the other range before the first element not less than the
        // last element of this chunk, so the chunk pairs merge independently.
        auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
        return std::make_pair(__m1, __m2);
      } else {
        auto __m2 = __first2 + __chunk_size;
        auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
        return std::make_pair(__m1, __m2);
      }
    }();

    // The output cursor advances by the combined size of both sub-chunks.
    __result += (__mid1 - __first1) + (__mid2 - __first2);
    __first1 = __mid1;
    __first2 = __mid2;
    return {std::move(__mid1), std::move(__mid2), __result};
  };

  // handle first chunk
  std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

  // handle 2 -> N - 1 chunks
  for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
    std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

  // handle last chunk
  std::__construct_at(__r, __last1, __last2, __result);

  // Merge each pair of adjacent fence posts concurrently.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
    auto __first_iters = __ranges[__index];
    auto __last_iters  = __ranges[__index + 1];
    __leaf_merge(
        __first_iters.__mid1_,
        __last_iters.__mid1_,
        __first_iters.__mid2_,
        __last_iters.__mid2_,
        __first_iters.__result_,
        __comp);
  });

  return __empty{};
}
197 
198 template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
199 _LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce(
200     _RandomAccessIterator __first,
201     _RandomAccessIterator __last,
202     _Transform __transform,
203     _Value __init,
204     _Combiner __combiner,
205     _Reduction __reduction) {
206   if (__first == __last)
207     return __init;
208 
209   auto __partitions = __libdispatch::__partition_chunks(__last - __first);
210 
211   auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
212     std::destroy_n(__ptr, __count);
213     std::allocator<_Value>().deallocate(__ptr, __count);
214   };
215 
216   // TODO: use __uninitialized_buffer
217   // TODO: allocate one element per worker instead of one element per chunk
218   unique_ptr<_Value[], decltype(__destroy)> __values(
219       std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
220 
221   // __dispatch_apply is noexcept
222   __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
223     auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
224     auto __index =
225         __chunk == 0
226             ? 0
227             : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
228     if (__this_chunk_size != 1) {
229       std::__construct_at(
230           __values.get() + __chunk,
231           __reduction(__first + __index + 2,
232                       __first + __index + __this_chunk_size,
233                       __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
234     } else {
235       std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
236     }
237   });
238 
239   return std::reduce(
240       std::make_move_iterator(__values.get()),
241       std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
242       std::move(__init),
243       __combiner);
244 }
245 
// Stable sort of [__first, __last): sorts each chunk in parallel with
// __leaf_sort, then performs bottom-up parallel merge passes, ping-ponging
// the elements between the input range and a scratch buffer of equal size.
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
    _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
  const auto __size = __last - __first;
  auto __partitions = __libdispatch::__partition_chunks(__size);

  // Empty input: nothing to sort.
  if (__partitions.__chunk_count_ == 0)
    return __empty{};

  // Too small to be worth parallelizing: single serial sort.
  if (__partitions.__chunk_count_ == 1) {
    __leaf_sort(__first, __last, __comp);
    return __empty{};
  }

  using _Value = __iter_value_type<_RandomAccessIterator>;

  // Deleter for the scratch buffer: all __size elements are constructed
  // below before any merge pass runs, so destroying the full count is safe.
  auto __destroy = [__size](_Value* __ptr) {
    std::destroy_n(__ptr, __size);
    std::allocator<_Value>().deallocate(__ptr, __size);
  };

  // TODO: use __uninitialized_buffer
  // NOTE(review): unlike __parallel_merge, this allocation is not guarded
  // against bad_alloc, so an allocation failure propagates as an exception
  // instead of returning nullopt -- confirm whether that is intended.
  unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

  // Initialize all elements to a moved-from state
  // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
  // Chain-move *__first through the whole buffer: each buffer element ends up
  // constructed (in a moved-from state), and the original value of *__first
  // survives the chain and is moved back into place on the last line.
  std::__construct_at(__values.get(), std::move(*__first));
  for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
    std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
  }
  *__first = std::move(__values.get()[__size - 1]);

  // Phase 1: sort every chunk independently, in parallel.
  __libdispatch::__dispatch_parallel_for(
      __partitions,
      __first,
      [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
        __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
      });

  // Phase 2: bottom-up merge passes. Each pass merges adjacent chunk pairs
  // from one storage into the other and halves the chunk count; the flag
  // tracks which storage currently holds the elements.
  bool __objects_are_in_buffer = false;
  do {
    const auto __old_chunk_size = __partitions.__chunk_size_;
    if (__partitions.__chunk_count_ % 2 == 1) {
      // Odd chunk count: merge the second chunk into the first in place so
      // the remaining chunks pair up evenly for this pass.
      auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
        std::inplace_merge(
            __first_chunk_begin,
            __first_chunk_begin + __partitions.__first_chunk_size_,
            __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
            __comp);
      };
      if (__objects_are_in_buffer)
        __inplace_merge_chunks(__values.get());
      else
        __inplace_merge_chunks(__first);
      // The new first chunk spans the merged pair plus the next chunk, which
      // the parallel merge below will join with it.
      __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
    } else {
      // Even chunk count: the first chunk simply pairs with its neighbor.
      __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
    }

    __partitions.__chunk_size_ *= 2;
    __partitions.__chunk_count_ /= 2;

    // Each new chunk consists of two sorted sub-chunks; the boundary between
    // them sits __old_chunk_size elements before the chunk's end.
    auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
      __libdispatch::__dispatch_parallel_for(
          __partitions,
          __from_first,
          [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
            std::merge(std::make_move_iterator(__chunk_first),
                       std::make_move_iterator(__chunk_last - __old_chunk_size),
                       std::make_move_iterator(__chunk_last - __old_chunk_size),
                       std::make_move_iterator(__chunk_last),
                       __to_first + (__chunk_first - __from_first),
                       __comp);
          });
    };

    if (__objects_are_in_buffer)
      __merge_chunks(__values.get(), __first);
    else
      __merge_chunks(__first, __values.get());
    __objects_are_in_buffer = !__objects_are_in_buffer;
  } while (__partitions.__chunk_count_ > 1);

  // If the final pass left the sorted elements in the scratch buffer, move
  // them back into the input range.
  if (__objects_are_in_buffer) {
    std::move(__values.get(), __values.get() + __size, __first);
  }

  return __empty{};
}
335 
336 _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
337 
338 } // namespace __libdispatch
339 } // namespace __par_backend
340 
341 _LIBCPP_END_NAMESPACE_STD
342 
343 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
344 
345 _LIBCPP_POP_MACROS
346 
347 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
348