1 //===----------------------------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 10 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 11 12 #include <__algorithm/inplace_merge.h> 13 #include <__algorithm/lower_bound.h> 14 #include <__algorithm/max.h> 15 #include <__algorithm/merge.h> 16 #include <__algorithm/upper_bound.h> 17 #include <__atomic/atomic.h> 18 #include <__config> 19 #include <__exception/terminate.h> 20 #include <__iterator/iterator_traits.h> 21 #include <__iterator/move_iterator.h> 22 #include <__memory/allocator.h> 23 #include <__memory/construct_at.h> 24 #include <__memory/unique_ptr.h> 25 #include <__numeric/reduce.h> 26 #include <__utility/empty.h> 27 #include <__utility/exception_guard.h> 28 #include <__utility/move.h> 29 #include <__utility/pair.h> 30 #include <cstddef> 31 #include <new> 32 #include <optional> 33 34 _LIBCPP_PUSH_MACROS 35 #include <__undef_macros> 36 37 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 38 39 _LIBCPP_BEGIN_NAMESPACE_STD 40 41 namespace __par_backend { 42 inline namespace __libdispatch { 43 44 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do 45 // we. 46 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]? 47 _LIBCPP_EXPORTED_FROM_ABI void 48 __dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept; 49 50 template <class _Func> 51 _LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept { 52 __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) { 53 (*static_cast<_Func*>(__context))(__chunk); 54 }); 55 } 56 57 struct __chunk_partitions { 58 ptrdiff_t __chunk_count_; // includes the first chunk 59 ptrdiff_t __chunk_size_; 60 ptrdiff_t __first_chunk_size_; 61 }; 62 63 [[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept; 64 65 template <class _RandomAccessIterator, class _Functor> 66 _LIBCPP_HIDE_FROM_ABI optional<__empty> 67 __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) { 68 // Perform the chunked execution. 69 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { 70 auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; 71 auto __index = 72 __chunk == 0 73 ? 0 74 : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); 75 __func(__first + __index, __first + __index + __this_chunk_size); 76 }); 77 78 return __empty{}; 79 } 80 81 template <class _RandomAccessIterator, class _Functor> 82 _LIBCPP_HIDE_FROM_ABI optional<__empty> 83 __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { 84 return __libdispatch::__dispatch_parallel_for( 85 __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); 86 } 87 88 template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut> 89 struct __merge_range { 90 __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result) 91 : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {} 92 93 _RandomAccessIterator1 __mid1_; 94 _RandomAccessIterator2 __mid2_; 95 _RandomAccessIteratorOut __result_; 96 }; 97 98 template <typename _RandomAccessIterator1, 99 typename _RandomAccessIterator2, 100 typename _RandomAccessIterator3, 101 typename _Compare, 102 typename _LeafMerge> 103 _LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge( 104 _RandomAccessIterator1 __first1, 105 _RandomAccessIterator1 __last1, 106 _RandomAccessIterator2 __first2, 107 _RandomAccessIterator2 __last2, 108 _RandomAccessIterator3 __result, 109 _Compare __comp, 110 _LeafMerge __leaf_merge) noexcept { 111 __chunk_partitions __partitions = 112 __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2)); 113 114 if (__partitions.__chunk_count_ == 0) 115 return __empty{}; 116 117 if (__partitions.__chunk_count_ == 1) { 118 __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp); 119 return __empty{}; 120 } 121 122 using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>; 123 auto const __n_ranges = __partitions.__chunk_count_ + 1; 124 125 // TODO: use __uninitialized_buffer 126 auto __destroy = [=](__merge_range_t* __ptr) { 127 std::destroy_n(__ptr, __n_ranges); 128 std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges); 129 }; 130 131 unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( 132 [&]() -> __merge_range_t* { 133 # ifndef _LIBCPP_HAS_NO_EXCEPTIONS 134 try { 135 # endif 136 return std::allocator<__merge_range_t>().allocate(__n_ranges); 137 # ifndef _LIBCPP_HAS_NO_EXCEPTIONS 138 } catch (const std::bad_alloc&) { 139 return nullptr; 140 } 141 # endif 142 }(), 143 __destroy); 144 145 if (!__ranges) 146 return nullopt; 147 148 // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case 149 __merge_range_t* __r = __ranges.get(); 150 std::__construct_at(__r++, __first1, __first2, __result); 151 152 bool __iterate_first_range = __last1 - __first1 > __last2 - __first2; 153 154 auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t { 155 auto [__mid1, __mid2] = [&] { 156 if (__iterate_first_range) { 157 auto __m1 = __first1 + __chunk_size; 158 auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp); 159 return std::make_pair(__m1, __m2); 160 } else { 161 auto __m2 = __first2 + __chunk_size; 162 auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp); 163 return std::make_pair(__m1, __m2); 164 } 165 }(); 166 167 __result += (__mid1 - __first1) + (__mid2 - __first2); 168 __first1 = __mid1; 169 __first2 = __mid2; 170 return {std::move(__mid1), std::move(__mid2), __result}; 171 }; 172 173 // handle first chunk 174 std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_)); 175 176 // handle 2 -> N - 1 chunks 177 for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i) 178 std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_)); 179 180 // handle last chunk 181 std::__construct_at(__r, __last1, __last2, __result); 182 183 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) { 184 auto __first_iters = __ranges[__index]; 185 auto __last_iters = __ranges[__index + 1]; 186 __leaf_merge( 187 __first_iters.__mid1_, 188 __last_iters.__mid1_, 189 __first_iters.__mid2_, 190 __last_iters.__mid2_, 191 __first_iters.__result_, 192 __comp); 193 }); 194 195 return __empty{}; 196 } 197 198 template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction> 199 _LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce( 200 _RandomAccessIterator __first, 201 _RandomAccessIterator __last, 202 _Transform __transform, 203 _Value __init, 204 _Combiner __combiner, 205 _Reduction __reduction) { 206 if (__first == __last) 207 return __init; 208 209 auto __partitions = __libdispatch::__partition_chunks(__last - __first); 210 211 auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) { 212 std::destroy_n(__ptr, __count); 213 std::allocator<_Value>().deallocate(__ptr, __count); 214 }; 215 216 // TODO: use __uninitialized_buffer 217 // TODO: allocate one element per worker instead of one element per chunk 218 unique_ptr<_Value[], decltype(__destroy)> __values( 219 std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy); 220 221 // __dispatch_apply is noexcept 222 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { 223 auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; 224 auto __index = 225 __chunk == 0 226 ? 0 227 : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); 228 if (__this_chunk_size != 1) { 229 std::__construct_at( 230 __values.get() + __chunk, 231 __reduction(__first + __index + 2, 232 __first + __index + __this_chunk_size, 233 __combiner(__transform(__first + __index), __transform(__first + __index + 1)))); 234 } else { 235 std::__construct_at(__values.get() + __chunk, __transform(__first + __index)); 236 } 237 }); 238 239 return std::reduce( 240 std::make_move_iterator(__values.get()), 241 std::make_move_iterator(__values.get() + __partitions.__chunk_count_), 242 std::move(__init), 243 __combiner); 244 } 245 246 template <class _RandomAccessIterator, class _Comp, class _LeafSort> 247 _LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort( 248 _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { 249 const auto __size = __last - __first; 250 auto __partitions = __libdispatch::__partition_chunks(__size); 251 252 if (__partitions.__chunk_count_ == 0) 253 return __empty{}; 254 255 if (__partitions.__chunk_count_ == 1) { 256 __leaf_sort(__first, __last, __comp); 257 return __empty{}; 258 } 259 260 using _Value = __iter_value_type<_RandomAccessIterator>; 261 262 auto __destroy = [__size](_Value* __ptr) { 263 std::destroy_n(__ptr, __size); 264 std::allocator<_Value>().deallocate(__ptr, __size); 265 }; 266 267 // TODO: use __uninitialized_buffer 268 unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy); 269 270 // Initialize all elements to a moved-from state 271 // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928 272 std::__construct_at(__values.get(), std::move(*__first)); 273 for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) { 274 std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1])); 275 } 276 *__first = std::move(__values.get()[__size - 1]); 277 278 __libdispatch::__dispatch_parallel_for( 279 __partitions, 280 __first, 281 [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) { 282 __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp); 283 }); 284 285 bool __objects_are_in_buffer = false; 286 do { 287 const auto __old_chunk_size = __partitions.__chunk_size_; 288 if (__partitions.__chunk_count_ % 2 == 1) { 289 auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) { 290 std::inplace_merge( 291 __first_chunk_begin, 292 __first_chunk_begin + __partitions.__first_chunk_size_, 293 __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_, 294 __comp); 295 }; 296 if (__objects_are_in_buffer) 297 __inplace_merge_chunks(__values.get()); 298 else 299 __inplace_merge_chunks(__first); 300 __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_; 301 } else { 302 __partitions.__first_chunk_size_ += __partitions.__chunk_size_; 303 } 304 305 __partitions.__chunk_size_ *= 2; 306 __partitions.__chunk_count_ /= 2; 307 308 auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) { 309 __libdispatch::__dispatch_parallel_for( 310 __partitions, 311 __from_first, 312 [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) { 313 std::merge(std::make_move_iterator(__chunk_first), 314 std::make_move_iterator(__chunk_last - __old_chunk_size), 315 std::make_move_iterator(__chunk_last - __old_chunk_size), 316 std::make_move_iterator(__chunk_last), 317 __to_first + (__chunk_first - __from_first), 318 __comp); 319 }); 320 }; 321 322 if (__objects_are_in_buffer) 323 __merge_chunks(__values.get(), __first); 324 else 325 __merge_chunks(__first, __values.get()); 326 __objects_are_in_buffer = !__objects_are_in_buffer; 327 } while (__partitions.__chunk_count_ > 1); 328 329 if (__objects_are_in_buffer) { 330 std::move(__values.get(), __values.get() + __size, __first); 331 } 332 333 return __empty{}; 334 } 335 336 _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} 337 338 } // namespace __libdispatch 339 } // namespace __par_backend 340 341 _LIBCPP_END_NAMESPACE_STD 342 343 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 344 345 _LIBCPP_POP_MACROS 346 347 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 348