1 //===----------------------------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 10 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 11 12 #include <__algorithm/lower_bound.h> 13 #include <__algorithm/max.h> 14 #include <__algorithm/upper_bound.h> 15 #include <__atomic/atomic.h> 16 #include <__config> 17 #include <__exception/terminate.h> 18 #include <__iterator/iterator_traits.h> 19 #include <__iterator/move_iterator.h> 20 #include <__memory/allocator.h> 21 #include <__memory/construct_at.h> 22 #include <__memory/unique_ptr.h> 23 #include <__numeric/reduce.h> 24 #include <__utility/exception_guard.h> 25 #include <__utility/move.h> 26 #include <__utility/pair.h> 27 #include <__utility/terminate_on_exception.h> 28 #include <cstddef> 29 #include <new> 30 31 _LIBCPP_PUSH_MACROS 32 #include <__undef_macros> 33 34 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 35 36 _LIBCPP_BEGIN_NAMESPACE_STD 37 38 namespace __par_backend { 39 inline namespace __libdispatch { 40 41 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do 42 // we. 43 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]? 44 _LIBCPP_EXPORTED_FROM_ABI void 45 __dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept; 46 47 template <class _Func> 48 _LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept { 49 __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) { 50 (*static_cast<_Func*>(__context))(__chunk); 51 }); 52 } 53 54 struct __chunk_partitions { 55 ptrdiff_t __chunk_count_; // includes the first chunk 56 ptrdiff_t __chunk_size_; 57 ptrdiff_t __first_chunk_size_; 58 }; 59 60 [[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size); 61 62 template <class _RandomAccessIterator, class _Functor> 63 _LIBCPP_HIDE_FROM_ABI void 64 __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { 65 auto __partitions = __libdispatch::__partition_chunks(__last - __first); 66 67 // Perform the chunked execution. 68 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { 69 auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; 70 auto __index = 71 __chunk == 0 72 ? 0 73 : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); 74 __func(__first + __index, __first + __index + __this_chunk_size); 75 }); 76 } 77 78 template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut> 79 struct __merge_range { 80 __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result) 81 : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {} 82 83 _RandomAccessIterator1 __mid1_; 84 _RandomAccessIterator2 __mid2_; 85 _RandomAccessIteratorOut __result_; 86 }; 87 88 template <typename _RandomAccessIterator1, 89 typename _RandomAccessIterator2, 90 typename _RandomAccessIterator3, 91 typename _Compare, 92 typename _LeafMerge> 93 _LIBCPP_HIDE_FROM_ABI void __parallel_merge( 94 _RandomAccessIterator1 __first1, 95 _RandomAccessIterator1 __last1, 96 _RandomAccessIterator2 __first2, 97 _RandomAccessIterator2 __last2, 98 _RandomAccessIterator3 __result, 99 _Compare __comp, 100 _LeafMerge __leaf_merge) { 101 __chunk_partitions __partitions = 102 __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2)); 103 104 if (__partitions.__chunk_count_ == 0) 105 return; 106 107 if (__partitions.__chunk_count_ == 1) { 108 __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp); 109 return; 110 } 111 112 using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>; 113 auto const __n_ranges = __partitions.__chunk_count_ + 1; 114 115 // TODO: use __uninitialized_buffer 116 auto __destroy = [=](__merge_range_t* __ptr) { 117 std::destroy_n(__ptr, __n_ranges); 118 std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges); 119 }; 120 unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( 121 std::allocator<__merge_range_t>().allocate(__n_ranges), __destroy); 122 123 // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case 124 std::__terminate_on_exception([&] { 125 __merge_range_t* __r = __ranges.get(); 126 std::__construct_at(__r++, __first1, __first2, __result); 127 128 bool __iterate_first_range = __last1 - __first1 > __last2 - __first2; 129 130 auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t { 131 auto [__mid1, __mid2] = [&] { 132 if (__iterate_first_range) { 133 auto __m1 = __first1 + __chunk_size; 134 auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp); 135 return std::make_pair(__m1, __m2); 136 } else { 137 auto __m2 = __first2 + __chunk_size; 138 auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp); 139 return std::make_pair(__m1, __m2); 140 } 141 }(); 142 143 __result += (__mid1 - __first1) + (__mid2 - __first2); 144 __first1 = __mid1; 145 __first2 = __mid2; 146 return {std::move(__mid1), std::move(__mid2), __result}; 147 }; 148 149 // handle first chunk 150 std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_)); 151 152 // handle 2 -> N - 1 chunks 153 for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i) 154 std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_)); 155 156 // handle last chunk 157 std::__construct_at(__r, __last1, __last2, __result); 158 159 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) { 160 auto __first_iters = __ranges[__index]; 161 auto __last_iters = __ranges[__index + 1]; 162 __leaf_merge( 163 __first_iters.__mid1_, 164 __last_iters.__mid1_, 165 __first_iters.__mid2_, 166 __last_iters.__mid2_, 167 __first_iters.__result_, 168 __comp); 169 }); 170 }); 171 } 172 173 template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction> 174 _LIBCPP_HIDE_FROM_ABI _Value __parallel_transform_reduce( 175 _RandomAccessIterator __first, 176 _RandomAccessIterator __last, 177 _Transform __transform, 178 _Value __init, 179 _Combiner __combiner, 180 _Reduction __reduction) { 181 if (__first == __last) 182 return __init; 183 184 auto __partitions = __libdispatch::__partition_chunks(__last - __first); 185 186 auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) { 187 std::destroy_n(__ptr, __count); 188 std::allocator<_Value>().deallocate(__ptr, __count); 189 }; 190 191 // TODO: use __uninitialized_buffer 192 // TODO: allocate one element per worker instead of one element per chunk 193 unique_ptr<_Value[], decltype(__destroy)> __values( 194 std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy); 195 196 // __dispatch_apply is noexcept 197 __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { 198 auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; 199 auto __index = 200 __chunk == 0 201 ? 0 202 : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); 203 if (__this_chunk_size != 1) { 204 std::__construct_at( 205 __values.get() + __chunk, 206 __reduction(__first + __index + 2, 207 __first + __index + __this_chunk_size, 208 __combiner(__transform(__first + __index), __transform(__first + __index + 1)))); 209 } else { 210 std::__construct_at(__values.get() + __chunk, __transform(__first + __index)); 211 } 212 }); 213 214 return std::__terminate_on_exception([&] { 215 return std::reduce( 216 std::make_move_iterator(__values.get()), 217 std::make_move_iterator(__values.get() + __partitions.__chunk_count_), 218 std::move(__init), 219 __combiner); 220 }); 221 } 222 223 // TODO: parallelize this 224 template <class _RandomAccessIterator, class _Comp, class _LeafSort> 225 _LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort( 226 _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { 227 __leaf_sort(__first, __last, __comp); 228 } 229 230 _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} 231 232 } // namespace __libdispatch 233 } // namespace __par_backend 234 235 _LIBCPP_END_NAMESPACE_STD 236 237 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 238 239 _LIBCPP_POP_MACROS 240 241 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H 242