1 //===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_SUPPORT_PARALLEL_H 10 #define LLVM_SUPPORT_PARALLEL_H 11 12 #include "llvm/ADT/STLExtras.h" 13 #include "llvm/Config/llvm-config.h" 14 #include "llvm/Support/Error.h" 15 #include "llvm/Support/MathExtras.h" 16 #include "llvm/Support/Threading.h" 17 18 #include <algorithm> 19 #include <condition_variable> 20 #include <functional> 21 #include <mutex> 22 23 namespace llvm { 24 25 namespace parallel { 26 27 // Strategy for the default executor used by the parallel routines provided by 28 // this file. It defaults to using all hardware threads and should be 29 // initialized before the first use of parallel routines. 30 extern ThreadPoolStrategy strategy; 31 32 #if LLVM_ENABLE_THREADS 33 #ifdef _WIN32 34 // Direct access to thread_local variables from a different DLL isn't 35 // possible with Windows Native TLS. 36 unsigned getThreadIndex(); 37 #else 38 // Don't access this directly, use the getThreadIndex wrapper. 39 extern thread_local unsigned threadIndex; 40 41 inline unsigned getThreadIndex() { return threadIndex; } 42 #endif 43 #else 44 inline unsigned getThreadIndex() { return 0; } 45 #endif 46 47 namespace detail { 48 class Latch { 49 uint32_t Count; 50 mutable std::mutex Mutex; 51 mutable std::condition_variable Cond; 52 53 public: 54 explicit Latch(uint32_t Count = 0) : Count(Count) {} 55 ~Latch() { 56 // Ensure at least that sync() was called. 57 assert(Count == 0); 58 } 59 60 void inc() { 61 std::lock_guard<std::mutex> lock(Mutex); 62 ++Count; 63 } 64 65 void dec() { 66 std::lock_guard<std::mutex> lock(Mutex); 67 if (--Count == 0) 68 Cond.notify_all(); 69 } 70 71 void sync() const { 72 std::unique_lock<std::mutex> lock(Mutex); 73 Cond.wait(lock, [&] { return Count == 0; }); 74 } 75 }; 76 } // namespace detail 77 78 class TaskGroup { 79 detail::Latch L; 80 bool Parallel; 81 82 public: 83 TaskGroup(); 84 ~TaskGroup(); 85 86 // Spawn a task, but does not wait for it to finish. 87 void spawn(std::function<void()> f); 88 89 // Similar to spawn, but execute the task immediately when ThreadsRequested == 90 // 1. The difference is to give the following pattern a more intuitive order 91 // when single threading is requested. 92 // 93 // for (size_t begin = 0, i = 0, taskSize = 0;;) { 94 // taskSize += ... 95 // bool done = ++i == end; 96 // if (done || taskSize >= taskSizeLimit) { 97 // tg.execute([=] { fn(begin, i); }); 98 // if (done) 99 // break; 100 // begin = i; 101 // taskSize = 0; 102 // } 103 // } 104 void execute(std::function<void()> f); 105 106 void sync() const { L.sync(); } 107 }; 108 109 namespace detail { 110 111 #if LLVM_ENABLE_THREADS 112 const ptrdiff_t MinParallelSize = 1024; 113 114 /// Inclusive median. 115 template <class RandomAccessIterator, class Comparator> 116 RandomAccessIterator medianOf3(RandomAccessIterator Start, 117 RandomAccessIterator End, 118 const Comparator &Comp) { 119 RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2); 120 return Comp(*Start, *(End - 1)) 121 ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start) 122 : End - 1) 123 : (Comp(*Mid, *Start) ? (Comp(*(End - 1), *Mid) ? Mid : End - 1) 124 : Start); 125 } 126 127 template <class RandomAccessIterator, class Comparator> 128 void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End, 129 const Comparator &Comp, TaskGroup &TG, size_t Depth) { 130 // Do a sequential sort for small inputs. 131 if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) { 132 llvm::sort(Start, End, Comp); 133 return; 134 } 135 136 // Partition. 137 auto Pivot = medianOf3(Start, End, Comp); 138 // Move Pivot to End. 139 std::swap(*(End - 1), *Pivot); 140 Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) { 141 return Comp(V, *(End - 1)); 142 }); 143 // Move Pivot to middle of partition. 144 std::swap(*Pivot, *(End - 1)); 145 146 // Recurse. 147 TG.spawn([=, &Comp, &TG] { 148 parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1); 149 }); 150 parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1); 151 } 152 153 template <class RandomAccessIterator, class Comparator> 154 void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, 155 const Comparator &Comp) { 156 TaskGroup TG; 157 parallel_quick_sort(Start, End, Comp, TG, 158 llvm::Log2_64(std::distance(Start, End)) + 1); 159 } 160 161 // TaskGroup has a relatively high overhead, so we want to reduce 162 // the number of spawn() calls. We'll create up to 1024 tasks here. 163 // (Note that 1024 is an arbitrary number. This code probably needs 164 // improving to take the number of available cores into account.) 165 enum { MaxTasksPerGroup = 1024 }; 166 167 template <class IterTy, class ResultTy, class ReduceFuncTy, 168 class TransformFuncTy> 169 ResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init, 170 ReduceFuncTy Reduce, 171 TransformFuncTy Transform) { 172 // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling 173 // overhead on large inputs. 174 size_t NumInputs = std::distance(Begin, End); 175 if (NumInputs == 0) 176 return std::move(Init); 177 size_t NumTasks = std::min(static_cast<size_t>(MaxTasksPerGroup), NumInputs); 178 std::vector<ResultTy> Results(NumTasks, Init); 179 { 180 // Each task processes either TaskSize or TaskSize+1 inputs. Any inputs 181 // remaining after dividing them equally amongst tasks are distributed as 182 // one extra input over the first tasks. 183 TaskGroup TG; 184 size_t TaskSize = NumInputs / NumTasks; 185 size_t RemainingInputs = NumInputs % NumTasks; 186 IterTy TBegin = Begin; 187 for (size_t TaskId = 0; TaskId < NumTasks; ++TaskId) { 188 IterTy TEnd = TBegin + TaskSize + (TaskId < RemainingInputs ? 1 : 0); 189 TG.spawn([=, &Transform, &Reduce, &Results] { 190 // Reduce the result of transformation eagerly within each task. 191 ResultTy R = Init; 192 for (IterTy It = TBegin; It != TEnd; ++It) 193 R = Reduce(R, Transform(*It)); 194 Results[TaskId] = R; 195 }); 196 TBegin = TEnd; 197 } 198 assert(TBegin == End); 199 } 200 201 // Do a final reduction. There are at most 1024 tasks, so this only adds 202 // constant single-threaded overhead for large inputs. Hopefully most 203 // reductions are cheaper than the transformation. 204 ResultTy FinalResult = std::move(Results.front()); 205 for (ResultTy &PartialResult : 206 MutableArrayRef(Results.data() + 1, Results.size() - 1)) 207 FinalResult = Reduce(FinalResult, std::move(PartialResult)); 208 return std::move(FinalResult); 209 } 210 211 #endif 212 213 } // namespace detail 214 } // namespace parallel 215 216 template <class RandomAccessIterator, 217 class Comparator = std::less< 218 typename std::iterator_traits<RandomAccessIterator>::value_type>> 219 void parallelSort(RandomAccessIterator Start, RandomAccessIterator End, 220 const Comparator &Comp = Comparator()) { 221 #if LLVM_ENABLE_THREADS 222 if (parallel::strategy.ThreadsRequested != 1) { 223 parallel::detail::parallel_sort(Start, End, Comp); 224 return; 225 } 226 #endif 227 llvm::sort(Start, End, Comp); 228 } 229 230 void parallelFor(size_t Begin, size_t End, function_ref<void(size_t)> Fn); 231 232 template <class IterTy, class FuncTy> 233 void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) { 234 parallelFor(0, End - Begin, [&](size_t I) { Fn(Begin[I]); }); 235 } 236 237 template <class IterTy, class ResultTy, class ReduceFuncTy, 238 class TransformFuncTy> 239 ResultTy parallelTransformReduce(IterTy Begin, IterTy End, ResultTy Init, 240 ReduceFuncTy Reduce, 241 TransformFuncTy Transform) { 242 #if LLVM_ENABLE_THREADS 243 if (parallel::strategy.ThreadsRequested != 1) { 244 return parallel::detail::parallel_transform_reduce(Begin, End, Init, Reduce, 245 Transform); 246 } 247 #endif 248 for (IterTy I = Begin; I != End; ++I) 249 Init = Reduce(std::move(Init), Transform(*I)); 250 return std::move(Init); 251 } 252 253 // Range wrappers. 254 template <class RangeTy, 255 class Comparator = std::less<decltype(*std::begin(RangeTy()))>> 256 void parallelSort(RangeTy &&R, const Comparator &Comp = Comparator()) { 257 parallelSort(std::begin(R), std::end(R), Comp); 258 } 259 260 template <class RangeTy, class FuncTy> 261 void parallelForEach(RangeTy &&R, FuncTy Fn) { 262 parallelForEach(std::begin(R), std::end(R), Fn); 263 } 264 265 template <class RangeTy, class ResultTy, class ReduceFuncTy, 266 class TransformFuncTy> 267 ResultTy parallelTransformReduce(RangeTy &&R, ResultTy Init, 268 ReduceFuncTy Reduce, 269 TransformFuncTy Transform) { 270 return parallelTransformReduce(std::begin(R), std::end(R), Init, Reduce, 271 Transform); 272 } 273 274 // Parallel for-each, but with error handling. 275 template <class RangeTy, class FuncTy> 276 Error parallelForEachError(RangeTy &&R, FuncTy Fn) { 277 // The transform_reduce algorithm requires that the initial value be copyable. 278 // Error objects are uncopyable. We only need to copy initial success values, 279 // so work around this mismatch via the C API. The C API represents success 280 // values with a null pointer. The joinErrors discards null values and joins 281 // multiple errors into an ErrorList. 282 return unwrap(parallelTransformReduce( 283 std::begin(R), std::end(R), wrap(Error::success()), 284 [](LLVMErrorRef Lhs, LLVMErrorRef Rhs) { 285 return wrap(joinErrors(unwrap(Lhs), unwrap(Rhs))); 286 }, 287 [&Fn](auto &&V) { return wrap(Fn(V)); })); 288 } 289 290 } // namespace llvm 291 292 #endif // LLVM_SUPPORT_PARALLEL_H 293