llvm/Support/Parallel.h

//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
09467b48Spatrick
09467b48Spatrick#ifndef LLVM_SUPPORT_PARALLEL_H
09467b48Spatrick#define LLVM_SUPPORT_PARALLEL_H
09467b48Spatrick
#include "llvm/ADT/STLExtras.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Threading.h"

#include <algorithm>
#include <condition_variable>
#include <functional>
#include <iterator>
#include <mutex>
#include <utility>
#include <vector>
09467b48Spatrick
09467b48Spatricknamespace llvm {
09467b48Spatrick
09467b48Spatricknamespace parallel {
09467b48Spatrick
097a140dSpatrick// Strategy for the default executor used by the parallel routines provided by
097a140dSpatrick// this file. It defaults to using all hardware threads and should be
097a140dSpatrick// initialized before the first use of parallel routines.
097a140dSpatrickextern ThreadPoolStrategy strategy;
09467b48Spatrick
#if LLVM_ENABLE_THREADS
#ifdef _WIN32
// Direct access to thread_local variables from a different DLL isn't
// possible with Windows Native TLS, so only the accessor is declared here.
unsigned getThreadIndex();
#else
// Don't access this directly, use the getThreadIndex wrapper.
extern thread_local unsigned threadIndex;

/// Returns the calling thread's value of threadIndex.
inline unsigned getThreadIndex() { return threadIndex; }
#endif
#else
/// Threads are disabled at build time: every caller sees index 0.
inline unsigned getThreadIndex() { return 0; }
#endif
*d415bd75Srobert
*d415bd75Srobertnamespace detail {
09467b48Spatrickclass Latch {
09467b48Spatrick  uint32_t Count;
09467b48Spatrick  mutable std::mutex Mutex;
09467b48Spatrick  mutable std::condition_variable Cond;
09467b48Spatrick
09467b48Spatrickpublic:
09467b48Spatrick  explicit Latch(uint32_t Count = 0) : Count(Count) {}
*d415bd75Srobert  ~Latch() {
*d415bd75Srobert    // Ensure at least that sync() was called.
*d415bd75Srobert    assert(Count == 0);
*d415bd75Srobert  }
09467b48Spatrick
09467b48Spatrick  void inc() {
09467b48Spatrick    std::lock_guard<std::mutex> lock(Mutex);
09467b48Spatrick    ++Count;
09467b48Spatrick  }
09467b48Spatrick
09467b48Spatrick  void dec() {
09467b48Spatrick    std::lock_guard<std::mutex> lock(Mutex);
09467b48Spatrick    if (--Count == 0)
09467b48Spatrick      Cond.notify_all();
09467b48Spatrick  }
09467b48Spatrick
09467b48Spatrick  void sync() const {
09467b48Spatrick    std::unique_lock<std::mutex> lock(Mutex);
09467b48Spatrick    Cond.wait(lock, [&] { return Count == 0; });
09467b48Spatrick  }
09467b48Spatrick};
*d415bd75Srobert} // namespace detail
09467b48Spatrick
09467b48Spatrickclass TaskGroup {
*d415bd75Srobert  detail::Latch L;
09467b48Spatrick  bool Parallel;
09467b48Spatrick
09467b48Spatrickpublic:
09467b48Spatrick  TaskGroup();
09467b48Spatrick  ~TaskGroup();
09467b48Spatrick
*d415bd75Srobert  // Spawn a task, but does not wait for it to finish.
09467b48Spatrick  void spawn(std::function<void()> f);
09467b48Spatrick
*d415bd75Srobert  // Similar to spawn, but execute the task immediately when ThreadsRequested ==
*d415bd75Srobert  // 1. The difference is to give the following pattern a more intuitive order
*d415bd75Srobert  // when single threading is requested.
*d415bd75Srobert  //
*d415bd75Srobert  // for (size_t begin = 0, i = 0, taskSize = 0;;) {
*d415bd75Srobert  //   taskSize += ...
*d415bd75Srobert  //   bool done = ++i == end;
*d415bd75Srobert  //   if (done || taskSize >= taskSizeLimit) {
*d415bd75Srobert  //     tg.execute([=] { fn(begin, i); });
*d415bd75Srobert  //     if (done)
*d415bd75Srobert  //       break;
*d415bd75Srobert  //     begin = i;
*d415bd75Srobert  //     taskSize = 0;
*d415bd75Srobert  //   }
*d415bd75Srobert  // }
*d415bd75Srobert  void execute(std::function<void()> f);
*d415bd75Srobert
09467b48Spatrick  void sync() const { L.sync(); }
09467b48Spatrick};
09467b48Spatrick
*d415bd75Srobertnamespace detail {
*d415bd75Srobert
*d415bd75Srobert#if LLVM_ENABLE_THREADS
/// Ranges with fewer elements than this are sorted sequentially by
/// parallel_quick_sort instead of being partitioned further.
const ptrdiff_t MinParallelSize = 1024;
09467b48Spatrick
/// Inclusive median.
///
/// Picks the median, according to \p Comp, of three samples of the non-empty
/// range [Start, End): the first element, the element at offset
/// distance / 2, and the last element (End - 1).
///
/// \returns an iterator to whichever of those three positions holds the
/// median value.
template <class RandomAccessIterator, class Comparator>
RandomAccessIterator medianOf3(RandomAccessIterator Start,
                               RandomAccessIterator End,
                               const Comparator &Comp) {
  RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2);
  RandomAccessIterator Last = End - 1;

  if (Comp(*Start, *Last)) {
    // *Start orders before *Last: the median is *Last unless *Mid is below it.
    if (!Comp(*Mid, *Last))
      return Last;
    return Comp(*Start, *Mid) ? Mid : Start;
  }

  // *Last does not order after *Start: mirror image of the case above.
  if (!Comp(*Mid, *Start))
    return Start;
  return Comp(*Last, *Mid) ? Mid : Last;
}
09467b48Spatrick
09467b48Spatricktemplate <class RandomAccessIterator, class Comparator>
09467b48Spatrickvoid parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End,
09467b48Spatrick                         const Comparator &Comp, TaskGroup &TG, size_t Depth) {
09467b48Spatrick  // Do a sequential sort for small inputs.
09467b48Spatrick  if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) {
09467b48Spatrick    llvm::sort(Start, End, Comp);
09467b48Spatrick    return;
09467b48Spatrick  }
09467b48Spatrick
09467b48Spatrick  // Partition.
09467b48Spatrick  auto Pivot = medianOf3(Start, End, Comp);
09467b48Spatrick  // Move Pivot to End.
09467b48Spatrick  std::swap(*(End - 1), *Pivot);
09467b48Spatrick  Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) {
09467b48Spatrick    return Comp(V, *(End - 1));
09467b48Spatrick  });
09467b48Spatrick  // Move Pivot to middle of partition.
09467b48Spatrick  std::swap(*Pivot, *(End - 1));
09467b48Spatrick
09467b48Spatrick  // Recurse.
09467b48Spatrick  TG.spawn([=, &Comp, &TG] {
09467b48Spatrick    parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1);
09467b48Spatrick  });
09467b48Spatrick  parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1);
09467b48Spatrick}
09467b48Spatrick
09467b48Spatricktemplate <class RandomAccessIterator, class Comparator>
09467b48Spatrickvoid parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
09467b48Spatrick                   const Comparator &Comp) {
09467b48Spatrick  TaskGroup TG;
09467b48Spatrick  parallel_quick_sort(Start, End, Comp, TG,
09467b48Spatrick                      llvm::Log2_64(std::distance(Start, End)) + 1);
09467b48Spatrick}
09467b48Spatrick
// TaskGroup has a relatively high overhead, so we want to reduce
// the number of spawn() calls. We'll create up to 1024 tasks here.
// (Note that 1024 is an arbitrary number. This code probably needs
// improving to take the number of available cores into account.)
enum { MaxTasksPerGroup = 1024 };
73471bf0Spatrick
73471bf0Spatricktemplate <class IterTy, class ResultTy, class ReduceFuncTy,
73471bf0Spatrick          class TransformFuncTy>
73471bf0SpatrickResultTy parallel_transform_reduce(IterTy Begin, IterTy End, ResultTy Init,
73471bf0Spatrick                                   ReduceFuncTy Reduce,
73471bf0Spatrick                                   TransformFuncTy Transform) {
73471bf0Spatrick  // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
73471bf0Spatrick  // overhead on large inputs.
73471bf0Spatrick  size_t NumInputs = std::distance(Begin, End);
73471bf0Spatrick  if (NumInputs == 0)
73471bf0Spatrick    return std::move(Init);
73471bf0Spatrick  size_t NumTasks = std::min(static_cast<size_t>(MaxTasksPerGroup), NumInputs);
73471bf0Spatrick  std::vector<ResultTy> Results(NumTasks, Init);
73471bf0Spatrick  {
73471bf0Spatrick    // Each task processes either TaskSize or TaskSize+1 inputs. Any inputs
73471bf0Spatrick    // remaining after dividing them equally amongst tasks are distributed as
73471bf0Spatrick    // one extra input over the first tasks.
73471bf0Spatrick    TaskGroup TG;
73471bf0Spatrick    size_t TaskSize = NumInputs / NumTasks;
73471bf0Spatrick    size_t RemainingInputs = NumInputs % NumTasks;
73471bf0Spatrick    IterTy TBegin = Begin;
73471bf0Spatrick    for (size_t TaskId = 0; TaskId < NumTasks; ++TaskId) {
73471bf0Spatrick      IterTy TEnd = TBegin + TaskSize + (TaskId < RemainingInputs ? 1 : 0);
73471bf0Spatrick      TG.spawn([=, &Transform, &Reduce, &Results] {
73471bf0Spatrick        // Reduce the result of transformation eagerly within each task.
73471bf0Spatrick        ResultTy R = Init;
73471bf0Spatrick        for (IterTy It = TBegin; It != TEnd; ++It)
73471bf0Spatrick          R = Reduce(R, Transform(*It));
73471bf0Spatrick        Results[TaskId] = R;
73471bf0Spatrick      });
73471bf0Spatrick      TBegin = TEnd;
73471bf0Spatrick    }
73471bf0Spatrick    assert(TBegin == End);
73471bf0Spatrick  }
73471bf0Spatrick
73471bf0Spatrick  // Do a final reduction. There are at most 1024 tasks, so this only adds
73471bf0Spatrick  // constant single-threaded overhead for large inputs. Hopefully most
73471bf0Spatrick  // reductions are cheaper than the transformation.
73471bf0Spatrick  ResultTy FinalResult = std::move(Results.front());
73471bf0Spatrick  for (ResultTy &PartialResult :
*d415bd75Srobert       MutableArrayRef(Results.data() + 1, Results.size() - 1))
73471bf0Spatrick    FinalResult = Reduce(FinalResult, std::move(PartialResult));
73471bf0Spatrick  return std::move(FinalResult);
73471bf0Spatrick}
73471bf0Spatrick
09467b48Spatrick#endif
09467b48Spatrick
09467b48Spatrick} // namespace detail
097a140dSpatrick} // namespace parallel
09467b48Spatrick
097a140dSpatricktemplate <class RandomAccessIterator,
097a140dSpatrick          class Comparator = std::less<
097a140dSpatrick              typename std::iterator_traits<RandomAccessIterator>::value_type>>
097a140dSpatrickvoid parallelSort(RandomAccessIterator Start, RandomAccessIterator End,
09467b48Spatrick                  const Comparator &Comp = Comparator()) {
097a140dSpatrick#if LLVM_ENABLE_THREADS
097a140dSpatrick  if (parallel::strategy.ThreadsRequested != 1) {
097a140dSpatrick    parallel::detail::parallel_sort(Start, End, Comp);
097a140dSpatrick    return;
097a140dSpatrick  }
097a140dSpatrick#endif
09467b48Spatrick  llvm::sort(Start, End, Comp);
09467b48Spatrick}
09467b48Spatrick
*d415bd75Srobertvoid parallelFor(size_t Begin, size_t End, function_ref<void(size_t)> Fn);
*d415bd75Srobert
/// Applies \p Fn to the elements of [Begin, End) by delegating to parallelFor
/// over the index range [0, End - Begin). The iterators must therefore
/// support subtraction and indexing.
template <class IterTy, class FuncTy>
void parallelForEach(IterTy Begin, IterTy End, FuncTy Fn) {
  parallelFor(0, End - Begin, [&](size_t I) { Fn(Begin[I]); });
}
09467b48Spatrick
/// Transforms every element of [Begin, End) with \p Transform and folds the
/// results into \p Init with \p Reduce.
///
/// When LLVM is built with threads and parallel::strategy requests anything
/// other than exactly one thread, the work is handed to
/// parallel::detail::parallel_transform_reduce; otherwise the fold runs here,
/// left to right.
///
/// \returns the folded value; \p Init unchanged for an empty range.
template <class IterTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
ResultTy parallelTransformReduce(IterTy Begin, IterTy End, ResultTy Init,
                                 ReduceFuncTy Reduce,
                                 TransformFuncTy Transform) {
#if LLVM_ENABLE_THREADS
  if (parallel::strategy.ThreadsRequested != 1) {
    return parallel::detail::parallel_transform_reduce(Begin, End, Init, Reduce,
                                                       Transform);
  }
#endif
  for (IterTy I = Begin; I != End; ++I)
    Init = Reduce(std::move(Init), Transform(*I));
  // A by-value parameter is moved implicitly on return; no std::move needed.
  return Init;
}
73471bf0Spatrick
// Range wrappers.

/// Sorts the whole range \p R with \p Comp by forwarding to the iterator
/// overload of parallelSort.
///
/// The default comparator is std::less over the range's element type. The
/// type is computed with std::declval<RangeTy &>() so that it is well-formed
/// when RangeTy deduces to an lvalue reference (e.g. parallelSort(Vec)) and
/// does not require RangeTy to be default-constructible.
template <class RangeTy,
          class Comparator = std::less<typename std::iterator_traits<
              decltype(std::begin(std::declval<RangeTy &>()))>::value_type>>
void parallelSort(RangeTy &&R, const Comparator &Comp = Comparator()) {
  parallelSort(std::begin(R), std::end(R), Comp);
}
09467b48Spatrick
/// Range form of parallelForEach: applies \p Fn to every element of \p R by
/// forwarding std::begin(R) / std::end(R) to the iterator overload.
template <class RangeTy, class FuncTy>
void parallelForEach(RangeTy &&R, FuncTy Fn) {
  parallelForEach(std::begin(R), std::end(R), Fn);
}
09467b48Spatrick
/// Range form of parallelTransformReduce: forwards std::begin(R) /
/// std::end(R) and the remaining arguments to the iterator overload.
///
/// The by-value parameters are not used again here, so they are moved into
/// the callee rather than copied a second time.
template <class RangeTy, class ResultTy, class ReduceFuncTy,
          class TransformFuncTy>
ResultTy parallelTransformReduce(RangeTy &&R, ResultTy Init,
                                 ReduceFuncTy Reduce,
                                 TransformFuncTy Transform) {
  return parallelTransformReduce(std::begin(R), std::end(R), std::move(Init),
                                 std::move(Reduce), std::move(Transform));
}
73471bf0Spatrick
73471bf0Spatrick// Parallel for-each, but with error handling.
73471bf0Spatricktemplate <class RangeTy, class FuncTy>
73471bf0SpatrickError parallelForEachError(RangeTy &&R, FuncTy Fn) {
73471bf0Spatrick  // The transform_reduce algorithm requires that the initial value be copyable.
73471bf0Spatrick  // Error objects are uncopyable. We only need to copy initial success values,
73471bf0Spatrick  // so work around this mismatch via the C API. The C API represents success
73471bf0Spatrick  // values with a null pointer. The joinErrors discards null values and joins
73471bf0Spatrick  // multiple errors into an ErrorList.
73471bf0Spatrick  return unwrap(parallelTransformReduce(
73471bf0Spatrick      std::begin(R), std::end(R), wrap(Error::success()),
73471bf0Spatrick      [](LLVMErrorRef Lhs, LLVMErrorRef Rhs) {
73471bf0Spatrick        return wrap(joinErrors(unwrap(Lhs), unwrap(Rhs)));
73471bf0Spatrick      },
73471bf0Spatrick      [&Fn](auto &&V) { return wrap(Fn(V)); }));
73471bf0Spatrick}
73471bf0Spatrick
09467b48Spatrick} // namespace llvm
09467b48Spatrick
09467b48Spatrick#endif // LLVM_SUPPORT_PARALLEL_H