1 // Copyright 2009-2021 Intel Corporation 2 // SPDX-License-Identifier: Apache-2.0 3 4 #pragma once 5 6 #include "parallel_for.h" 7 8 namespace embree 9 { 10 template<typename Value> 11 struct ParallelPrefixSumState 12 { 13 enum { MAX_TASKS = 64 }; 14 Value counts[MAX_TASKS]; 15 Value sums [MAX_TASKS]; 16 }; 17 18 template<typename Index, typename Value, typename Func, typename Reduction> parallel_prefix_sum(ParallelPrefixSumState<Value> & state,Index first,Index last,Index minStepSize,const Value & identity,const Func & func,const Reduction & reduction)19 __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) 20 { 21 /* calculate number of tasks to use */ 22 const size_t numThreads = TaskScheduler::threadCount(); 23 const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; 24 const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); 25 26 /* perform parallel prefix sum */ 27 parallel_for(taskCount, [&](const size_t taskIndex) 28 { 29 const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; 30 const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; 31 state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); 32 }); 33 34 /* calculate prefix sum */ 35 Value sum=identity; 36 for (size_t i=0; i<taskCount; i++) 37 { 38 const Value c = state.counts[i]; 39 state.sums[i] = sum; 40 sum=reduction(sum,c); 41 } 42 43 return sum; 44 } 45 46 /*! parallel calculation of prefix sums */ 47 template<typename SrcArray, typename DstArray, typename Value, typename Add> 48 __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 49 { 50 /* perform single threaded prefix operation for small N */ 51 if (N < SINGLE_THREAD_THRESHOLD) 52 { 53 Value sum=identity; 54 for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; 55 return sum; 56 } 57 58 /* perform parallel prefix operation for large N */ 59 else 60 { 61 ParallelPrefixSumState<Value> state; 62 63 /* initial run just sets up start values for subtasks */ 64 parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { 65 66 Value s = identity; 67 for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); 68 return s; 69 70 }, add); 71 72 /* final run calculates prefix sum */ 73 return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { 74 75 Value s = identity; 76 for (size_t i=r.begin(); i<r.end(); i++) { 77 dst[i] = add(sum,s); 78 s = add(s,src[i]); 79 } 80 return s; 81 82 }, add); 83 } 84 } 85 } 86