1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #pragma once
5 
6 #include "parallel_for.h"
7 
8 namespace embree
9 {
10   template<typename Value>
11     struct ParallelPrefixSumState
12   {
13     enum { MAX_TASKS = 64 };
14     Value counts[MAX_TASKS];
15     Value sums  [MAX_TASKS];
16   };
17 
18   template<typename Index, typename Value, typename Func, typename Reduction>
parallel_prefix_sum(ParallelPrefixSumState<Value> & state,Index first,Index last,Index minStepSize,const Value & identity,const Func & func,const Reduction & reduction)19     __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
20   {
21     /* calculate number of tasks to use */
22     const size_t numThreads = TaskScheduler::threadCount();
23     const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
24     const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
25 
26     /* perform parallel prefix sum */
27     parallel_for(taskCount, [&](const size_t taskIndex)
28     {
29       const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
30       const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
31       state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
32     });
33 
34     /* calculate prefix sum */
35     Value sum=identity;
36     for (size_t i=0; i<taskCount; i++)
37     {
38       const Value c = state.counts[i];
39       state.sums[i] = sum;
40       sum=reduction(sum,c);
41     }
42 
43     return sum;
44   }
45 
46   /*! parallel calculation of prefix sums */
47   template<typename SrcArray, typename DstArray, typename Value, typename Add>
48     __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
49   {
50     /* perform single threaded prefix operation for small N */
51     if (N < SINGLE_THREAD_THRESHOLD)
52     {
53       Value sum=identity;
54       for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
55       return sum;
56     }
57 
58     /* perform parallel prefix operation for large N */
59     else
60     {
61       ParallelPrefixSumState<Value> state;
62 
63       /* initial run just sets up start values for subtasks */
64       parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
65 
66           Value s = identity;
67           for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
68           return s;
69 
70         }, add);
71 
72       /* final run calculates prefix sum */
73       return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
74 
75           Value s = identity;
76           for (size_t i=r.begin(); i<r.end(); i++) {
77             dst[i] = add(sum,s);
78             s = add(s,src[i]);
79           }
80           return s;
81 
82         }, add);
83     }
84   }
85 }
86