// This program demonstrates how to perform a parallel reduction
// using cudaFlow.
3
#include <cstdlib>
#include <iostream>
#include <stdexcept>

#include <taskflow/cudaflow.hpp>
5
// Demo entry point: sums `num_items` pseudo-random integers on the GPU with a
// cudaFlow and cross-checks both reduction flavours against a host-side sum.
//
// Usage: ./cuda_reduce num_items
//   argv[1] - number of integers to reduce; must be a positive decimal integer.
//
// Returns 0 after printing "correct result". Exits with EXIT_FAILURE on a bad
// command line and throws std::runtime_error when a GPU result differs from
// the host reference.
int main(int argc, char* argv[]) {

  if(argc != 2) {
    std::cerr << "usage: ./cuda_reduce num_items\n";
    std::exit(EXIT_FAILURE);
  }

  // Parse strictly. std::atoi would silently turn garbage into 0 and let a
  // negative value wrap around to an enormous size_t allocation request.
  char* end = nullptr;
  const long long parsed = std::strtoll(argv[1], &end, 10);
  if(end == argv[1] || *end != '\0' || parsed <= 0) {
    std::cerr << "num_items must be a positive integer\n"
              << "usage: ./cuda_reduce num_items\n";
    std::exit(EXIT_FAILURE);
  }
  const size_t N = static_cast<size_t>(parsed);

  // Buffers are dereferenced on the host below and handed to the cudaFlow.
  auto data = tf::cuda_malloc_shared<int>(N);
  auto res1 = tf::cuda_malloc_shared<int>(1);
  auto res2 = tf::cuda_malloc_shared<int>(1);
  int hres = 0;  // host-side reference sum

  // initialize the data
  // NOTE: every element is in [0, 99], so the int sums (here and in the
  // reductions) can overflow once N grows past roughly INT_MAX / 99.
  for(size_t i=0; i<N; i++) {
    data[i] = ::rand()%100;
    hres += data[i];
  }

  // Seed both outputs with a non-zero value so the check below can tell
  // whether the seed was folded into the result or overwritten.
  *res1 = 10;
  *res2 = 10;

  // perform reduction
  tf::cudaFlow cudaflow;

  // res1 = res1 + data[0] + data[1] + ...
  cudaflow.reduce(
    data, data+N, res1, [] __device__ (int a, int b){ return a+b; }
  );

  // res2 = data[0] + data[1] + data[2] + ...
  cudaflow.uninitialized_reduce(
    data, data+N, res2, [] __device__ (int a, int b){ return a+b; }
  );

  cudaflow.offload();

  // Copy the results out first so the allocations can be released on every
  // path, including the one that throws.
  const int got1 = *res1;
  const int got2 = *res2;

  tf::cuda_free(data);
  tf::cuda_free(res1);
  tf::cuda_free(res2);

  // inspect
  if(hres + 10 != got1 || hres != got2) {
    throw std::runtime_error("incorrect result");
  }

  std::cout << "correct result\n";

  return 0;
}
52
53
54