// This program demonstrates how to perform a parallel reduction
// using cudaFlow.
3 
4 #include <taskflow/cudaflow.hpp>
5 
main(int argc,char * argv[])6 int main(int argc, char* argv[]) {
7 
8   if(argc != 2) {
9     std::cerr << "usage: ./cuda_reduce num_items\n";
10     std::exit(EXIT_FAILURE);
11   }
12 
13   size_t N = std::atoi(argv[1]);
14 
15   auto data = tf::cuda_malloc_shared<int>(N);
16   auto res1 = tf::cuda_malloc_shared<int>(1);
17   auto res2 = tf::cuda_malloc_shared<int>(1);
18   auto hres = 0;
19 
20   // initialize the data
21   for(size_t i=0; i<N; i++) {
22     data[i] = ::rand()%100;
23     hres += data[i];
24   }
25   *res1 = 10;
26   *res2 = 10;
27 
28   // perform reduction
29   tf::cudaFlow cudaflow;
30 
31   // res1 = res1 + data[0] + data[1] + ...
32   cudaflow.reduce(
33     data, data+N, res1, [] __device__ (int a, int b){ return a+b; }
34   );
35 
36   // res2 = data[0] + data[1] + data[2] + ...
37   cudaflow.uninitialized_reduce(
38     data, data+N, res2, [] __device__ (int a, int b){ return a+b; }
39   );
40 
41   cudaflow.offload();
42 
43   // inspect
44   if(hres + 10 != *res1 || hres != *res2) {
45     throw std::runtime_error("incorrect result");
46   }
47 
48   std::cout << "correct result\n";
49 
50   return 0;
51 }
52 
53 
54