1 // This program performs a simple single-precision Ax+Y operation
2 // using cudaFlow and verifies its result.
3 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <taskflow/cudaflow.hpp>
#include <taskflow/taskflow.hpp>
6 
7 // Kernel: saxpy
// Single-precision a*x + y, written back into y.
//
// Each thread handles at most one element: its index is formed from the x
// components of blockIdx, blockDim and threadIdx, so the kernel expects a
// 1D launch that supplies at least n threads in total.
//
//   n - number of elements to process
//   a - scalar multiplier applied to x
//   x - input vector (read only by this kernel)
//   y - input/output vector, updated in place
__global__ void saxpy(int n, float a, float *x, float *y) {
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  // Threads in the tail of the last block may fall past the end of the data.
  if (idx >= n) {
    return;
  }

  y[idx] = a*x[idx] + y[idx];
}
14 
15 // Function: main
// Builds and runs a task graph that:
//   1. allocates host and device storage for x and y (two independent tasks),
//   2. runs a cudaFlow that copies x and y to the device, launches saxpy with
//      a = 2, and copies both vectors back to the host,
//   3. checks the host results against the expected values (x == 1, y == 4),
//   4. releases the device buffers,
// and finally dumps the task graph to standard output.
//
// Returns 0 on completion; aborts the process if a device allocation fails.
int main() {

  // Number of elements in each vector.
  const unsigned N = 1<<20;

  // Threads per block used for the kernel launch.
  const unsigned threads_per_block = 256;

  // Reports a failed CUDA runtime call on stderr; returns true on success so
  // callers decide whether the failure is fatal.
  auto cuda_ok = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
      return false;
    }
    return true;
  };

  tf::Taskflow taskflow ("saxpy-flow");
  tf::Executor executor;

  std::vector<float> hx, hy;

  float* dx {nullptr};
  float* dy {nullptr};

  // allocate x
  auto allocate_x = taskflow.emplace([&]() {
    std::cout << "allocating host x and device x ...\n";
    hx.resize(N, 1.0f);
    // Nothing downstream can work without the device buffer, so a failed
    // allocation is fatal.
    if (!cuda_ok(cudaMalloc(&dx, N*sizeof(float)), "cudaMalloc(dx)")) {
      std::abort();
    }
  }).name("allocate_x");

  // allocate y
  auto allocate_y = taskflow.emplace([&]() {
    std::cout << "allocating host y and device y ...\n";
    hy.resize(N, 2.0f);
    if (!cuda_ok(cudaMalloc(&dy, N*sizeof(float)), "cudaMalloc(dy)")) {
      std::abort();
    }
  }).name("allocate_y");

  // saxpy
  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {

    std::cout << "running cudaflow ...\n";
    auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
    auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
    auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
    auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");

    // Ceil-divide so the grid covers N even when it is not a multiple of the
    // block size. N is cast to int so the forwarded argument has exactly the
    // type of the kernel's first parameter.
    const unsigned num_blocks = (N + threads_per_block - 1)/threads_per_block;
    auto kernel = cf.kernel(
                      num_blocks, threads_per_block, 0,
                      saxpy, static_cast<int>(N), 2.0f, dx, dy
                    ).name("saxpy");

    // Host-to-device copies must finish before the kernel runs, and the
    // kernel must finish before results are copied back.
    kernel.succeed(h2d_x, h2d_y)
          .precede(d2h_x, d2h_y);
  }).name("saxpy");

  cudaflow.succeed(allocate_x, allocate_y);

  // Add a verification task
  auto verifier = taskflow.emplace([&](){
    // x is only read by the kernel, so it must still be 1; y must be
    // 2*1 + 2 = 4.
    float max_error = 0.0f;
    for (std::size_t i = 0; i < N; i++) {
      // std::abs guarantees the floating-point overload; an unqualified abs
      // may resolve to the integer version and truncate the error.
      max_error = std::max(max_error, std::abs(hx[i]-1.0f));
      max_error = std::max(max_error, std::abs(hy[i]-4.0f));
    }
    std::cout << "saxpy finished with max error: " << max_error << '\n';
  }).succeed(cudaflow).name("verify");

  // free memory
  auto deallocate_x = taskflow.emplace([&](){
    std::cout << "deallocating device x ...\n";
    // A failed free is reported but not fatal: the program is finishing.
    cuda_ok(cudaFree(dx), "cudaFree(dx)");
    dx = nullptr;
  }).name("deallocate_x");

  auto deallocate_y = taskflow.emplace([&](){
    std::cout << "deallocating device y ...\n";
    cuda_ok(cudaFree(dy), "cudaFree(dy)");
    dy = nullptr;
  }).name("deallocate_y");

  verifier.precede(deallocate_x, deallocate_y);

  executor.run(taskflow).wait();

  std::cout << "dumping the taskflow ...\n";
  taskflow.dump(std::cout);

  return 0;
}
88 
89