1 // This program performs a simple single-precision Ax+Y operation
2 // using cudaFlow and verifies its result.
3
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <taskflow/cudaflow.hpp>
#include <taskflow/taskflow.hpp>
6
// Kernel: saxpy
// Updates y in place so that y[i] = a*x[i] + y[i] for every i in [0, n).
// Each thread handles at most one element; its index is derived from the
// x-dimension block and thread coordinates only. Threads whose index falls
// at or beyond n return without touching memory.
__global__ void saxpy(int n, float a, float *x, float *y) {
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;
  if (idx >= n) {
    return;
  }
  y[idx] = a*x[idx] + y[idx];
}
14
// Function: main
//
// Builds and runs a task graph that:
//   1. allocates host vectors x (all 1.0f) and y (all 2.0f) plus matching
//      device buffers,
//   2. runs a cudaFlow that copies x and y to the device, launches the saxpy
//      kernel with a = 2.0f, and copies both arrays back to the host,
//   3. verifies on the host that x is still 1.0f and y became 4.0f,
//   4. frees the device buffers,
// and finally dumps the task graph to standard output.
//
// Returns EXIT_SUCCESS when the verification finds no error and EXIT_FAILURE
// otherwise. A failing cudaMalloc/cudaFree call is reported on stderr and
// aborts the process.
int main() {

  const unsigned N = 1<<20;

  // Largest absolute deviation tolerated by the final exit-status check.
  // The expected values (1.0f and 4.0f) are exactly representable.
  const float tolerance = 1e-6f;

  tf::Taskflow taskflow ("saxpy-flow");
  tf::Executor executor;

  std::vector<float> hx, hy;

  float* dx {nullptr};
  float* dy {nullptr};

  // Written by the verification task, read after the run has completed.
  float max_error = 0.0f;

  // Reports a failed CUDA runtime call and aborts: continuing after a failed
  // allocation would hand null device pointers to the copies and the kernel.
  auto check = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
      std::abort();
    }
  };

  // allocate x
  auto allocate_x = taskflow.emplace([&]() {
    std::cout << "allocating host x and device x ...\n";
    hx.resize(N, 1.0f);
    check(cudaMalloc(&dx, N*sizeof(float)), "cudaMalloc(dx)");
  }).name("allocate_x");

  // allocate y
  auto allocate_y = taskflow.emplace([&]() {
    std::cout << "allocating host y and device y ...\n";
    hy.resize(N, 2.0f);
    check(cudaMalloc(&dy, N*sizeof(float)), "cudaMalloc(dy)");
  }).name("allocate_y");

  // saxpy
  auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {

    std::cout << "running cudaflow ...\n";
    auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
    auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
    auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
    auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
    // One thread per element, 256 threads per block, rounded up. N is passed
    // as int so the argument type matches the kernel's first parameter.
    auto kernel = cf.kernel(
      (N+255)/256, 256, 0, saxpy, static_cast<int>(N), 2.0f, dx, dy
    ).name("saxpy");
    // Both host-to-device copies must finish before the kernel runs, and the
    // kernel must finish before either device-to-host copy starts.
    kernel.succeed(h2d_x, h2d_y)
          .precede(d2h_x, d2h_y);
  }).name("saxpy");

  cudaflow.succeed(allocate_x, allocate_y);

  // Add a verification task
  auto verifier = taskflow.emplace([&](){
    max_error = 0.0f;
    for (size_t i = 0; i < N; i++) {
      // std::fabs keeps the comparison in floating point; an unqualified abs
      // may resolve to the integer overload and truncate the error to zero.
      max_error = std::max(max_error, std::fabs(hx[i]-1.0f));
      max_error = std::max(max_error, std::fabs(hy[i]-4.0f));
    }
    std::cout << "saxpy finished with max error: " << max_error << '\n';
  }).succeed(cudaflow).name("verify");

  // free memory
  auto deallocate_x = taskflow.emplace([&](){
    std::cout << "deallocating device x ...\n";
    check(cudaFree(dx), "cudaFree(dx)");
  }).name("deallocate_x");

  auto deallocate_y = taskflow.emplace([&](){
    std::cout << "deallocating device y ...\n";
    check(cudaFree(dy), "cudaFree(dy)");
  }).name("deallocate_y");

  // Device buffers are released only after verification has completed.
  verifier.precede(deallocate_x, deallocate_y);

  executor.run(taskflow).wait();

  std::cout << "dumping the taskflow ...\n";
  taskflow.dump(std::cout);

  // Propagate the verification outcome through the exit status.
  return (max_error <= tolerance) ? EXIT_SUCCESS : EXIT_FAILURE;
}
88
89