#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cudaflow.hpp>

constexpr float eps = 0.0001f;

// --------------------------------------------------------
// Testcase: add2
// --------------------------------------------------------
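// add2 builds a host-device graph: two host tasks allocate the buffers, a
// cudaFlow (or capturer) copies the data over, runs a transform kernel that
// adds 2 to every element, and copies the results back; a verifier task then
// checks the values before the device memory is freed. The same transform is
// repeated with the standalone tf::cuda_transform algorithm.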
template <typename T, typename F>
void add2() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(size_t N=1; N<=(1<<20); N <<= 1) {

    taskflow.clear();

    T v1 = ::rand() % 100;
    T v2 = ::rand() % 100;

    std::vector<T> hx, hy;

    T* dx {nullptr};
    T* dy {nullptr};

    // allocate x
    auto allocate_x = taskflow.emplace([&]() {
      hx.resize(N, v1);
      REQUIRE(cudaMalloc(&dx, N*sizeof(T)) == cudaSuccess);
    }).name("allocate_x");

    // allocate y
    auto allocate_y = taskflow.emplace([&]() {
      hy.resize(N, v2);
      REQUIRE(cudaMalloc(&dy, N*sizeof(T)) == cudaSuccess);
    }).name("allocate_y");

    // add 2 to each element of x and store the result in y
    auto cudaflow = taskflow.emplace([&](F& cf) {
      auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
      auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
      auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
      auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
      auto kernel = cf.transform(dx, dx+N, dy,
        [] __device__ (T x) { return x + 2; }
      );
      kernel.succeed(h2d_x, h2d_y)
            .precede(d2h_x, d2h_y);
    }).name("add2");

    cudaflow.succeed(allocate_x, allocate_y);

    // add a verification task
    auto verifier = taskflow.emplace([&](){
      for (size_t i = 0; i < N; i++) {
        REQUIRE(std::fabs(hx[i] - v1) < eps);
        REQUIRE(std::fabs(hy[i] - (hx[i] + 2)) < eps);
      }
    }).succeed(cudaflow).name("verify");

    // free memory
    auto deallocate_x = taskflow.emplace([&](){
      REQUIRE(cudaFree(dx) == cudaSuccess);
    }).name("deallocate_x");

    auto deallocate_y = taskflow.emplace([&](){
      REQUIRE(cudaFree(dy) == cudaSuccess);
    }).name("deallocate_y");

    verifier.precede(deallocate_x, deallocate_y);

    executor.run(taskflow).wait();

    // standalone transform
    tf::cudaDefaultExecutionPolicy p;

    auto input  = tf::cuda_malloc_shared<T>(N);
    auto output = tf::cuda_malloc_shared<T>(N);
    for(size_t n=0; n<N; n++) {
      input[n] = 1;
    }

    tf::cuda_transform(p, input, input + N, output,
      [] __device__ (T i) { return i+2; }
    );
    cudaStreamSynchronize(0);

    for(size_t n=0; n<N; n++) {
      REQUIRE(output[n] == 3);
    }

    tf::cuda_free(input);
    tf::cuda_free(output);
  }
}

TEST_CASE("add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlow>();
}

TEST_CASE("add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlow>();
}

TEST_CASE("add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlow>();
}

TEST_CASE("capture_add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// for_each
// ----------------------------------------------------------------------------
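// for_each fills a device range with a constant (65536) through a cudaFlow
// task and verifies the copied-back values on the host, then repeats the
// check with the standalone tf::cuda_for_each algorithm on shared memory.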

template <typename T, typename F>
void for_each() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    taskflow.clear();

    T* cpu = nullptr;
    T* gpu = nullptr;

    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(cpu, gpu, n);
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.for_each(
        gpu, gpu+n, [] __device__ (T& val) { val = 65536; }
      );
      h2d.precede(kernel);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(int i=0; i<n; i++) {
      REQUIRE(std::fabs(cpu[i] - (T)65536) < eps);
    }

    std::free(cpu);
    REQUIRE(cudaFree(gpu) == cudaSuccess);

    // standalone algorithm: for_each
    auto g_data = tf::cuda_malloc_shared<T>(n);

    for(int i=0; i<n; i++) {
      g_data[i] = 0;
    }

    tf::cuda_for_each(tf::cudaDefaultExecutionPolicy{},
      g_data, g_data + n, [] __device__ (T& val) { val = 12222; }
    );

    cudaStreamSynchronize(0);

    for(int i=0; i<n; i++) {
      REQUIRE(std::fabs(g_data[i] - (T)12222) < eps);
    }

    tf::cuda_free(g_data);
  }
}

TEST_CASE("cudaflow.for_each.int" * doctest::timeout(300)) {
  for_each<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each.float" * doctest::timeout(300)) {
  for_each<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each.double" * doctest::timeout(300)) {
  for_each<double, tf::cudaFlow>();
}

TEST_CASE("capture.for_each.int" * doctest::timeout(300)) {
  for_each<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each.float" * doctest::timeout(300)) {
  for_each<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each.double" * doctest::timeout(300)) {
  for_each<double, tf::cudaFlowCapturer>();
}

// --------------------------------------------------------
// Testcase: for_each_index
// --------------------------------------------------------
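// for_each_index launches two strided index kernels: one writes 17 to the
// even positions and the other writes -17 to the odd positions, so the host
// verifies the interleaved pattern after the copy back.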

template <typename T, typename F>
void for_each_index() {

  for(int n=10; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T* cpu = nullptr;
    T* gpu = nullptr;

    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
    });

    auto gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(cpu, gpu, n);
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel1 = cf.for_each_index(
        0, n, 2,
        [gpu] __device__ (int i) { gpu[i] = 17; }
      );
      auto kernel2 = cf.for_each_index(
        1, n, 2,
        [=] __device__ (int i) { gpu[i] = -17; }
      );
      h2d.precede(kernel1, kernel2);
      d2h.succeed(kernel1, kernel2);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(int i=0; i<n; i++) {
      if(i % 2 == 0) {
        REQUIRE(std::fabs(cpu[i] - (T)17) < eps);
      }
      else {
        REQUIRE(std::fabs(cpu[i] - (T)(-17)) < eps);
      }
    }

    std::free(cpu);
    REQUIRE(cudaFree(gpu) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.for_each_index.int" * doctest::timeout(300)) {
  for_each_index<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each_index.float" * doctest::timeout(300)) {
  for_each_index<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each_index.double" * doctest::timeout(300)) {
  for_each_index<double, tf::cudaFlow>();
}

TEST_CASE("capture.for_each_index.int" * doctest::timeout(300)) {
  for_each_index<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each_index.float" * doctest::timeout(300)) {
  for_each_index<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each_index.double" * doctest::timeout(300)) {
  for_each_index<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
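// transform adds two shared-memory ranges (10 + 90) element-wise into a
// destination range, built directly on a cudaFlow/capturer object and
// offloaded inline rather than through a taskflow.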

template <typename F>
void transform() {

  F cudaflow;

  for(unsigned n=1; n<=1234567; n = n*2 + 1) {

    cudaflow.clear();

    auto src1 = tf::cuda_malloc_shared<int>(n);
    auto src2 = tf::cuda_malloc_shared<int>(n);
    auto dest = tf::cuda_malloc_shared<int>(n);

    for(unsigned i=0; i<n; i++) {
      src1[i] = 10;
      src2[i] = 90;
      dest[i] = 0;
    }

    cudaflow.transform(src1, src1+n, src2, dest,
      []__device__(int s1, int s2) { return s1 + s2; }
    );

    cudaflow.offload();

    for(unsigned i=0; i<n; i++){
      REQUIRE(dest[i] == src1[i] + src2[i]);
    }

    tf::cuda_free(src1);
    tf::cuda_free(src2);
    tf::cuda_free(dest);
  }
}

TEST_CASE("cudaflow.transform" * doctest::timeout(300)) {
  transform<tf::cudaFlow>();
}

TEST_CASE("capture.transform" * doctest::timeout(300) ) {
  transform<tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// reduce
// ----------------------------------------------------------------------------
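// reduce accumulates a random host vector into a device scalar that a
// single_task first sets to 1000; since reduce folds into the existing
// value of *res, the expected result is sum + 1000.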

template <typename T, typename F>
void reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum-sol+1000) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.reduce.int" * doctest::timeout(300)) {
  reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.reduce.float" * doctest::timeout(300)) {
  reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.reduce.double" * doctest::timeout(300)) {
  reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.reduce.int" * doctest::timeout(300)) {
  reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.reduce.float" * doctest::timeout(300)) {
  reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.reduce.double" * doctest::timeout(300)) {
  reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// uninitialized_reduce
// ----------------------------------------------------------------------------
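// uninitialized_reduce overwrites the result slot instead of accumulating
// into it, so the preset *res = 1000 is discarded and the expected result
// is exactly the host-side sum.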

template <typename T, typename F>
void uninitialized_reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.uninitialized_reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.uninitialized_reduce.int" * doctest::timeout(300)) {
  uninitialized_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.uninitialized_reduce.float" * doctest::timeout(300)) {
  uninitialized_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.uninitialized_reduce.double" * doctest::timeout(300)) {
  uninitialized_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.uninitialized_reduce.int" * doctest::timeout(300)) {
  uninitialized_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.uninitialized_reduce.float" * doctest::timeout(300)) {
  uninitialized_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.uninitialized_reduce.double" * doctest::timeout(300)) {
  uninitialized_reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform_reduce
// ----------------------------------------------------------------------------
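// transform_reduce applies a unary op (a + 1) to every element before
// folding it into *res (preset to 1000), so the expected result is
// sum + n + 1000.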

template <typename T, typename F>
void transform_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum+n+1000-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform_uninitialized_reduce
// ----------------------------------------------------------------------------
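// transform_uninitialized_reduce combines the two variants above: every
// element is transformed by a + 1 and the preset *res is ignored, giving an
// expected result of sum + n.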

template <typename T, typename F>
void transform_uninitialized_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_uninitialized_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    REQUIRE(std::fabs(sum+n-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// scan
// ----------------------------------------------------------------------------
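// scan computes an inclusive and an exclusive prefix sum over 0..N-1, first
// as cudaFlow tasks and then with the standalone algorithms, which require
// a temporary buffer sized by tf::cuda_scan_buffer_size.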

template <typename T, typename F>
void scan() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto data1 = tf::cuda_malloc_shared<T>(N);
    auto data2 = tf::cuda_malloc_shared<T>(N);
    auto scan1 = tf::cuda_malloc_shared<T>(N);
    auto scan2 = tf::cuda_malloc_shared<T>(N);

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // perform the scans
    taskflow.emplace([&](F& cudaflow){
      // inclusive scan
      cudaflow.inclusive_scan(
        data1, data1+N, scan1, [] __device__ (T a, T b){ return a+b; }
      );
      // exclusive scan
      cudaflow.exclusive_scan(
        data2, data2+N, scan2, [] __device__ (T a, T b){ return a+b; }
      );
    });

    executor.run(taskflow).wait();

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]));
    }

    // test standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // allocate temporary buffer
    tf::cudaScopedDeviceMemory<std::byte> temp(
      tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );

    tf::cuda_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data1, data1+N, scan1, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    tf::cuda_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data2, data2+N, scan2, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]));
    }

    REQUIRE(cudaFree(data1) == cudaSuccess);
    REQUIRE(cudaFree(data2) == cudaSuccess);
    REQUIRE(cudaFree(scan1) == cudaSuccess);
    REQUIRE(cudaFree(scan2) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlow>();
}

TEST_CASE("capture.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// transform scan
// ----------------------------------------------------------------------------
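// transform_scan repeats the prefix-sum check, but scales every element by
// 10 before scanning, so scan1[i] == scan1[i-1] + data1[i]*10 must hold.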

template <typename T, typename F>
void transform_scan() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto data1 = tf::cuda_malloc_shared<T>(N);
    auto data2 = tf::cuda_malloc_shared<T>(N);
    auto scan1 = tf::cuda_malloc_shared<T>(N);
    auto scan2 = tf::cuda_malloc_shared<T>(N);

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // perform the transform-scans
    taskflow.emplace([&](F& cudaflow){
      // inclusive scan
      cudaflow.transform_inclusive_scan(
        data1, data1+N, scan1,
        [] __device__ (T a, T b){ return a+b; },
        [] __device__ (T a) { return a*10; }
      );
      // exclusive scan
      cudaflow.transform_exclusive_scan(
        data2, data2+N, scan2,
        [] __device__ (T a, T b){ return a+b; },
        [] __device__ (T a) { return a*10; }
      );
    });

    executor.run(taskflow).wait();

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]*10));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]*10));
    }

    // standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // allocate temporary buffer
    tf::cudaScopedDeviceMemory<std::byte> temp(
      tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );

    tf::cuda_transform_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data1, data1+N, scan1,
      [] __device__ (T a, T b){ return a+b; },
      [] __device__ (T a) { return a*10; },
      temp.data()
    );
    cudaStreamSynchronize(0);

    tf::cuda_transform_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data2, data2+N, scan2,
      [] __device__ (T a, T b){ return a+b; },
      [] __device__ (T a) { return a*10; },
      temp.data()
    );
    cudaStreamSynchronize(0);

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]*10));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]*10));
    }

    REQUIRE(cudaFree(data1) == cudaSuccess);
    REQUIRE(cudaFree(data2) == cudaSuccess);
    REQUIRE(cudaFree(scan1) == cudaSuccess);
    REQUIRE(cudaFree(scan2) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.transform_scan.int" * doctest::timeout(300)) {
  transform_scan<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_scan.size_t" * doctest::timeout(300)) {
  transform_scan<size_t, tf::cudaFlow>();
}

TEST_CASE("capture.transform_scan.int" * doctest::timeout(300)) {
  transform_scan<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_scan.size_t" * doctest::timeout(300)) {
  transform_scan<size_t, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// merge
// ----------------------------------------------------------------------------
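// merge_keys merges two sorted random ranges of length N into a range of
// length 2N, first with the standalone tf::cuda_merge (which needs a
// pre-sized buffer) and then as a cudaFlow/capturer task.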

template <typename T, typename F>
void merge_keys() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<T>(N);
    auto c = tf::cuda_malloc_shared<T>(2*N);

    auto p = tf::cudaDefaultExecutionPolicy{};

    // ----------------- standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = T(rand()%100);
      b[i] = T(rand()%100);
    }

    std::sort(a, a+N);
    std::sort(b, b+N);

    auto bufsz = tf::cuda_merge_buffer_size<decltype(p)>(N, N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);

    tf::cuda_merge(p, a, a+N, b, b+N, c, tf::cuda_less<T>{}, buf.data());
    p.synchronize();

    REQUIRE(std::is_sorted(c, c+2*N));

    // ----------------- cudaFlow / capturer
    for(int i=0; i<N*2; i++) {
      c[i] = rand();
    }

    taskflow.emplace([&](F& cudaflow){
      cudaflow.merge(a, a+N, b, b+N, c, tf::cuda_less<T>{});
    });

    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(c, c+2*N));

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
    REQUIRE(cudaFree(c) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.merge_keys.int" * doctest::timeout(300)) {
  merge_keys<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.merge_keys.float" * doctest::timeout(300)) {
  merge_keys<float, tf::cudaFlow>();
}

TEST_CASE("capture.merge_keys.int" * doctest::timeout(300)) {
  merge_keys<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.merge_keys.float" * doctest::timeout(300)) {
  merge_keys<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// merge_by_keys
// ----------------------------------------------------------------------------
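// merge_keys_values merges odd keys (1, 3, 5, ...) with even keys
// (2, 4, 6, ...), each key carrying its negated value, so the merged
// output must satisfy c_k[i] == i+1 and c_v[i] == -(i+1).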

template <typename T, typename F>
void merge_keys_values() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a_k = tf::cuda_malloc_shared<T>(N);
    auto b_k = tf::cuda_malloc_shared<T>(N);
    auto c_k = tf::cuda_malloc_shared<T>(2*N);
    auto a_v = tf::cuda_malloc_shared<int>(N);
    auto b_v = tf::cuda_malloc_shared<int>(N);
    auto c_v = tf::cuda_malloc_shared<int>(2*N);

    auto p = tf::cudaDefaultExecutionPolicy{};

    // ----------------- standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a_k[i] =  (i*2+1);
      a_v[i] = -(i*2+1);
      b_k[i] =  (i+1)*2;
      b_v[i] = -(i+1)*2;
      c_k[i] = c_k[i+N] = c_v[i] = c_v[i+N] = 0;
    }

    auto bufsz = tf::cuda_merge_buffer_size<decltype(p)>(N, N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);

    tf::cuda_merge_by_key(
      p,
      a_k, a_k+N, a_v,
      b_k, b_k+N, b_v,
      c_k, c_v,
      tf::cuda_less<T>{},
      buf.data()
    );
    p.synchronize();

    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }

    // ----------------- cudaFlow / capturer

    // initialize the data
    for(int i=0; i<N; i++) {
      a_k[i] =  (i*2+1);
      a_v[i] = -(i*2+1);
      b_k[i] =  (i+1)*2;
      b_v[i] = -(i+1)*2;
      c_k[i] = c_k[i+N] = c_v[i] = c_v[i+N] = 0;
    }

    taskflow.emplace([&](F& cudaflow){
      cudaflow.merge_by_key(
        a_k, a_k+N, a_v, b_k, b_k+N, b_v, c_k, c_v, tf::cuda_less<T>{}
      );
    });

    executor.run(taskflow).wait();

    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }

    REQUIRE(cudaFree(a_k) == cudaSuccess);
    REQUIRE(cudaFree(b_k) == cudaSuccess);
    REQUIRE(cudaFree(c_k) == cudaSuccess);
    REQUIRE(cudaFree(a_v) == cudaSuccess);
    REQUIRE(cudaFree(b_v) == cudaSuccess);
    REQUIRE(cudaFree(c_v) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capturer.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// sort
// ----------------------------------------------------------------------------
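// sort_keys sorts N random keys in place, first with the standalone
// tf::cuda_sort and its temporary buffer, then as a cudaFlow/capturer task.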

template <typename T, typename F>
void sort_keys() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto p = tf::cudaDefaultExecutionPolicy{};

    // ----------------- standalone asynchronous algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = T(rand()%1000);
    }

    auto bufsz = tf::cuda_sort_buffer_size<decltype(p), T>(N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);
    tf::cuda_sort(p, a, a+N, tf::cuda_less<T>{}, buf.data());
    p.synchronize();
    REQUIRE(std::is_sorted(a, a+N));

    // ----------------- cudaFlow / capturer
    for(int i=0; i<N; i++) {
      a[i] = T(rand()%1000);
    }

    taskflow.emplace([&](F& cudaflow){
      cudaflow.sort(a, a+N, tf::cuda_less<T>{});
    });

    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(a, a+N));

    REQUIRE(cudaFree(a) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// sort key-value
// ----------------------------------------------------------------------------
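// sort_keys_values sorts keys together with their attached values and
// checks the resulting value permutation against an std::sort over indices:
// ascending with the standalone algorithm, descending with the
// cudaFlow/capturer task.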

template <typename T, typename F>
void sort_keys_values() {

  std::random_device rd;
  std::mt19937 g(rd());

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<int>(N);
    auto p = tf::cudaDefaultExecutionPolicy{};

    std::vector<int> indices(N);

    // ----------------- standalone asynchronous algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = i;
      b[i] = i;
      indices[i] = i;
    }
    std::shuffle(a, a+N, g);

    std::sort(indices.begin(), indices.end(), [&](auto i, auto j){
      return a[i] < a[j];
    });

    auto bufsz = tf::cuda_sort_buffer_size<decltype(p), T, int>(N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);
    tf::cuda_sort_by_key(p, a, a+N, b, tf::cuda_less<T>{}, buf.data());
    p.synchronize();

    REQUIRE(std::is_sorted(a, a+N));
    for(int i=0; i<N; i++) {
      REQUIRE(indices[i] == b[i]);
    }

    // ----------------- cudaFlow / capturer

    // initialize the data
    for(int i=0; i<N; i++) {
      b[i] = i;
      indices[i] = i;
    }
    std::shuffle(a, a+N, g);

    std::sort(indices.begin(), indices.end(), [&](auto i, auto j){
      return a[i] > a[j];
    });

    taskflow.emplace([&](F& cudaflow){
      cudaflow.sort_by_key(a, a+N, b, tf::cuda_greater<T>{});
    });

    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(a, a+N, std::greater<T>{}));
    for(int i=0; i<N; i++) {
      REQUIRE(indices[i] == b[i]);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// find-if
// ----------------------------------------------------------------------------
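// find_if searches 0..N-1 for the value 5000 and stores the index of the
// first match, or N if there is none, into a shared-memory slot that is
// preset to a sentinel (1234).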

template <typename T, typename F>
void find_if() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N += std::max(N/100, 1)) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto r = tf::cuda_malloc_shared<unsigned>(1);
    auto p = tf::cudaDefaultExecutionPolicy{};

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = i;
    }
    *r = 1234;

    // ----------------- standalone asynchronous algorithms

    tf::cuda_find_if(p, a, a+N, r, []__device__(int v){ return v == 5000; });
    p.synchronize();

    if(N <= 5000) {
      REQUIRE(*r == N);
    }
    else {
      REQUIRE(*r == 5000);
    }

    // ----------------- cudaFlow / capturer
    *r = 1234;

    taskflow.emplace([&](F& cudaflow){
      cudaflow.find_if(a, a+N, r, []__device__(int v){ return v == 5000; });
    });

    executor.run(taskflow).wait();

    if(N <= 5000) {
      REQUIRE(*r == N);
    }
    else {
      REQUIRE(*r == 5000);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(r) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlow>();
}

TEST_CASE("capture.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// min_element
// ----------------------------------------------------------------------------
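// min_element stores the index of the smallest of N random values; for an
// empty range (min stays at the sentinel), the expected index is N.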

template <typename T, typename F>
void min_element() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N += std::max(N/10, 1)) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto r = tf::cuda_malloc_shared<unsigned>(1);
    auto p = tf::cudaDefaultExecutionPolicy{};
    auto min = std::numeric_limits<T>::max();

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = rand();
      min = std::min(min, a[i]);
    }
    *r = 1234;

    // ----------------- standalone asynchronous algorithms

    tf::cudaScopedDeviceMemory<std::byte> buf(
      tf::cuda_min_element_buffer_size<decltype(p), T>(N)
    );

    tf::cuda_min_element(
      p, a, a+N, r, tf::cuda_less<T>{}, buf.data()
    );
    p.synchronize();

    if(min != std::numeric_limits<T>::max()) {
      REQUIRE(a[*r] == min);
    }
    else {
      REQUIRE(*r == N);
    }

    // ----------------- cudaFlow / capturer
    *r = 1234;

    taskflow.emplace([&](F& cudaflow){
      cudaflow.min_element(a, a+N, r, tf::cuda_less<T>{});
    });

    executor.run(taskflow).wait();

    if(min != std::numeric_limits<T>::max()) {
      REQUIRE(a[*r] == min);
    }
    else {
      REQUIRE(*r == N);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(r) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlow>();
}

TEST_CASE("capturer.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlowCapturer>();
}

// ----------------------------------------------------------------------------
// max_element
// ----------------------------------------------------------------------------
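// max_element is the mirror of the test above: it stores the index of the
// largest value, again expecting N for an empty range.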

template <typename T, typename F>
void max_element() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N += std::max(N/10, 1)) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto r = tf::cuda_malloc_shared<unsigned>(1);
    auto p = tf::cudaDefaultExecutionPolicy{};
    auto max = std::numeric_limits<T>::lowest();

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = rand();
      max = std::max(max, a[i]);
    }
    *r = 1234;

    // ----------------- standalone asynchronous algorithms

    tf::cudaScopedDeviceMemory<std::byte> buf(
      tf::cuda_max_element_buffer_size<decltype(p), T>(N)
    );

    tf::cuda_max_element(p, a, a+N, r, tf::cuda_less<T>{}, buf.data());
    p.synchronize();

    if(max != std::numeric_limits<T>::lowest()) {
      REQUIRE(a[*r] == max);
    }
    else {
      REQUIRE(*r == N);
    }

    // ----------------- cudaFlow / capturer
    *r = 1234;

    taskflow.emplace([&](F& cudaflow){
      cudaflow.max_element(a, a+N, r, tf::cuda_less<T>{});
    });

    executor.run(taskflow).wait();

    if(max != std::numeric_limits<T>::lowest()) {
      REQUIRE(a[*r] == max);
    }
    else {
      REQUIRE(*r == N);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(r) == cudaSuccess);
  }
}

TEST_CASE("cudaflow.max_element.int" * doctest::timeout(300)) {
  max_element<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.max_element.float" * doctest::timeout(300)) {
  max_element<float, tf::cudaFlow>();
}

TEST_CASE("capturer.max_element.int" * doctest::timeout(300)) {
  max_element<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.max_element.float" * doctest::timeout(300)) {
  max_element<float, tf::cudaFlowCapturer>();
}

/*// --------------------------------------------------------------------------
// row-major transpose
// ----------------------------------------------------------------------------

// Disabled for now - better to use cublasFlowCapturer

template <typename T>
__global__
void verify(const T* din_mat, const T* dout_mat, bool* check, size_t rows, size_t cols) {

  size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
  size_t size = rows * cols;
  for(; tid < size; tid += gridDim.x * blockDim.x) {
    if(din_mat[tid] != dout_mat[tid / cols + (tid % cols) * rows]) {
      *check = false;
      return;
    }
  }
}

template <typename T>
void transpose() {
  tf::Executor executor;

  for(size_t rows = 1; rows <= 7999; rows*=2+3) {
    for(size_t cols = 1; cols <= 8021; cols*=3+5) {

      tf::Taskflow taskflow;
      std::vector<T> hinput_mat(rows * cols);

      std::generate_n(hinput_mat.begin(), rows * cols, [](){ return ::rand(); });

      T* dinput_mat {nullptr};
      T* doutput_mat {nullptr};
      bool* check {nullptr};

      // allocate
      auto allocate = taskflow.emplace([&]() {
        REQUIRE(cudaMalloc(&dinput_mat, (rows * cols) * sizeof(T)) == cudaSuccess);
        REQUIRE(cudaMalloc(&doutput_mat, (rows * cols) * sizeof(T)) == cudaSuccess);
        REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
        *check = true;
      }).name("allocate");

      // transpose
      auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
        auto h2d_input_t = cf.copy(dinput_mat, hinput_mat.data(), rows * cols).name("h2d");

        auto kernel_t = tf::cudaBLAF(cf).transpose(
          dinput_mat,
          doutput_mat,
          rows,
          cols
        );

        auto verify_t = cf.kernel(
          32,
          512,
          0,
          verify<T>,
          dinput_mat,
          doutput_mat,
          check,
          rows,
          cols
        );

        h2d_input_t.precede(kernel_t);
        kernel_t.precede(verify_t);
      }).name("transpose");

      // free memory
      auto deallocate = taskflow.emplace([&](){
        REQUIRE(cudaFree(dinput_mat) == cudaSuccess);
        REQUIRE(cudaFree(doutput_mat) == cudaSuccess);
      }).name("deallocate");

      allocate.precede(cudaflow);
      cudaflow.precede(deallocate);

      executor.run(taskflow).wait();
      REQUIRE(*check);
    }
  }
}

TEST_CASE("transpose.int" * doctest::timeout(300) ) {
  transpose<int>();
}

TEST_CASE("transpose.float" * doctest::timeout(300) ) {
  transpose<float>();
}

TEST_CASE("transpose.double" * doctest::timeout(300) ) {
  transpose<double>();
}

// ----------------------------------------------------------------------------
// row-major matrix multiplication
// ----------------------------------------------------------------------------

template <typename T>
void matmul() {
  tf::Taskflow taskflow;
  tf::Executor executor;

  std::vector<T> a, b, c;

  for(int m=1; m<=1992; m=2*m+1) {
    for(int k=1; k<=1012; k=2*k+3) {
      for(int n=1; n<=1998; n=2*n+8) {

        taskflow.clear();

        T* ha {nullptr};
        T* hb {nullptr};
        T* hc {nullptr};
        T* da {nullptr};
        T* db {nullptr};
        T* dc {nullptr};

        T val_a = ::rand()%5-1;
        T val_b = ::rand()%7-3;

        auto hosta = taskflow.emplace([&](){
          a.resize(m*k);
          std::fill_n(a.begin(), m*k, val_a);
          ha = a.data();
          REQUIRE(cudaMalloc(&da, m*k*sizeof(T)) == cudaSuccess);
        }).name("ha");

        auto hostb = taskflow.emplace([&](){
          b.resize(k*n);
          std::fill_n(b.begin(), k*n, val_b);
          hb = b.data();
          REQUIRE(cudaMalloc(&db, k*n*sizeof(T)) == cudaSuccess);
        }).name("hb");

        auto hostc = taskflow.emplace([&](){
          c.resize(m*n);
          hc = c.data();
          REQUIRE(cudaMalloc(&dc, m*n*sizeof(T)) == cudaSuccess);
        }).name("hc");

        auto cuda = taskflow.emplace([&](tf::cudaFlow& cf){
          auto pa = cf.copy(da, ha, m*k);
          auto pb = cf.copy(db, hb, k*n);

          auto op = tf::cudaBLAF(cf).matmul(
            da, db, dc, m, k, n
          ).name("op");

          auto cc = cf.copy(hc, dc, m*n).name("cc");

          op.precede(cc).succeed(pa, pb);
        });

        cuda.succeed(hosta, hostb, hostc);

        executor.run(taskflow).wait();

        int ans = val_a*val_b*k;
        for(const auto& x : c) {
          REQUIRE((int)x == ans);
        }

        REQUIRE(cudaFree(da) == cudaSuccess);
        REQUIRE(cudaFree(db) == cudaSuccess);
        REQUIRE(cudaFree(dc) == cudaSuccess);
      }
    }
  }
}

TEST_CASE("matmul.int" * doctest::timeout(300) ) {
  matmul<int>();
}

TEST_CASE("matmul.float" * doctest::timeout(300) ) {
  matmul<float>();
}

TEST_CASE("matmul.double" * doctest::timeout(300) ) {
  matmul<double>();
}*/