1 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
2
3 #include <doctest.h>
4 #include <taskflow/taskflow.hpp>
5 #include <taskflow/cudaflow.hpp>
6
// Absolute tolerance used for all floating-point comparisons in these tests.
constexpr float eps = 0.0001f;
8
9 // --------------------------------------------------------
10 // Testcase: add2
11 // --------------------------------------------------------
12 template <typename T, typename F>
add2()13 void add2() {
14
15 //const unsigned N = 1<<20;
16
17 tf::Taskflow taskflow;
18 tf::Executor executor;
19
20 for(size_t N=1; N<=(1<<20); N <<= 1) {
21
22 taskflow.clear();
23
24 T v1 = ::rand() % 100;
25 T v2 = ::rand() % 100;
26
27 std::vector<T> hx, hy;
28
29 T* dx {nullptr};
30 T* dy {nullptr};
31
32 // allocate x
33 auto allocate_x = taskflow.emplace([&]() {
34 hx.resize(N, v1);
35 REQUIRE(cudaMalloc(&dx, N*sizeof(T)) == cudaSuccess);
36 }).name("allocate_x");
37
38 // allocate y
39 auto allocate_y = taskflow.emplace([&]() {
40 hy.resize(N, v2);
41 REQUIRE(cudaMalloc(&dy, N*sizeof(T)) == cudaSuccess);
42 }).name("allocate_y");
43
44 // axpy
45 auto cudaflow = taskflow.emplace([&](F& cf) {
46 auto h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
47 auto h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
48 auto d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
49 auto d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
50 //auto kernel = cf.add(dx, N, dx, dy);
51 auto kernel = cf.transform(dx, dx+N, dy,
52 [] __device__ (T x) { return x + 2; }
53 );
54 kernel.succeed(h2d_x, h2d_y)
55 .precede(d2h_x, d2h_y);
56 }).name("saxpy");
57
58 cudaflow.succeed(allocate_x, allocate_y);
59
60 // Add a verification task
61 auto verifier = taskflow.emplace([&](){
62 for (size_t i = 0; i < N; i++) {
63 REQUIRE(std::fabs(hx[i] - v1) < eps);
64 REQUIRE(std::fabs(hy[i] - (hx[i] + 2)) < eps);
65 }
66 }).succeed(cudaflow).name("verify");
67
68 // free memory
69 auto deallocate_x = taskflow.emplace([&](){
70 REQUIRE(cudaFree(dx) == cudaSuccess);
71 }).name("deallocate_x");
72
73 auto deallocate_y = taskflow.emplace([&](){
74 REQUIRE(cudaFree(dy) == cudaSuccess);
75 }).name("deallocate_y");
76
77 verifier.precede(deallocate_x, deallocate_y);
78
79 executor.run(taskflow).wait();
80
81 // standalone tramsform
82 tf::cudaDefaultExecutionPolicy p;
83
84 auto input = tf::cuda_malloc_shared<T>(N);
85 auto output = tf::cuda_malloc_shared<T>(N);
86 for(size_t n=0; n<N; n++) {
87 input[n] = 1;
88 }
89
90 tf::cuda_transform(p, input, input + N, output,
91 [] __device__ (T i) { return i+2; }
92 );
93 cudaStreamSynchronize(0);
94
95 for(size_t n=0; n<N; n++) {
96 REQUIRE(output[n] == 3);
97 }
98 }
99 }
100
// add2 registrations: int/float/double through tf::cudaFlow, then through
// tf::cudaFlowCapturer.
TEST_CASE("add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlow>();
}

TEST_CASE("add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlow>();
}

TEST_CASE("add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlow>();
}

TEST_CASE("capture_add2.int" * doctest::timeout(300)) {
  add2<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.float" * doctest::timeout(300)) {
  add2<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture_add2.double" * doctest::timeout(300)) {
  add2<double, tf::cudaFlowCapturer>();
}
124
125 // ----------------------------------------------------------------------------
126 // for_each
127 // ----------------------------------------------------------------------------
128
// Fills a device array with 65536 via F::for_each (h2d -> kernel -> d2h)
// and verifies on the host, then exercises the standalone tf::cuda_for_each
// algorithm on unified memory.
template <typename T, typename F>
void for_each() {

  tf::Taskflow taskflow;
  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    taskflow.clear();

    T* cpu = nullptr;
    T* gpu = nullptr;

    // allocate a zeroed host buffer and a device buffer
    auto cputask = taskflow.emplace([&](){
      cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(cpu, gpu, n);
      auto h2d = cf.copy(gpu, cpu, n);
      auto kernel = cf.for_each(
        gpu, gpu+n, [] __device__ (T& val) { val = 65536; }
      );
      h2d.precede(kernel);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    for(int i=0; i<n; i++) {
      REQUIRE(std::fabs(cpu[i] - (T)65536) < eps);
    }

    std::free(cpu);
    REQUIRE(cudaFree(gpu) == cudaSuccess);

    // standard algorithm: for_each
    auto g_data = tf::cuda_malloc_shared<T>(n);

    for(int i=0; i<n; i++) {
      g_data[i] = 0;
    }

    tf::cuda_for_each(tf::cudaDefaultExecutionPolicy{},
      g_data, g_data + n, [] __device__ (T& val) { val = 12222; }
    );

    // the standalone algorithm runs on the default stream
    cudaStreamSynchronize(0);

    for(int i=0; i<n; i++) {
      REQUIRE(std::fabs(g_data[i] - (T)12222) < eps);
    }

    tf::cuda_free(g_data);
  }
}
190
// for_each registrations over both interfaces.
TEST_CASE("cudaflow.for_each.int" * doctest::timeout(300)) {
  for_each<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each.float" * doctest::timeout(300)) {
  for_each<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each.double" * doctest::timeout(300)) {
  for_each<double, tf::cudaFlow>();
}

TEST_CASE("capture.for_each.int" * doctest::timeout(300)) {
  for_each<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each.float" * doctest::timeout(300)) {
  for_each<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each.double" * doctest::timeout(300)) {
  for_each<double, tf::cudaFlowCapturer>();
}
214
215 // --------------------------------------------------------
216 // Testcase: for_each_index
217 // --------------------------------------------------------
218
219 template <typename T, typename F>
for_each_index()220 void for_each_index() {
221
222 for(int n=10; n<=1234567; n = n*2 + 1) {
223
224 tf::Taskflow taskflow;
225 tf::Executor executor;
226
227 T* cpu = nullptr;
228 T* gpu = nullptr;
229
230 auto cputask = taskflow.emplace([&](){
231 cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
232 REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
233 });
234
235 auto gputask = taskflow.emplace([&](F& cf) {
236 auto d2h = cf.copy(cpu, gpu, n);
237 auto h2d = cf.copy(gpu, cpu, n);
238 //auto kernel = cf.for_each_index(gpu, n, [] __device__ (T& value){ value = 17; });
239 auto kernel1 = cf.for_each_index(
240 0, n, 2,
241 [gpu] __device__ (int i) { gpu[i] = 17; }
242 );
243 auto kernel2 = cf.for_each_index(
244 1, n, 2,
245 [=] __device__ (int i) { gpu[i] = -17; }
246 );
247 h2d.precede(kernel1, kernel2);
248 d2h.succeed(kernel1, kernel2);
249 });
250
251 cputask.precede(gputask);
252
253 executor.run(taskflow).wait();
254
255 for(int i=0; i<n; i++) {
256 if(i % 2 == 0) {
257 REQUIRE(std::fabs(cpu[i] - (T)17) < eps);
258 }
259 else {
260 REQUIRE(std::fabs(cpu[i] - (T)(-17)) < eps);
261 }
262 }
263
264 std::free(cpu);
265 REQUIRE(cudaFree(gpu) == cudaSuccess);
266 }
267 }
268
// for_each_index registrations over both interfaces.
TEST_CASE("cudaflow.for_each_index.int" * doctest::timeout(300)) {
  for_each_index<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each_index.float" * doctest::timeout(300)) {
  for_each_index<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.for_each_index.double" * doctest::timeout(300)) {
  for_each_index<double, tf::cudaFlow>();
}

TEST_CASE("capture.for_each_index.int" * doctest::timeout(300)) {
  for_each_index<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each_index.float" * doctest::timeout(300)) {
  for_each_index<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.for_each_index.double" * doctest::timeout(300)) {
  for_each_index<double, tf::cudaFlowCapturer>();
}
292
293 // ----------------------------------------------------------------------------
294 // transform
295 // ----------------------------------------------------------------------------
296
297 template <typename F>
transform()298 void transform() {
299
300 F cudaflow;
301
302 for(unsigned n=1; n<=1234567; n = n*2 + 1) {
303
304 cudaflow.clear();
305
306 auto src1 = tf::cuda_malloc_shared<int>(n);
307 auto src2 = tf::cuda_malloc_shared<int>(n);
308 auto dest = tf::cuda_malloc_shared<int>(n);
309
310 for(unsigned i=0; i<n; i++) {
311 src1[i] = 10;
312 src2[i] = 90;
313 dest[i] = 0;
314 }
315
316 cudaflow.transform(src1, src1+n, src2, dest,
317 []__device__(int s1, int s2) { return s1 + s2; }
318 );
319
320 cudaflow.offload();
321
322 for(unsigned i=0; i<n; i++){
323 REQUIRE(dest[i] == src1[i] + src2[i]);
324 }
325 }
326 }
327
// transform registrations over both interfaces.
TEST_CASE("cudaflow.transform" * doctest::timeout(300)) {
  transform<tf::cudaFlow>();
}

TEST_CASE("capture.transform" * doctest::timeout(300) ) {
  transform<tf::cudaFlowCapturer>();
}
335
336 // ----------------------------------------------------------------------------
337 // reduce
338 // ----------------------------------------------------------------------------
339
// Sums n random values on the GPU with F::reduce. A single_task seeds
// *res = 1000 first, and reduce folds into the existing value, so the
// copied-back result satisfies sol == sum + 1000.
template <typename T, typename F>
void reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      // seed the reduction target before the kernel runs
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) mutable {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // reduce incorporates the seeded initial value: sol == sum + 1000
    REQUIRE(std::fabs(sum-sol+1000) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}
393
394 TEST_CASE("cudaflow.reduce.int" * doctest::timeout(300)) {
395 reduce<int, tf::cudaFlow>();
396 }
397
398 TEST_CASE("cudaflow.reduce.float" * doctest::timeout(300)) {
399 reduce<float, tf::cudaFlow>();
400 }
401
402 TEST_CASE("cudaflow.reduce.double" * doctest::timeout(300)) {
403 reduce<double, tf::cudaFlow>();
404 }
405
406 TEST_CASE("capture.reduce.int" * doctest::timeout(300)) {
407 reduce<int, tf::cudaFlowCapturer>();
408 }
409
410 TEST_CASE("capture.reduce.float" * doctest::timeout(300)) {
411 reduce<float, tf::cudaFlowCapturer>();
412 }
413
414 TEST_CASE("capture.reduce.double" * doctest::timeout(300)) {
415 reduce<double, tf::cudaFlow>();
416 }
417
418 // ----------------------------------------------------------------------------
419 // uninitialized_reduce
420 // ----------------------------------------------------------------------------
421
// Same as reduce() but uses F::uninitialized_reduce, which ignores the
// pre-existing *res value (seeded to 1000 to prove it is discarded);
// the copied-back result satisfies sol == sum.
template <typename T, typename F>
void uninitialized_reduce() {

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;
    tf::Executor executor;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      // seed *res; uninitialized_reduce must overwrite it
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.uninitialized_reduce(
        gpu, gpu+n, res, [] __device__ (T a, T b) {
          return a + b;
        }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // the seeded 1000 must NOT appear in the result
    REQUIRE(std::fabs(sum-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}
475
476 TEST_CASE("cudaflow.uninitialized_reduce.int" * doctest::timeout(300)) {
477 uninitialized_reduce<int, tf::cudaFlow>();
478 }
479
480 TEST_CASE("cudaflow.uninitialized_reduce.float" * doctest::timeout(300)) {
481 uninitialized_reduce<float, tf::cudaFlow>();
482 }
483
484 TEST_CASE("cudaflow.uninitialized_reduce.double" * doctest::timeout(300)) {
485 uninitialized_reduce<double, tf::cudaFlow>();
486 }
487
488 TEST_CASE("capture.uninitialized_reduce.int" * doctest::timeout(300)) {
489 uninitialized_reduce<int, tf::cudaFlowCapturer>();
490 }
491
492 TEST_CASE("capture.uninitialized_reduce.float" * doctest::timeout(300)) {
493 uninitialized_reduce<float, tf::cudaFlowCapturer>();
494 }
495
496 TEST_CASE("capture.uninitialized_reduce.double" * doctest::timeout(300)) {
497 uninitialized_reduce<double, tf::cudaFlow>();
498 }
499
500 // ----------------------------------------------------------------------------
501 // transform_reduce
502 // ----------------------------------------------------------------------------
503
// Like reduce() but each element is first transformed by +1 via the unary
// op of F::transform_reduce, and *res is seeded to 1000; the copied-back
// result satisfies sol == sum + n + 1000.
template <typename T, typename F>
void transform_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      // seed the reduction target before the kernel runs
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // each of the n elements gained +1, plus the seeded 1000
    REQUIRE(std::fabs(sum+n+1000-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}
558
// transform_reduce registrations over both interfaces.
TEST_CASE("cudaflow.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_reduce.int" * doctest::timeout(300)) {
  transform_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.float" * doctest::timeout(300)) {
  transform_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_reduce.double" * doctest::timeout(300)) {
  transform_reduce<double, tf::cudaFlowCapturer>();
}
582
583 // ----------------------------------------------------------------------------
584 // transform_uninitialized_reduce
585 // ----------------------------------------------------------------------------
586
// Like transform_reduce() but uses F::transform_uninitialized_reduce, so
// the seeded *res = 1000 is discarded; the copied-back result satisfies
// sol == sum + n.
template <typename T, typename F>
void transform_uninitialized_reduce() {

  tf::Executor executor;

  for(int n=1; n<=1234567; n = n*2 + 1) {

    tf::Taskflow taskflow;

    T sum = 0;

    std::vector<T> cpu(n);
    for(auto& i : cpu) {
      i = ::rand()%100-50;
      sum += i;
    }

    T sol;

    T* gpu = nullptr;
    T* res = nullptr;

    auto cputask = taskflow.emplace([&](){
      REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
      REQUIRE(cudaMalloc(&res, 1*sizeof(T)) == cudaSuccess);
    });

    tf::Task gputask;

    gputask = taskflow.emplace([&](F& cf) {
      auto d2h = cf.copy(&sol, res, 1);
      auto h2d = cf.copy(gpu, cpu.data(), n);
      // seed *res; the uninitialized variant must overwrite it
      auto set = cf.single_task([res] __device__ () mutable {
        *res = 1000;
      });
      auto kernel = cf.transform_uninitialized_reduce(
        gpu, gpu+n, res,
        [] __device__ (T a, T b) { return a + b; },
        [] __device__ (T a) { return a + 1; }
      );
      kernel.succeed(h2d, set);
      d2h.succeed(kernel);
    });

    cputask.precede(gputask);

    executor.run(taskflow).wait();

    // +1 per element, seeded value discarded
    REQUIRE(std::fabs(sum+n-sol) < 0.0001);

    REQUIRE(cudaFree(gpu) == cudaSuccess);
    REQUIRE(cudaFree(res) == cudaSuccess);
  }
}
641
// transform_uninitialized_reduce registrations over both interfaces.
TEST_CASE("cudaflow.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlow>();
}

TEST_CASE("cudaflow.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlow>();
}

TEST_CASE("capture.transform_uninitialized_reduce.int" * doctest::timeout(300)) {
  transform_uninitialized_reduce<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.float" * doctest::timeout(300)) {
  transform_uninitialized_reduce<float, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.transform_uninitialized_reduce.double" * doctest::timeout(300)) {
  transform_uninitialized_reduce<double, tf::cudaFlowCapturer>();
}
665
666 // ----------------------------------------------------------------------------
667 // scan
668 // ----------------------------------------------------------------------------
669
// Runs inclusive and exclusive +-scans over 0..N-1 through F, verifies the
// recurrences, then repeats with the standalone tf::cuda_inclusive_scan /
// tf::cuda_exclusive_scan using an explicit temporary buffer.
template <typename T, typename F>
void scan() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto data1 = tf::cuda_malloc_shared<T>(N);
    auto data2 = tf::cuda_malloc_shared<T>(N);
    auto scan1 = tf::cuda_malloc_shared<T>(N);
    auto scan2 = tf::cuda_malloc_shared<T>(N);

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // perform the scans through the cudaFlow/capturer interface
    taskflow.emplace([&](F& cudaflow){
      // inclusive scan
      cudaflow.inclusive_scan(
        data1, data1+N, scan1, [] __device__ (T a, T b){ return a+b; }
      );
      // exclusive scan
      cudaflow.exclusive_scan(
        data2, data2+N, scan2, [] __device__ (T a, T b){ return a+b; }
      );
    });

    executor.run(taskflow).wait();

    // inclusive: scan[i] = scan[i-1] + data[i]; exclusive shifts by one
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]));
    }

    // test standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      data1[i] = T(i);
      data2[i] = T(i);
      scan1[i] = 0;
      scan2[i] = 0;
    }

    // allocate temporary buffer required by the standalone scans
    tf::cudaScopedDeviceMemory<std::byte> temp(
      tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
    );

    tf::cuda_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data1, data1+N, scan1, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    tf::cuda_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
      data2, data2+N, scan2, tf::cuda_plus<T>{}, temp.data()
    );
    cudaStreamSynchronize(0);

    // inspect
    for(int i=1; i<N; i++) {
      REQUIRE(scan1[i] == (scan1[i-1]+data1[i]));
      REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]));
    }

    REQUIRE(cudaFree(data1) == cudaSuccess);
    REQUIRE(cudaFree(data2) == cudaSuccess);
    REQUIRE(cudaFree(scan1) == cudaSuccess);
    REQUIRE(cudaFree(scan2) == cudaSuccess);
  }
}
750
// scan registrations over both interfaces.
TEST_CASE("cudaflow.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlow>();
}

TEST_CASE("capture.scan.int" * doctest::timeout(300)) {
  scan<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.scan.size_t" * doctest::timeout(300)) {
  scan<size_t, tf::cudaFlowCapturer>();
}
766
767 // ----------------------------------------------------------------------------
768 // transofrm scan
769 // ----------------------------------------------------------------------------
770
771 template <typename T, typename F>
transform_scan()772 void transform_scan() {
773
774 tf::Executor executor;
775 tf::Taskflow taskflow;
776
777 for(int N=1; N<=1234567; N = N*2 + 1) {
778
779 taskflow.clear();
780
781 auto data1 = tf::cuda_malloc_shared<T>(N);
782 auto data2 = tf::cuda_malloc_shared<T>(N);
783 auto scan1 = tf::cuda_malloc_shared<T>(N);
784 auto scan2 = tf::cuda_malloc_shared<T>(N);
785
786 // initialize the data
787 for(int i=0; i<N; i++) {
788 data1[i] = T(i);
789 data2[i] = T(i);
790 scan1[i] = 0;
791 scan2[i] = 0;
792 }
793
794 // perform reduction
795 taskflow.emplace([&](F& cudaflow){
796 // inclusive scan
797 cudaflow.transform_inclusive_scan(
798 data1, data1+N, scan1,
799 [] __device__ (T a, T b){ return a+b; },
800 [] __device__ (T a) { return a*10; }
801 );
802 // exclusive scan
803 cudaflow.transform_exclusive_scan(
804 data2, data2+N, scan2,
805 [] __device__ (T a, T b){ return a+b; },
806 [] __device__ (T a) { return a*10; }
807 );
808 });
809
810 executor.run(taskflow).wait();
811
812 // standalone algorithms
813
814 // initialize the data
815 for(int i=0; i<N; i++) {
816 data1[i] = T(i);
817 data2[i] = T(i);
818 scan1[i] = 0;
819 scan2[i] = 0;
820 }
821
822 // allocate temporary buffer
823 tf::cudaScopedDeviceMemory<std::byte> temp(
824 tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, T>(N)
825 );
826
827 tf::cuda_transform_inclusive_scan(tf::cudaDefaultExecutionPolicy{},
828 data1, data1+N, scan1,
829 [] __device__ (T a, T b){ return a+b; },
830 [] __device__ (T a) { return a*10; },
831 temp.data()
832 );
833 cudaStreamSynchronize(0);
834
835 tf::cuda_transform_exclusive_scan(tf::cudaDefaultExecutionPolicy{},
836 data2, data2+N, scan2,
837 [] __device__ (T a, T b){ return a+b; },
838 [] __device__ (T a) { return a*10; },
839 temp.data()
840 );
841 cudaStreamSynchronize(0);
842
843 // inspect
844 for(int i=1; i<N; i++) {
845 REQUIRE(scan1[i] == (scan1[i-1]+data1[i]*10));
846 REQUIRE(scan2[i] == (scan2[i-1]+data2[i-1]*10));
847 }
848
849 REQUIRE(cudaFree(data1) == cudaSuccess);
850 REQUIRE(cudaFree(data2) == cudaSuccess);
851 REQUIRE(cudaFree(scan1) == cudaSuccess);
852 REQUIRE(cudaFree(scan2) == cudaSuccess);
853 }
854 }
855
856 TEST_CASE("cudaflow.scan.int" * doctest::timeout(300)) {
857 transform_scan<int, tf::cudaFlow>();
858 }
859
860 TEST_CASE("cudaflow.scan.size_t" * doctest::timeout(300)) {
861 transform_scan<size_t, tf::cudaFlow>();
862 }
863
864 TEST_CASE("capture.transform_scan.int" * doctest::timeout(300)) {
865 transform_scan<int, tf::cudaFlowCapturer>();
866 }
867
868 TEST_CASE("capture.transform_scan.size_t" * doctest::timeout(300)) {
869 transform_scan<size_t, tf::cudaFlowCapturer>();
870 }
871
872 // ----------------------------------------------------------------------------
873 // merge
874 // ----------------------------------------------------------------------------
875
// Merges two sorted random key ranges (each of size N) into c (size 2N),
// first via the standalone tf::cuda_merge and then via F::merge; both
// results must come out sorted.
template <typename T, typename F>
void merge_keys() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<T>(N);
    auto c = tf::cuda_malloc_shared<T>(2*N);

    auto p = tf::cudaDefaultExecutionPolicy{};

    // ----------------- standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = T(rand()%100);
      b[i] = T(rand()%100);
    }

    // merge inputs must be sorted
    std::sort(a, a+N);
    std::sort(b, b+N);

    // temporary device buffer required by the standalone merge
    auto bufsz = tf::cuda_merge_buffer_size<decltype(p)>(N, N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);

    tf::cuda_merge(p, a, a+N, b, b+N, c, tf::cuda_less<T>{}, buf.data());
    p.synchronize();

    REQUIRE(std::is_sorted(c, c+2*N));

    // ----------------- cudaFlow capturer
    // scramble c so a stale result cannot pass the check
    for(int i=0; i<N*2; i++) {
      c[i] = rand();
    }

    taskflow.emplace([&](F& cudaflow){
      cudaflow.merge(a, a+N, b, b+N, c, tf::cuda_less<T>{});
    });

    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(c, c+2*N));

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
    REQUIRE(cudaFree(c) == cudaSuccess);
  }
}
929
930 TEST_CASE("cudaflow.merge_keys.int" * doctest::timeout(300)) {
931 merge_keys<int, tf::cudaFlow>();
932 }
933
934 TEST_CASE("cudaflow.merge_keys.float" * doctest::timeout(300)) {
935 merge_keys<float, tf::cudaFlow>();
936 }
937
938 TEST_CASE("cudaflow.merge_keys.int" * doctest::timeout(300)) {
939 merge_keys<int, tf::cudaFlow>();
940 }
941
942 TEST_CASE("capture.merge_keys.float" * doctest::timeout(300)) {
943 merge_keys<float, tf::cudaFlowCapturer>();
944 }
945
946 // ----------------------------------------------------------------------------
947 // merge_by_keys
948 // ----------------------------------------------------------------------------
949
// Merges two key-value ranges whose keys interleave (odd keys 1,3,5,... in
// a, even keys 2,4,6,... in b, values the negated keys), so the merged
// output must be exactly c_k[i] == i+1 and c_v[i] == -(i+1). Runs the
// standalone tf::cuda_merge_by_key, then F::merge_by_key.
template <typename T, typename F>
void merge_keys_values() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=0; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a_k = tf::cuda_malloc_shared<T>(N);
    auto b_k = tf::cuda_malloc_shared<T>(N);
    auto c_k = tf::cuda_malloc_shared<T>(2*N);
    auto a_v = tf::cuda_malloc_shared<int>(N);
    auto b_v = tf::cuda_malloc_shared<int>(N);
    auto c_v = tf::cuda_malloc_shared<int>(2*N);

    auto p = tf::cudaDefaultExecutionPolicy{};

    // ----------------- standalone algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a_k[i] = (i*2+1);
      a_v[i] = -(i*2+1);
      b_k[i] = (i+1)*2;
      b_v[i] = -(i+1)*2;
      c_k[i] = c_k[i+N] = c_v[i] = c_v[i+N] = 0;
    }

    // temporary device buffer required by the standalone merge
    auto bufsz = tf::cuda_merge_buffer_size<decltype(p)>(N, N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);

    tf::cuda_merge_by_key(
      p,
      a_k, a_k+N, a_v,
      b_k, b_k+N, b_v,
      c_k, c_v,
      tf::cuda_less<T>{},
      buf.data()
    );
    p.synchronize();

    // merged keys must be 1..2N with matching negated values
    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }

    // ----------------- cudaFlow capturer
    // initialize the data
    for(int i=0; i<N; i++) {
      a_k[i] = (i*2+1);
      a_v[i] = -(i*2+1);
      b_k[i] = (i+1)*2;
      b_v[i] = -(i+1)*2;
      c_k[i] = c_k[i+N] = c_v[i] = c_v[i+N] = 0;
    }

    taskflow.emplace([&](F& cudaflow){
      cudaflow.merge_by_key(
        a_k, a_k+N, a_v, b_k, b_k+N, b_v, c_k, c_v, tf::cuda_less<T>{}
      );
    });

    executor.run(taskflow).wait();

    for(int i=0; i<2*N; i++) {
      REQUIRE(c_k[i] == (i+1));
      REQUIRE(c_v[i] == -(i+1));
    }

    REQUIRE(cudaFree(a_k) == cudaSuccess);
    REQUIRE(cudaFree(b_k) == cudaSuccess);
    REQUIRE(cudaFree(c_k) == cudaSuccess);
    REQUIRE(cudaFree(a_v) == cudaSuccess);
    REQUIRE(cudaFree(b_v) == cudaSuccess);
    REQUIRE(cudaFree(c_v) == cudaSuccess);
  }
}
1029
// merge_keys_values registrations over both interfaces.
TEST_CASE("cudaflow.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capturer.merge_keys_values.int" * doctest::timeout(300)) {
  merge_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.merge_keys_values.float" * doctest::timeout(300)) {
  merge_keys_values<float, tf::cudaFlowCapturer>();
}
1045
1046 // ----------------------------------------------------------------------------
1047 // sort
1048 // ----------------------------------------------------------------------------
1049
1050 template <typename T, typename F>
sort_keys()1051 void sort_keys() {
1052
1053 tf::Executor executor;
1054 tf::Taskflow taskflow;
1055
1056 for(int N=0; N<=1234567; N = N*2 + 1) {
1057
1058 taskflow.clear();
1059
1060 auto a = tf::cuda_malloc_shared<T>(N);
1061 auto p = tf::cudaDefaultExecutionPolicy{};
1062
1063 // ----------------- standalone asynchronous algorithms
1064
1065 // initialize the data
1066 for(int i=0; i<N; i++) {
1067 a[i] = T(rand()%1000);
1068 }
1069
1070 auto bufsz = tf::cuda_sort_buffer_size<decltype(p), T>(N);
1071 tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);
1072 tf::cuda_sort(p, a, a+N, tf::cuda_less<T>{}, buf.data());
1073 p.synchronize();
1074 REQUIRE(std::is_sorted(a, a+N));
1075
1076 // ----------------- cudaflow capturer
1077 for(int i=0; i<N; i++) {
1078 a[i] = T(rand()%1000);
1079 }
1080
1081 taskflow.emplace([&](F& cudaflow){
1082 cudaflow.sort(a, a+N, tf::cuda_less<T>{});
1083 });
1084
1085 executor.run(taskflow).wait();
1086
1087 REQUIRE(std::is_sorted(a, a+N));
1088
1089 REQUIRE(cudaFree(a) == cudaSuccess);
1090 }
1091 }
1092
// sort_keys registrations over both interfaces.
TEST_CASE("cudaflow.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys.int" * doctest::timeout(300)) {
  sort_keys<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys.float" * doctest::timeout(300)) {
  sort_keys<float, tf::cudaFlowCapturer>();
}
1108
1109 // ----------------------------------------------------------------------------
1110 // sort key-value
1111 // ----------------------------------------------------------------------------
1112
// Sorts key-value pairs: ascending via the standalone tf::cuda_sort_by_key,
// then descending via F::sort_by_key. The value array starts as the
// identity permutation, so after sorting it must equal the host-computed
// index sort of the keys.
template <typename T, typename F>
void sort_keys_values() {

  std::random_device rd;
  std::mt19937 g(rd());

  tf::Executor executor;
  tf::Taskflow taskflow;

  for(int N=1; N<=1234567; N = N*2 + 1) {

    taskflow.clear();

    auto a = tf::cuda_malloc_shared<T>(N);
    auto b = tf::cuda_malloc_shared<int>(N);
    auto p = tf::cudaDefaultExecutionPolicy{};

    std::vector<int> indices(N);

    // ----------------- standalone asynchronous algorithms

    // initialize the data
    for(int i=0; i<N; i++) {
      a[i] = i;
      b[i] = i;
      indices[i] = i;
    }
    std::shuffle(a, a+N, g);

    // expected value order: indices sorted by ascending key
    std::sort(indices.begin(), indices.end(), [&](auto i, auto j){
      return a[i] < a[j];
    });

    auto bufsz = tf::cuda_sort_buffer_size<decltype(p), T, int>(N);
    tf::cudaScopedDeviceMemory<std::byte> buf(bufsz);
    tf::cuda_sort_by_key(p, a, a+N, b, tf::cuda_less<T>{}, buf.data());
    p.synchronize();

    REQUIRE(std::is_sorted(a, a+N));
    for(int i=0; i<N; i++) {
      REQUIRE(indices[i] == b[i]);
    }

    // ----------------- cudaflow capturer
    // re-initialize the values and re-shuffle the keys
    for(int i=0; i<N; i++) {
      b[i] = i;
      indices[i] = i;
    }
    std::shuffle(a, a+N, g);

    // expected value order: indices sorted by descending key
    std::sort(indices.begin(), indices.end(), [&](auto i, auto j){
      return a[i] > a[j];
    });

    taskflow.emplace([&](F& cudaflow){
      cudaflow.sort_by_key(a, a+N, b, tf::cuda_greater<T>{});
    });

    executor.run(taskflow).wait();

    REQUIRE(std::is_sorted(a, a+N, std::greater<T>{}));
    for(int i=0; i<N; i++) {
      REQUIRE(indices[i] == b[i]);
    }

    REQUIRE(cudaFree(a) == cudaSuccess);
    REQUIRE(cudaFree(b) == cudaSuccess);
  }
}
1185
// sort_keys_values registrations over both interfaces.
TEST_CASE("cudaflow.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlow>();
}

TEST_CASE("capture.sort_keys_values.int" * doctest::timeout(300)) {
  sort_keys_values<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.sort_keys_values.float" * doctest::timeout(300)) {
  sort_keys_values<float, tf::cudaFlowCapturer>();
}
1201
1202 // ----------------------------------------------------------------------------
1203 // find-if
1204 // ----------------------------------------------------------------------------
1205
1206 template <typename T, typename F>
find_if()1207 void find_if() {
1208
1209 tf::Executor executor;
1210 tf::Taskflow taskflow;
1211
1212 for(int N=0; N<=1234567; N += std::max(N/100, 1)) {
1213
1214 taskflow.clear();
1215
1216 auto a = tf::cuda_malloc_shared<T>(N);
1217 auto r = tf::cuda_malloc_shared<unsigned>(1);
1218 auto p = tf::cudaDefaultExecutionPolicy{};
1219
1220 // initialize the data
1221 for(int i=0; i<N; i++) {
1222 a[i] = i;
1223 }
1224 *r = 1234;
1225
1226 // ----------------- standalone asynchronous algorithms
1227
1228 tf::cuda_find_if(p, a, a+N, r, []__device__(int v){ return v == 5000; });
1229 p.synchronize();
1230
1231 if(N <= 5000) {
1232 REQUIRE(*r == N);
1233 }
1234 else {
1235 REQUIRE(*r == 5000);
1236 }
1237
1238 // ----------------- cudaflow capturer
1239 *r = 1234;
1240
1241 taskflow.emplace([&](F& cudaflow){
1242 cudaflow.find_if(a, a+N, r, []__device__(int v){ return v == 5000; });
1243 });
1244
1245 executor.run(taskflow).wait();
1246
1247 if(N <= 5000) {
1248 REQUIRE(*r == N);
1249 }
1250 else {
1251 REQUIRE(*r == 5000);
1252 }
1253
1254 REQUIRE(cudaFree(a) == cudaSuccess);
1255 REQUIRE(cudaFree(r) == cudaSuccess);
1256 }
1257 }
1258
// instantiate find_if for both graph construction modes and two element types
TEST_CASE("cudaflow.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlow>();
}

TEST_CASE("capture.find_if.int" * doctest::timeout(300)) {
  find_if<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capture.find_if.float" * doctest::timeout(300)) {
  find_if<float, tf::cudaFlowCapturer>();
}
1274
1275 // ----------------------------------------------------------------------------
1276 // min_element
1277 // ----------------------------------------------------------------------------
1278
1279 template <typename T, typename F>
min_element()1280 void min_element() {
1281
1282 tf::Executor executor;
1283 tf::Taskflow taskflow;
1284
1285 for(int N=0; N<=1234567; N += std::max(N/10, 1)) {
1286
1287 taskflow.clear();
1288
1289 auto a = tf::cuda_malloc_shared<T>(N);
1290 auto r = tf::cuda_malloc_shared<unsigned>(1);
1291 auto p = tf::cudaDefaultExecutionPolicy{};
1292 auto min = std::numeric_limits<T>::max();
1293
1294 // initialize the data
1295 for(int i=0; i<N; i++) {
1296 a[i] = rand();
1297 min = std::min(min, a[i]);
1298 }
1299 *r = 1234;
1300
1301 // ----------------- standalone asynchronous algorithms
1302
1303 tf::cudaScopedDeviceMemory<std::byte> buf(
1304 tf::cuda_min_element_buffer_size<decltype(p), T>(N)
1305 );
1306
1307 tf::cuda_min_element(
1308 p, a, a+N, r, tf::cuda_less<T>{}, buf.data()
1309 );
1310 p.synchronize();
1311
1312 if(min != std::numeric_limits<T>::max()) {
1313 REQUIRE(a[*r] == min);
1314 }
1315 else {
1316 REQUIRE(*r == N);
1317 }
1318
1319 // ----------------- cudaflow
1320 *r = 1234;
1321
1322 taskflow.emplace([&](F& cudaflow){
1323 cudaflow.min_element(a, a+N, r, tf::cuda_less<T>{});
1324 });
1325
1326 executor.run(taskflow).wait();
1327
1328 if(min != std::numeric_limits<T>::max()) {
1329 REQUIRE(a[*r] == min);
1330 }
1331 else {
1332 REQUIRE(*r == N);
1333 }
1334
1335 REQUIRE(cudaFree(a) == cudaSuccess);
1336 REQUIRE(cudaFree(r) == cudaSuccess);
1337 }
1338 }
1339
// instantiate min_element for both graph construction modes and two types
TEST_CASE("cudaflow.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlow>();
}

TEST_CASE("capturer.min_element.int" * doctest::timeout(300)) {
  min_element<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.min_element.float" * doctest::timeout(300)) {
  min_element<float, tf::cudaFlowCapturer>();
}
1355
1356 // ----------------------------------------------------------------------------
1357 // max_element
1358 // ----------------------------------------------------------------------------
1359
1360 template <typename T, typename F>
max_element()1361 void max_element() {
1362
1363 tf::Executor executor;
1364 tf::Taskflow taskflow;
1365
1366 for(int N=0; N<=1234567; N += std::max(N/10, 1)) {
1367
1368 taskflow.clear();
1369
1370 auto a = tf::cuda_malloc_shared<T>(N);
1371 auto r = tf::cuda_malloc_shared<unsigned>(1);
1372 auto p = tf::cudaDefaultExecutionPolicy{};
1373 auto max = std::numeric_limits<T>::lowest();
1374
1375 // initialize the data
1376 for(int i=0; i<N; i++) {
1377 a[i] = rand();
1378 max = std::max(max, a[i]);
1379 }
1380 *r = 1234;
1381
1382 // ----------------- standalone asynchronous algorithms
1383
1384 tf::cudaScopedDeviceMemory<std::byte> buf(
1385 tf::cuda_max_element_buffer_size<decltype(p), T>(N)
1386 );
1387
1388 tf::cuda_max_element(p, a, a+N, r, tf::cuda_less<T>{}, buf.data());
1389 p.synchronize();
1390
1391 if(max != std::numeric_limits<T>::lowest()) {
1392 REQUIRE(a[*r] == max);
1393 }
1394 else {
1395 REQUIRE(*r == N);
1396 }
1397
1398 // ----------------- cudaflow
1399 *r = 1234;
1400
1401 taskflow.emplace([&](F& cudaflow){
1402 cudaflow.max_element(a, a+N, r, tf::cuda_less<T>{});
1403 });
1404
1405 executor.run(taskflow).wait();
1406
1407 if(max != std::numeric_limits<T>::lowest()) {
1408 REQUIRE(a[*r] == max);
1409 }
1410 else {
1411 REQUIRE(*r == N);
1412 }
1413
1414 REQUIRE(cudaFree(a) == cudaSuccess);
1415 REQUIRE(cudaFree(r) == cudaSuccess);
1416 }
1417 }
1418
// instantiate max_element for both graph construction modes and two types
TEST_CASE("cudaflow.max_element.int" * doctest::timeout(300)) {
  max_element<int, tf::cudaFlow>();
}

TEST_CASE("cudaflow.max_element.float" * doctest::timeout(300)) {
  max_element<float, tf::cudaFlow>();
}

TEST_CASE("capturer.max_element.int" * doctest::timeout(300)) {
  max_element<int, tf::cudaFlowCapturer>();
}

TEST_CASE("capturer.max_element.float" * doctest::timeout(300)) {
  max_element<float, tf::cudaFlowCapturer>();
}
1434
1435 /*// --------------------------------------------------------------------------
1436 // row-major transpose
1437 // ----------------------------------------------------------------------------
1438
1439 // Disable for now - better to use cublasFlowCapturer
1440
1441 template <typename T>
1442 __global__
1443 void verify(const T* din_mat, const T* dout_mat, bool* check, size_t rows, size_t cols) {
1444
1445 size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
1446 size_t size = rows * cols;
1447 for(; tid < size; tid += gridDim.x * blockDim.x) {
1448 if(din_mat[tid] != dout_mat[tid / cols + (tid % cols) * rows]) {
1449 *check = false;
1450 return;
1451 }
1452 }
1453 }
1454
1455 template <typename T>
1456 void transpose() {
1457 tf::Executor executor;
1458
1459 for(size_t rows = 1; rows <= 7999; rows*=2+3) {
1460 for(size_t cols = 1; cols <= 8021; cols*=3+5) {
1461
1462 tf::Taskflow taskflow;
1463 std::vector<T> hinput_mat(rows * cols);
1464
1465 std::generate_n(hinput_mat.begin(), rows * cols, [](){ return ::rand(); });
1466
1467 T* dinput_mat {nullptr};
1468 T* doutput_mat {nullptr};
1469 bool* check {nullptr};
1470
1471 //allocate
1472 auto allocate = taskflow.emplace([&]() {
1473 REQUIRE(cudaMalloc(&dinput_mat, (rows * cols) * sizeof(T)) == cudaSuccess);
1474 REQUIRE(cudaMalloc(&doutput_mat, (rows * cols) * sizeof(T)) == cudaSuccess);
1475 REQUIRE(cudaMallocManaged(&check, sizeof(bool)) == cudaSuccess);
1476 *check = true;
1477 }).name("allocate");
1478
1479 //transpose
1480 auto cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
1481 auto h2d_input_t = cf.copy(dinput_mat, hinput_mat.data(), rows * cols).name("h2d");
1482
1483 auto kernel_t = tf::cudaBLAF(cf).transpose(
1484 dinput_mat,
1485 doutput_mat,
1486 rows,
1487 cols
1488 );
1489
1490 auto verify_t = cf.kernel(
1491 32,
1492 512,
1493 0,
1494 verify<T>,
1495 dinput_mat,
1496 doutput_mat,
1497 check,
1498 rows,
1499 cols
1500 );
1501
1502 h2d_input_t.precede(kernel_t);
1503 kernel_t.precede(verify_t);
1504 }).name("transpose");
1505
1506
1507 //free memory
1508 auto deallocate = taskflow.emplace([&](){
1509 REQUIRE(cudaFree(dinput_mat) == cudaSuccess);
1510 REQUIRE(cudaFree(doutput_mat) == cudaSuccess);
1511 }).name("deallocate");
1512
1513
1514 allocate.precede(cudaflow);
1515 cudaflow.precede(deallocate);
1516
1517 executor.run(taskflow).wait();
1518 REQUIRE(*check);
1519 }
1520 }
1521 }
1522
1523 TEST_CASE("transpose.int" * doctest::timeout(300) ) {
1524 transpose<int>();
1525 }
1526
1527 TEST_CASE("transpose.float" * doctest::timeout(300) ) {
1528 transpose<float>();
1529 }
1530
1531
1532 TEST_CASE("transpose.double" * doctest::timeout(300) ) {
1533 transpose<double>();
1534 }
1535
1536 // ----------------------------------------------------------------------------
1537 // row-major matrix multiplication
1538 // ----------------------------------------------------------------------------
1539
1540 template <typename T>
1541 void matmul() {
1542 tf::Taskflow taskflow;
1543 tf::Executor executor;
1544
1545 std::vector<T> a, b, c;
1546
1547 for(int m=1; m<=1992; m=2*m+1) {
1548 for(int k=1; k<=1012; k=2*k+3) {
1549 for(int n=1; n<=1998; n=2*n+8) {
1550
1551 taskflow.clear();
1552
1553 T* ha {nullptr};
1554 T* hb {nullptr};
1555 T* hc {nullptr};
1556 T* da {nullptr};
1557 T* db {nullptr};
1558 T* dc {nullptr};
1559
1560 T val_a = ::rand()%5-1;
1561 T val_b = ::rand()%7-3;
1562
1563 auto hosta = taskflow.emplace([&](){
1564 a.resize(m*k);
1565 std::fill_n(a.begin(), m*k, val_a);
1566 ha = a.data();
1567 REQUIRE(cudaMalloc(&da, m*k*sizeof(T)) == cudaSuccess);
1568 }).name("ha");
1569
1570 auto hostb = taskflow.emplace([&](){
1571 b.resize(k*n);
1572 std::fill_n(b.begin(), k*n, val_b);
1573 hb = b.data();
1574 REQUIRE(cudaMalloc(&db, k*n*sizeof(T)) == cudaSuccess);
1575 }).name("hb");
1576
1577 auto hostc = taskflow.emplace([&](){
1578 c.resize(m*n);
1579 hc = c.data();
1580 REQUIRE(cudaMalloc(&dc, m*n*sizeof(T)) == cudaSuccess);
1581 }).name("hc");
1582
1583 auto cuda = taskflow.emplace([&](tf::cudaFlow& cf){
1584 auto pa = cf.copy(da, ha, m*k);
1585 auto pb = cf.copy(db, hb, k*n);
1586
1587 auto op = tf::cudaBLAF(cf).matmul(
1588 da, db, dc, m, k, n
1589 ).name("op");
1590
1591 auto cc = cf.copy(hc, dc, m*n).name("cc");
1592
1593 op.precede(cc).succeed(pa, pb);
1594 });
1595
1596 cuda.succeed(hosta, hostb, hostc);
1597
1598 executor.run(taskflow).wait();
1599
1600 int ans = val_a*val_b*k;
1601 for(const auto& x : c) {
1602 REQUIRE((int)x == ans);
1603 }
1604
1605 REQUIRE(cudaFree(da) == cudaSuccess);
1606 REQUIRE(cudaFree(db) == cudaSuccess);
1607 REQUIRE(cudaFree(dc) == cudaSuccess);
1608 }
1609 }
1610 }
1611 }
1612
1613 TEST_CASE("matmul.int" * doctest::timeout(300) ) {
1614 matmul<int>();
1615 }
1616
1617 TEST_CASE("matmul.float" * doctest::timeout(300) ) {
1618 matmul<float>();
1619 }
1620
1621 TEST_CASE("matmul.double" * doctest::timeout(300) ) {
1622 matmul<double>();
1623 }*/
1624
1625