/* { dg-do run } */
/* { dg-additional-options "-DUSE_CUDA_H" { target openacc_cuda } } */
/* { dg-additional-options "-lcuda" { target { openacc_nvidia_accel_selected && openacc_cuda } } } */

#include <openacc.h>
#include <stdlib.h>
#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
#include "cuda.h"
#endif

#include <stdio.h>
#include <sys/time.h>

/* Store VALUE into each of the first COUNT elements of BUF.  */

static void
fill_array (float *buf, int count, float value)
{
  int idx;

  for (idx = 0; idx < count; idx++)
    buf[idx] = value;
}

/* Abort unless every one of the first COUNT elements of BUF equals
   EXPECTED.  All expected values used by this test are small integers,
   so an exact comparison is intended.  */

static void
check_array (const float *buf, int count, float expected)
{
  int idx;

  for (idx = 0; idx < count; idx++)
    if (buf[idx] != expected)
      abort ();
}

/* Return NBYTES of heap storage to be used as a float array; abort if
   the allocation fails so that later stages never touch a null
   pointer.  */

static float *
alloc_array (int nbytes)
{
  float *buf = (float *) malloc (nbytes);

  if (buf == NULL)
    abort ();

  return buf;
}

/* Exercise the OpenACC 'async' and 'wait' clauses on 'parallel',
   'kernels' and 'update' constructs nested in 'data' regions.

   Every stage follows the same pattern: initialise the host arrays, run
   one or more compute regions inside a data region, wait for the
   asynchronous work, then verify the host copies.  The first half uses
   'parallel' constructs, the second half repeats the same stages with
   'kernels' constructs.  When built for an NVIDIA device with cuda.h
   available, some stages are additionally run after a CUDA stream
   created by the test has been passed to acc_set_cuda_stream for
   async-argument 1.

   Returns 0 on success; any mismatch or setup failure calls abort.  */

int
main (int argc, char **argv)
{
#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
  CUresult r;
  CUstream stream1;
#endif
  int N = 128; //1024 * 1024;
  float *a, *b, *c, *d, *e;
  int nbytes;

#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
  acc_init (acc_device_nvidia);
#endif

  nbytes = N * sizeof (float);

  a = alloc_array (nbytes);
  b = alloc_array (nbytes);
  c = alloc_array (nbytes);
  d = alloc_array (nbytes);
  e = alloc_array (nbytes);

  /* ---- 'parallel' constructs ---------------------------------------  */

  /* 'async' and 'wait' without arguments.  */
  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc parallel async
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 3.0f);

  /* Explicit async-argument 1.  */
  fill_array (a, N, 2.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 2.0f);
  check_array (b, N, 2.0f);

  /* Three compute regions queued with the same async-argument.  */
  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }


#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 9.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);

  /* A fourth region carrying both 'wait (1)' and 'async (1)' consumes
     the results of the three regions queued before it.  */
  fill_array (a, N, 2.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc parallel wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 2.0f);
  check_array (b, N, 4.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);
  check_array (e, N, 11.0f);


  /* Create a CUDA stream and hand it to the OpenACC runtime for
     async-argument 1, then repeat the stages.  */
#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
  r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (1, stream1);
#endif

  fill_array (a, N, 5.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 5.0f);
  check_array (b, N, 5.0f);

  fill_array (a, N, 7.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 7.0f);
  check_array (b, N, 49.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);

  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc parallel wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 9.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);
  check_array (e, N, 17.0f);

  /* The data region only copies in, so results reach the host solely
     through 'update host'; here the update carries 'wait (1)'.  */
  fill_array (a, N, 4.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)

  }

  check_array (a, N, 4.0f);
  check_array (b, N, 16.0f);
  check_array (c, N, 4.0f);


  /* Same again, but the update itself is 'async (1)' and is followed by
     an explicit 'wait (1)'.  */
  fill_array (a, N, 5.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)

#pragma acc wait (1)

  }

  check_array (a, N, 5.0f);
  check_array (b, N, 25.0f);
  check_array (c, N, 4.0f);

  /* ---- 'kernels' constructs ----------------------------------------  */

  /* 'async' and 'wait' without arguments.  */
  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc kernels async
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 3.0f);

  /* Explicit async-argument 1.  */
  fill_array (a, N, 2.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 2.0f);
  check_array (b, N, 2.0f);

  /* Three compute regions queued with the same async-argument.  */
  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }


#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 9.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);

  /* A fourth region carrying both 'wait (1)' and 'async (1)' consumes
     the results of the three regions queued before it.  */
  fill_array (a, N, 2.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc kernels wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 2.0f);
  check_array (b, N, 4.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);
  check_array (e, N, 11.0f);


  /* A second stream is created and handed to the runtime for
     async-argument 1.  stream1 is overwritten here; the stream created
     earlier is not destroyed by this test.  */
#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
  r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (1, stream1);
#endif

  fill_array (a, N, 5.0f);
  fill_array (b, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 5.0f);
  check_array (b, N, 5.0f);

  fill_array (a, N, 7.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 7.0f);
  check_array (b, N, 49.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);

  fill_array (a, N, 3.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc kernels wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  check_array (a, N, 3.0f);
  check_array (b, N, 9.0f);
  check_array (c, N, 4.0f);
  check_array (d, N, 1.0f);
  check_array (e, N, 17.0f);

  /* The data region only copies in, so results reach the host solely
     through 'update host'; here the update carries 'wait (1)'.  */
  fill_array (a, N, 4.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)

  }

  check_array (a, N, 4.0f);
  check_array (b, N, 16.0f);
  check_array (c, N, 4.0f);


  /* Same again, but the update itself is 'async (1)' and is followed by
     an explicit 'wait (1)'.  */
  fill_array (a, N, 5.0f);
  fill_array (b, N, 0.0f);
  fill_array (c, N, 0.0f);
  fill_array (d, N, 0.0f);
  fill_array (e, N, 0.0f);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)

#pragma acc wait (1)

  }

  check_array (a, N, 5.0f);
  check_array (b, N, 25.0f);
  check_array (c, N, 4.0f);

#if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
  acc_shutdown (acc_device_nvidia);
#endif

  /* Release the host buffers; the original test leaked them.  */
  free (a);
  free (b);
  free (c);
  free (d);
  free (e);

  return 0;
}