/* { dg-do run } */
/* { dg-additional-options "-lcuda" { target openacc_nvidia_accel_selected } } */

#include <openacc.h>
#include <stdlib.h>
#if defined ACC_DEVICE_TYPE_nvidia
#include "cuda.h"
#endif

#include <stdio.h>
#include <sys/time.h>

/* Allocate host storage for COUNT floats.  Aborts if the allocation fails
   so that later code never dereferences a null pointer.  */
static float *
alloc_floats (int count)
{
  float *p = (float *) malloc (count * sizeof (float));

  if (p == NULL)
    abort ();

  return p;
}

/* Store VALUE into each of the first COUNT elements of V.  */
static void
fill (float *v, int count, float value)
{
  int i;

  for (i = 0; i < count; i++)
    v[i] = value;
}

/* Abort unless each of the first COUNT elements of V equals EXPECTED.
   All expected values used below are small integers, so the exact
   comparison is intentional.  */
static void
expect (const float *v, int count, float expected)
{
  int i;

  for (i = 0; i < count; i++)
    if (v[i] != expected)
      abort ();
}

#if defined ACC_DEVICE_TYPE_nvidia
/* Create a non-blocking CUDA stream and hand it to the OpenACC runtime for
   async-argument ASYNC via acc_set_cuda_stream.  Aborts with a diagnostic
   if the stream cannot be created.

   NOTE(review): the stream is never destroyed by this test; presumably the
   runtime owns it once acc_set_cuda_stream has been called -- confirm
   before adding a cuStreamDestroy here.  */
static void
use_new_cuda_stream (int async)
{
  CUresult r;
  CUstream stream;

  r = cuStreamCreate (&stream, CU_STREAM_NON_BLOCKING);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (async, stream);
}
#endif

/* Exercise the OpenACC 'async' and 'wait' clauses inside 'data' regions.

   The same sequence of scenarios is run twice: first with 'parallel'
   compute constructs, then with 'kernels' constructs.  Each scenario fills
   the host arrays, runs one or more asynchronous compute regions inside a
   data region, waits for them, and then verifies the arrays on the host.
   Half-way through each sequence (NVIDIA builds only) a freshly created
   CUDA stream is registered for async-argument 1 and the scenarios are
   repeated on it.

   Returns 0 on success; any mismatch or setup failure calls abort ().  */
int
main (int argc, char **argv)
{
  int N = 128; //1024 * 1024;
  float *a, *b, *c, *d, *e;

#if defined ACC_DEVICE_TYPE_nvidia
  acc_init (acc_device_nvidia);
#endif

  a = alloc_floats (N);
  b = alloc_floats (N);
  c = alloc_floats (N);
  d = alloc_floats (N);
  e = alloc_floats (N);

  /* ---- 'parallel' constructs ------------------------------------------ */

  /* 'async' and 'wait' without arguments.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc parallel async
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait

  }

  expect (a, N, 3.0);
  expect (b, N, 3.0);

  /* Single region using async-argument 1 and a matching 'wait (1)'.  */
  fill (a, N, 2.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 2.0);
  expect (b, N, 2.0);

  /* Three regions using async-argument 1, then one 'wait (1)'.
     For a == 3: b = a^3 / a = 9, c = 4a / a = 4, d = (a^2 + a) / a - a = 1.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }


#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 3.0);
  expect (b, N, 9.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);

  /* As above, plus a fourth region carrying both 'wait (1)' and
     'async (1)' that combines the earlier results.
     For a == 2: b = 4, c = 4, d = 1, e = a + b + c + d = 11.  */
  fill (a, N, 2.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc parallel wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 2.0);
  expect (b, N, 4.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);
  expect (e, N, 11.0);

  /* From here on, async-argument 1 uses a newly created CUDA stream
     (NVIDIA builds only).  */
#if defined ACC_DEVICE_TYPE_nvidia
  use_new_cuda_stream (1);
#endif

  fill (a, N, 5.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 5.0);
  expect (b, N, 5.0);

  /* For a == 7: b = 49, c = 4, d = 1.  */
  fill (a, N, 7.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 7.0);
  expect (b, N, 49.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);

  /* For a == 3: b = 9, c = 4, d = 1, e = 17.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc parallel wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 3.0);
  expect (b, N, 9.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);
  expect (e, N, 17.0);

  /* 'copyin' only: results reach the host through an 'update host'
     directive that itself carries 'wait (1)'.
     For a == 4: b = 16, c = 4.  */
  fill (a, N, 4.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)

  }

  expect (a, N, 4.0);
  expect (b, N, 16.0);
  expect (c, N, 4.0);

  /* 'copyin' only: the 'update host' is itself 'async (1)' and is followed
     by an explicit 'wait (1)'.
     For a == 5: b = 25, c = 4.  */
  fill (a, N, 5.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc parallel async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)

#pragma acc wait (1)

  }

  expect (a, N, 5.0);
  expect (b, N, 25.0);
  expect (c, N, 4.0);

  /* ---- 'kernels' constructs: same scenarios as above ------------------ */

  /* 'async' and 'wait' without arguments.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc kernels async
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait

  }

  expect (a, N, 3.0);
  expect (b, N, 3.0);

  /* Single region using async-argument 1 and a matching 'wait (1)'.  */
  fill (a, N, 2.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 2.0);
  expect (b, N, 2.0);

  /* For a == 3: b = 9, c = 4, d = 1.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }


#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 3.0);
  expect (b, N, 9.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);

  /* For a == 2: b = 4, c = 4, d = 1, e = 11.  */
  fill (a, N, 2.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc kernels wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 2.0);
  expect (b, N, 4.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);
  expect (e, N, 11.0);

  /* Register another newly created CUDA stream for async-argument 1
     (NVIDIA builds only).  */
#if defined ACC_DEVICE_TYPE_nvidia
  use_new_cuda_stream (1);
#endif

  fill (a, N, 5.0);
  fill (b, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 5.0);
  expect (b, N, 5.0);

  /* For a == 7: b = 49, c = 4, d = 1.  */
  fill (a, N, 7.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);

#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 7.0);
  expect (b, N, 49.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);

  /* For a == 3: b = 9, c = 4, d = 1, e = 17.  */
  fill (a, N, 3.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
    }

#pragma acc kernels wait (1) async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
    }

#pragma acc wait (1)

  }

  expect (a, N, 3.0);
  expect (b, N, 9.0);
  expect (c, N, 4.0);
  expect (d, N, 1.0);
  expect (e, N, 17.0);

  /* 'copyin' only, results fetched by 'update host ... wait (1)'.
     For a == 4: b = 16, c = 4.  */
  fill (a, N, 4.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)

  }

  expect (a, N, 4.0);
  expect (b, N, 16.0);
  expect (c, N, 4.0);

  /* 'copyin' only, results fetched by 'update host ... async (1)' followed
     by 'wait (1)'.  For a == 5: b = 25, c = 4.  */
  fill (a, N, 5.0);
  fill (b, N, 0.0);
  fill (c, N, 0.0);
  fill (d, N, 0.0);
  fill (e, N, 0.0);

#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
  {

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
    }

#pragma acc kernels async (1)
    {
      int ii;

      for (ii = 0; ii < N; ii++)
        c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
    }

#pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)

#pragma acc wait (1)

  }

  expect (a, N, 5.0);
  expect (b, N, 25.0);
  expect (c, N, 4.0);

#if defined ACC_DEVICE_TYPE_nvidia
  acc_shutdown (acc_device_nvidia);
#endif

  /* All data regions have ended, so the host buffers can be released.  */
  free (a);
  free (b);
  free (c);
  free (d);
  free (e);

  return 0;
}
911