1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* <http://news.gmane.org/find-root.php?message_id=%3C87pp0aaksc.fsf%40kepler.schwinge.homeip.net%3E>.
3 { dg-xfail-run-if "TODO" { *-*-* } } */
4 /* { dg-additional-options "-lcuda" } */
5
6 #include <openacc.h>
7 #include <stdlib.h>
8 #include "cuda.h"
9
10 #include <stdio.h>
11 #include <sys/time.h>
12
13 int
main(int argc,char ** argv)14 main (int argc, char **argv)
15 {
16 CUresult r;
17 CUstream stream1;
18 int N = 128; //1024 * 1024;
19 float *a, *b, *c, *d, *e;
20 int i;
21 int nbytes;
22
23 acc_init (acc_device_nvidia);
24
25 nbytes = N * sizeof (float);
26
27 a = (float *) malloc (nbytes);
28 b = (float *) malloc (nbytes);
29 c = (float *) malloc (nbytes);
30 d = (float *) malloc (nbytes);
31 e = (float *) malloc (nbytes);
32
33 for (i = 0; i < N; i++)
34 {
35 a[i] = 3.0;
36 b[i] = 0.0;
37 }
38
39 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
40 {
41
42 #pragma acc parallel async
43 {
44 int ii;
45
46 for (ii = 0; ii < N; ii++)
47 b[ii] = a[ii];
48 }
49
50 #pragma acc wait
51
52 }
53
54 for (i = 0; i < N; i++)
55 {
56 if (a[i] != 3.0)
57 abort ();
58
59 if (b[i] != 3.0)
60 abort ();
61 }
62
63 for (i = 0; i < N; i++)
64 {
65 a[i] = 2.0;
66 b[i] = 0.0;
67 }
68
69 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
70 {
71
72 #pragma acc parallel async (1)
73 {
74 int ii;
75
76 for (ii = 0; ii < N; ii++)
77 b[ii] = a[ii];
78 }
79
80 #pragma acc wait (1)
81
82 }
83
84 for (i = 0; i < N; i++)
85 {
86 if (a[i] != 2.0)
87 abort ();
88
89 if (b[i] != 2.0)
90 abort ();
91 }
92
93 for (i = 0; i < N; i++)
94 {
95 a[i] = 3.0;
96 b[i] = 0.0;
97 c[i] = 0.0;
98 d[i] = 0.0;
99 }
100
101 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
102 {
103
104 #pragma acc parallel async (1)
105 {
106 int ii;
107
108 for (ii = 0; ii < N; ii++)
109 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
110 }
111
112 #pragma acc parallel async (1)
113 {
114 int ii;
115
116 for (ii = 0; ii < N; ii++)
117 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
118 }
119
120
121 #pragma acc parallel async (1)
122 {
123 int ii;
124
125 for (ii = 0; ii < N; ii++)
126 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
127 }
128
129 #pragma acc wait (1)
130
131 }
132
133 for (i = 0; i < N; i++)
134 {
135 if (a[i] != 3.0)
136 abort ();
137
138 if (b[i] != 9.0)
139 abort ();
140
141 if (c[i] != 4.0)
142 abort ();
143
144 if (d[i] != 1.0)
145 abort ();
146 }
147
148 for (i = 0; i < N; i++)
149 {
150 a[i] = 2.0;
151 b[i] = 0.0;
152 c[i] = 0.0;
153 d[i] = 0.0;
154 e[i] = 0.0;
155 }
156
157 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
158 {
159
160 #pragma acc parallel async (1)
161 {
162 int ii;
163
164 for (ii = 0; ii < N; ii++)
165 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
166 }
167
168 #pragma acc parallel async (1)
169 {
170 int ii;
171
172 for (ii = 0; ii < N; ii++)
173 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
174 }
175
176 #pragma acc parallel async (1)
177 {
178 int ii;
179
180 for (ii = 0; ii < N; ii++)
181 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
182 }
183
184 #pragma acc parallel wait (1) async (1)
185 {
186 int ii;
187
188 for (ii = 0; ii < N; ii++)
189 e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
190 }
191
192 #pragma acc wait (1)
193
194 }
195
196 for (i = 0; i < N; i++)
197 {
198 if (a[i] != 2.0)
199 abort ();
200
201 if (b[i] != 4.0)
202 abort ();
203
204 if (c[i] != 4.0)
205 abort ();
206
207 if (d[i] != 1.0)
208 abort ();
209
210 if (e[i] != 11.0)
211 abort ();
212 }
213
214
215 r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
216 if (r != CUDA_SUCCESS)
217 {
218 fprintf (stderr, "cuStreamCreate failed: %d\n", r);
219 abort ();
220 }
221
222 acc_set_cuda_stream (1, stream1);
223
224 for (i = 0; i < N; i++)
225 {
226 a[i] = 5.0;
227 b[i] = 0.0;
228 }
229
230 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
231 {
232
233 #pragma acc parallel async (1)
234 {
235 int ii;
236
237 for (ii = 0; ii < N; ii++)
238 b[ii] = a[ii];
239 }
240
241 #pragma acc wait (1)
242
243 }
244
245 for (i = 0; i < N; i++)
246 {
247 if (a[i] != 5.0)
248 abort ();
249
250 if (b[i] != 5.0)
251 abort ();
252 }
253
254 for (i = 0; i < N; i++)
255 {
256 a[i] = 7.0;
257 b[i] = 0.0;
258 c[i] = 0.0;
259 d[i] = 0.0;
260 }
261
262 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
263 {
264
265 #pragma acc parallel async (1)
266 {
267 int ii;
268
269 for (ii = 0; ii < N; ii++)
270 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
271 }
272
273 #pragma acc parallel async (1)
274 {
275 int ii;
276
277 for (ii = 0; ii < N; ii++)
278 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
279 }
280
281 #pragma acc parallel async (1)
282 {
283 int ii;
284
285 for (ii = 0; ii < N; ii++)
286 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
287 }
288
289 #pragma acc wait (1)
290
291 }
292
293 for (i = 0; i < N; i++)
294 {
295 if (a[i] != 7.0)
296 abort ();
297
298 if (b[i] != 49.0)
299 abort ();
300
301 if (c[i] != 4.0)
302 abort ();
303
304 if (d[i] != 1.0)
305 abort ();
306 }
307
308 for (i = 0; i < N; i++)
309 {
310 a[i] = 3.0;
311 b[i] = 0.0;
312 c[i] = 0.0;
313 d[i] = 0.0;
314 e[i] = 0.0;
315 }
316
317 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
318 {
319
320 #pragma acc parallel async (1)
321 {
322 int ii;
323
324 for (ii = 0; ii < N; ii++)
325 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
326 }
327
328 #pragma acc parallel async (1)
329 {
330 int ii;
331
332 for (ii = 0; ii < N; ii++)
333 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
334 }
335
336 #pragma acc parallel async (1)
337 {
338 int ii;
339
340 for (ii = 0; ii < N; ii++)
341 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
342 }
343
344 #pragma acc parallel wait (1) async (1)
345 {
346 int ii;
347
348 for (ii = 0; ii < N; ii++)
349 e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
350 }
351
352 #pragma acc wait (1)
353
354 }
355
356 for (i = 0; i < N; i++)
357 {
358 if (a[i] != 3.0)
359 abort ();
360
361 if (b[i] != 9.0)
362 abort ();
363
364 if (c[i] != 4.0)
365 abort ();
366
367 if (d[i] != 1.0)
368 abort ();
369
370 if (e[i] != 17.0)
371 abort ();
372 }
373
374 for (i = 0; i < N; i++)
375 {
376 a[i] = 4.0;
377 b[i] = 0.0;
378 c[i] = 0.0;
379 d[i] = 0.0;
380 e[i] = 0.0;
381 }
382
383 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
384 {
385
386 #pragma acc parallel async (1)
387 {
388 int ii;
389
390 for (ii = 0; ii < N; ii++)
391 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
392 }
393
394 #pragma acc parallel async (1)
395 {
396 int ii;
397
398 for (ii = 0; ii < N; ii++)
399 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
400 }
401
402 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
403
404 }
405
406 for (i = 0; i < N; i++)
407 {
408 if (a[i] != 4.0)
409 abort ();
410
411 if (b[i] != 16.0)
412 abort ();
413
414 if (c[i] != 4.0)
415 abort ();
416 }
417
418
419 for (i = 0; i < N; i++)
420 {
421 a[i] = 5.0;
422 b[i] = 0.0;
423 c[i] = 0.0;
424 d[i] = 0.0;
425 e[i] = 0.0;
426 }
427
428 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
429 {
430
431 #pragma acc parallel async (1)
432 {
433 int ii;
434
435 for (ii = 0; ii < N; ii++)
436 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
437 }
438
439 #pragma acc parallel async (1)
440 {
441 int ii;
442
443 for (ii = 0; ii < N; ii++)
444 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
445 }
446
447 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
448
449 #pragma acc wait (1)
450
451 }
452
453 for (i = 0; i < N; i++)
454 {
455 if (a[i] != 5.0)
456 abort ();
457
458 if (b[i] != 25.0)
459 abort ();
460
461 if (c[i] != 4.0)
462 abort ();
463 }
464
465 for (i = 0; i < N; i++)
466 {
467 a[i] = 3.0;
468 b[i] = 0.0;
469 }
470
471 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
472 {
473
474 #pragma acc kernels async
475 {
476 int ii;
477
478 for (ii = 0; ii < N; ii++)
479 b[ii] = a[ii];
480 }
481
482 #pragma acc wait
483
484 }
485
486 for (i = 0; i < N; i++)
487 {
488 if (a[i] != 3.0)
489 abort ();
490
491 if (b[i] != 3.0)
492 abort ();
493 }
494
495 for (i = 0; i < N; i++)
496 {
497 a[i] = 2.0;
498 b[i] = 0.0;
499 }
500
501 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
502 {
503
504 #pragma acc kernels async (1)
505 {
506 int ii;
507
508 for (ii = 0; ii < N; ii++)
509 b[ii] = a[ii];
510 }
511
512 #pragma acc wait (1)
513
514 }
515
516 for (i = 0; i < N; i++)
517 {
518 if (a[i] != 2.0)
519 abort ();
520
521 if (b[i] != 2.0)
522 abort ();
523 }
524
525 for (i = 0; i < N; i++)
526 {
527 a[i] = 3.0;
528 b[i] = 0.0;
529 c[i] = 0.0;
530 d[i] = 0.0;
531 }
532
533 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
534 {
535
536 #pragma acc kernels async (1)
537 {
538 int ii;
539
540 for (ii = 0; ii < N; ii++)
541 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
542 }
543
544 #pragma acc kernels async (1)
545 {
546 int ii;
547
548 for (ii = 0; ii < N; ii++)
549 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
550 }
551
552
553 #pragma acc kernels async (1)
554 {
555 int ii;
556
557 for (ii = 0; ii < N; ii++)
558 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
559 }
560
561 #pragma acc wait (1)
562
563 }
564
565 for (i = 0; i < N; i++)
566 {
567 if (a[i] != 3.0)
568 abort ();
569
570 if (b[i] != 9.0)
571 abort ();
572
573 if (c[i] != 4.0)
574 abort ();
575
576 if (d[i] != 1.0)
577 abort ();
578 }
579
580 for (i = 0; i < N; i++)
581 {
582 a[i] = 2.0;
583 b[i] = 0.0;
584 c[i] = 0.0;
585 d[i] = 0.0;
586 e[i] = 0.0;
587 }
588
589 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
590 {
591
592 #pragma acc kernels async (1)
593 {
594 int ii;
595
596 for (ii = 0; ii < N; ii++)
597 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
598 }
599
600 #pragma acc kernels async (1)
601 {
602 int ii;
603
604 for (ii = 0; ii < N; ii++)
605 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
606 }
607
608 #pragma acc kernels async (1)
609 {
610 int ii;
611
612 for (ii = 0; ii < N; ii++)
613 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
614 }
615
616 #pragma acc kernels wait (1) async (1)
617 {
618 int ii;
619
620 for (ii = 0; ii < N; ii++)
621 e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
622 }
623
624 #pragma acc wait (1)
625
626 }
627
628 for (i = 0; i < N; i++)
629 {
630 if (a[i] != 2.0)
631 abort ();
632
633 if (b[i] != 4.0)
634 abort ();
635
636 if (c[i] != 4.0)
637 abort ();
638
639 if (d[i] != 1.0)
640 abort ();
641
642 if (e[i] != 11.0)
643 abort ();
644 }
645
646
647 r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
648 if (r != CUDA_SUCCESS)
649 {
650 fprintf (stderr, "cuStreamCreate failed: %d\n", r);
651 abort ();
652 }
653
654 acc_set_cuda_stream (1, stream1);
655
656 for (i = 0; i < N; i++)
657 {
658 a[i] = 5.0;
659 b[i] = 0.0;
660 }
661
662 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
663 {
664
665 #pragma acc kernels async (1)
666 {
667 int ii;
668
669 for (ii = 0; ii < N; ii++)
670 b[ii] = a[ii];
671 }
672
673 #pragma acc wait (1)
674
675 }
676
677 for (i = 0; i < N; i++)
678 {
679 if (a[i] != 5.0)
680 abort ();
681
682 if (b[i] != 5.0)
683 abort ();
684 }
685
686 for (i = 0; i < N; i++)
687 {
688 a[i] = 7.0;
689 b[i] = 0.0;
690 c[i] = 0.0;
691 d[i] = 0.0;
692 }
693
694 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
695 {
696
697 #pragma acc kernels async (1)
698 {
699 int ii;
700
701 for (ii = 0; ii < N; ii++)
702 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
703 }
704
705 #pragma acc kernels async (1)
706 {
707 int ii;
708
709 for (ii = 0; ii < N; ii++)
710 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
711 }
712
713 #pragma acc kernels async (1)
714 {
715 int ii;
716
717 for (ii = 0; ii < N; ii++)
718 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
719 }
720
721 #pragma acc wait (1)
722
723 }
724
725 for (i = 0; i < N; i++)
726 {
727 if (a[i] != 7.0)
728 abort ();
729
730 if (b[i] != 49.0)
731 abort ();
732
733 if (c[i] != 4.0)
734 abort ();
735
736 if (d[i] != 1.0)
737 abort ();
738 }
739
740 for (i = 0; i < N; i++)
741 {
742 a[i] = 3.0;
743 b[i] = 0.0;
744 c[i] = 0.0;
745 d[i] = 0.0;
746 e[i] = 0.0;
747 }
748
749 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
750 {
751
752 #pragma acc kernels async (1)
753 {
754 int ii;
755
756 for (ii = 0; ii < N; ii++)
757 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
758 }
759
760 #pragma acc kernels async (1)
761 {
762 int ii;
763
764 for (ii = 0; ii < N; ii++)
765 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
766 }
767
768 #pragma acc kernels async (1)
769 {
770 int ii;
771
772 for (ii = 0; ii < N; ii++)
773 d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
774 }
775
776 #pragma acc kernels wait (1) async (1)
777 {
778 int ii;
779
780 for (ii = 0; ii < N; ii++)
781 e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
782 }
783
784 #pragma acc wait (1)
785
786 }
787
788 for (i = 0; i < N; i++)
789 {
790 if (a[i] != 3.0)
791 abort ();
792
793 if (b[i] != 9.0)
794 abort ();
795
796 if (c[i] != 4.0)
797 abort ();
798
799 if (d[i] != 1.0)
800 abort ();
801
802 if (e[i] != 17.0)
803 abort ();
804 }
805
806 for (i = 0; i < N; i++)
807 {
808 a[i] = 4.0;
809 b[i] = 0.0;
810 c[i] = 0.0;
811 d[i] = 0.0;
812 e[i] = 0.0;
813 }
814
815 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
816 {
817
818 #pragma acc kernels async (1)
819 {
820 int ii;
821
822 for (ii = 0; ii < N; ii++)
823 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
824 }
825
826 #pragma acc kernels async (1)
827 {
828 int ii;
829
830 for (ii = 0; ii < N; ii++)
831 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
832 }
833
834 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
835
836 }
837
838 for (i = 0; i < N; i++)
839 {
840 if (a[i] != 4.0)
841 abort ();
842
843 if (b[i] != 16.0)
844 abort ();
845
846 if (c[i] != 4.0)
847 abort ();
848 }
849
850
851 for (i = 0; i < N; i++)
852 {
853 a[i] = 5.0;
854 b[i] = 0.0;
855 c[i] = 0.0;
856 d[i] = 0.0;
857 e[i] = 0.0;
858 }
859
860 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
861 {
862
863 #pragma acc kernels async (1)
864 {
865 int ii;
866
867 for (ii = 0; ii < N; ii++)
868 b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
869 }
870
871 #pragma acc kernels async (1)
872 {
873 int ii;
874
875 for (ii = 0; ii < N; ii++)
876 c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
877 }
878
879 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
880
881 #pragma acc wait (1)
882
883 }
884
885 for (i = 0; i < N; i++)
886 {
887 if (a[i] != 5.0)
888 abort ();
889
890 if (b[i] != 25.0)
891 abort ();
892
893 if (c[i] != 4.0)
894 abort ();
895 }
896
897 acc_shutdown (acc_device_nvidia);
898
899 return 0;
900 }
901