// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN:  -triple powerpc64le-unknown-unknown -DCUDA \
// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
// RUN:  %t-ppc-host.bc

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN:  -triple nvptx64-unknown-unknown -DCUDA \
// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s \
// RUN:  -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN:  -o - | FileCheck %s --check-prefix CHECK

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN:   -triple powerpc64le-unknown-unknown -DDIAG \
// RUN:   -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
// RUN:   %s -o - | FileCheck %s \
// RUN:   --check-prefix=CHECK1

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN:   -triple i386-unknown-unknown \
// RUN:   -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
// RUN:   %s -o - | FileCheck %s \
// RUN:   --check-prefix=CHECK2

#if defined(CUDA)
// expected-no-diagnostics

int foo(int n) {
  double *e;
  // no error and no implicit map is generated for e[:1]
  #pragma omp target parallel reduction(+: e[:1])
    *e = 10;
  return 0;
}
// CHECK-NOT: @.offload_maptypes
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
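// In CUDA mode the reduction over the array section e[:1] does not produce an
// implicit map entry, so no @.offload_maptypes global is expected before the
// device-side reduction finalization call checked above.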
#elif defined(DIAG)
class S2 {
  mutable int a;
public:
  S2():a(0) { }
  S2(S2 &s2):a(s2.a) { }
  S2 &operator +(S2 &s);
};
int bar() {
  S2 o[5];
  // warning "not trivially copyable and not guaranteed to be mapped correctly"
  // is expected and an implicit map is generated.
#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
  for (int i = 0; i < 10; i++);
  double b[10][10][10];
  // no error; an implicit map is generated for the whole array b, but not for
  // the section b[0:2][2:4][1].
#pragma omp target parallel for reduction(task, +: b[0:2][2:4][1])
  for (long long i = 0; i < 10; ++i);
  return 0;
}
// map for variable o:
// CHECK1: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
// CHECK1: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
// map for b:
// CHECK1: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8000]
// CHECK1: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
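// A note on the expected constants above (assuming the standard libomptarget
// map-type bits): 4 is sizeof(S2) for the single element o[0], 8000 is
// 10*10*10*sizeof(double) for the whole array b, and 547 = 0x223 =
// OMP_MAP_TO | OMP_MAP_FROM | OMP_MAP_TARGET_PARAM | OMP_MAP_IMPLICIT.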
#else
// expected-no-diagnostics

// An implicit map is generated for array elements or array sections that
// appear in a reduction clause. In the following cases the implicit map is
// generated for output[0] with map size 4 and for output[:3] with map size 12.
void sum(int* input, int size, int* output)
{
#pragma omp target teams distribute parallel for reduction(+: output[0]) \
                                                 map(to: input[0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
#pragma omp target teams distribute parallel for reduction(+: output[:3]) \
                                                 map(to: input[0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
  int a[10];
#pragma omp target parallel reduction(+: a[:2])
  for (int i = 0; i < size; i++)
    ;
#pragma omp target parallel reduction(+: a[3])
  for (int i = 0; i < size; i++)
    ;
}
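// The expected sizes below follow from the i386 target: output[0] is a single
// 4-byte int and output[:3] covers 3 * sizeof(int) = 12 bytes, which is what
// the stores into the offload-size arrays check further down.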
//CHECK2: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
//CHECK2: @.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: @.offload_sizes.13 = private unnamed_addr constant [2 x i64] [i64 4, i64 4]
//CHECK2: @.offload_maptypes.14 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: define dso_local void @_Z3sumPiiS_
//CHECK2-NEXT: entry
//CHECK2-NEXT: [[INP:%.*]] = alloca i32*
//CHECK2-NEXT: [[SIZE:%.*]] = alloca i32
//CHECK2-NEXT: [[OUTP:%.*]] = alloca i32*
//CHECK2:      [[OFFSIZE:%.*]] = alloca [3 x i64]
//CHECK2:      [[OFFSIZE10:%.*]] = alloca [3 x i64]
//CHECK2:      [[T15:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T15]]
//CHECK2:      [[T21:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 1
//CHECK2-NEXT: store i64 4, i64* [[T21]]
//CHECK2:      [[T53:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T53]]
//CHECK2:      [[T59:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 1
//CHECK2-NEXT: store i64 12, i64* [[T59]]
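// For reference only (not compiled as part of this test), the implicit map
// verified above behaves roughly as if the reduction list item also appeared
// in a tofrom map clause, e.g. for the second loop in sum():
//
//   #pragma omp target teams distribute parallel for reduction(+: output[:3]) \
//       map(tofrom: output[:3]) map(to: input[0:size])
//
// Assuming the standard libomptarget map-type bits, the value 800 = 0x320
// corresponds to OMP_MAP_LITERAL | OMP_MAP_TARGET_PARAM | OMP_MAP_IMPLICIT,
// i.e. an implicitly captured by-value scalar.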
#endif
int main()
{
#if defined(CUDA)
  int a = foo(10);
#elif defined(DIAG)
  int a = bar();
#else
  const int size = 100;
  int *array = new int[size];
  int result = 0;
  sum(array, size, &result);
#endif
  return 0;
}