// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]

// Check that the execution mode of all 3 target regions is set to SPMD mode.
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0

template<typename tx>
tx ftemplate(int n) {
  int a;
  short b;
  tx c;
  float d;
  double e;

  #pragma omp target parallel reduction(+: e)
  {
    e += 5;
  }

  #pragma omp target parallel reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

  #pragma omp target parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }

  return a+b+c+d+e;
}

int bar(int n){
  int a = 0;

  a += ftemplate<char>(n);

  return a;
}

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
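//
// A rough sketch of the SPMD kernel skeleton matched above (pseudocode
// only; argument names are illustrative, not the real IR values):
//
//   kernel() {
//     __kmpc_spmd_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//     __kmpc_data_sharing_init_stack_spmd();
//     [[PFN]](...);   // the outlined parallel region
//     __kmpc_spmd_kernel_deinit_v2(/*RequiresOMPRuntime=*/1);
//   }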
//
//
// CHECK: define internal void [[PFN]](
// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
// CHECK: store double [[ADD]], double* [[E]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label

// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
// CHECK: store double [[ADD]], double* [[E_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret
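//
// The sequence above is the usual nowait-reduction protocol; a sketch
// (names illustrative; the size argument is the byte size of the
// reduction list of pointers, hence {{4|8}} for one element):
//
//   if (__kmpc_nvptx_parallel_reduce_nowait_v2(loc, tid, /*nvars=*/1,
//           sizeof(red_list), red_list, shuffle_reduce_fn,
//           inter_warp_copy_fn) == 1) {
//     e_orig += e;   // winning thread folds into the original variable
//     __kmpc_nvptx_end_reduce_nowait(tid);
//   }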

//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
//
// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
//
// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
// CHECK: store double [[RES]], double* [[VAR_LHS]],
// CHECK: ret void
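//
// Semantically the reduction function is just (illustrative pseudocode;
// both parameters are opaque i8* reduction lists):
//
//   void reduction_fn(void **lhs_list, void **rhs_list) {
//     *(double *)lhs_list[0] += *(double *)rhs_list[0];
//   }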

//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT:%.+]] = alloca double
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
//
// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: ret void
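//
// The shuffle-and-reduce function is called as (red_list, lane_id,
// lane_offset, algo_version); the "condition to reduce" matched above
// encodes the three reduction modes (a summary; the names follow the
// LLVM codegen comments):
//
//   version 0: full warp reduce - every lane reduces its remote value;
//   version 1: contiguous partial warp reduce - lanes with
//              lane_id < lane_offset reduce, the remaining lanes take
//              the SHOULD_COPY path and adopt the remote element;
//   version 2: dyadic partial warp reduce - even lanes reduce while
//              lane_offset > 0.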

//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
// CHECK: br label
// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
// CHECK: br i1 [[DONE_COPY]], label
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
// CHECK: br label
// CHECK: ret
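//
// The loop bound of 2 above exists because the 64-bit double travels
// through the 32-bit shared-memory medium in two i32 chunks; roughly
// (a sketch, not the literal IR):
//
//   for (cnt = 0; cnt < 2; ++cnt) {
//     barrier();
//     if (lane_id == 0)                  // warp master publishes a chunk
//       medium[warp_id] = ((int *)elt)[cnt];
//     barrier();
//     if (tid < num_active_warps)        // warp 0 gathers the chunks
//       ((int *)elt)[cnt] = medium[tid];
//   }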

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
//
//
// CHECK: define internal void [[PFN1]](
// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[TRUNC]], i8* [[C]], align
// CHECK: [[DV:%.+]] = load float, float* [[D]], align
// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
// CHECK: store float [[MUL]], float* [[D]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: store i8* [[C]], i8** [[PTR1]], align
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label
// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
// CHECK: [[DV:%.+]] = load float, float* [[D]], align
// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
// CHECK: store float [[MUL]], float* [[D_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret

//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
//
// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
//
// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
//
// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
//
// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
//
// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
// CHECK: store float [[RES]], float* [[VAR2_LHS]],
// CHECK: ret void
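//
// Element-wise, the function above computes (illustrative pseudocode):
//
//   lhs_c = (char)((int)lhs_c ^ (int)rhs_c);   // reduction(^: c)
//   lhs_d = lhs_d * rhs_d;                     // reduction(*: d)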

//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
//
// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
//
// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: ret void
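//
// Note the two shuffle flavours above: the i8 element is sign-extended
// to i32, moved with __kmpc_shuffle_int32 and truncated back, while the
// float is reinterpreted as i32 and shuffled as-is. Roughly (a sketch):
//
//   remote_c = (char)__kmpc_shuffle_int32((int)c, lane_offset, warpsize);
//   remote_d = bit_cast<float>(__kmpc_shuffle_int32(bit_cast<int>(d),
//                                                   lane_offset, warpsize));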

//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret
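//
// With two reduction variables the publish/consume cycle above runs once
// per reduction-list element, separated by barriers; the i8 element is
// staged through the 32-bit shared slot via an addrspace bitcast.
// Roughly (a sketch):
//
//   for each element E in red_list:
//     barrier(); if (lane_id == 0)       medium[warp_id] = E;
//     barrier(); if (tid < active_warps) E = medium[tid];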

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
//
//
// CHECK: define internal void [[PFN2]](
// CHECK: store i32 0, i32* [[A:%.+]], align
// CHECK: store i16 -32768, i16* [[B:%.+]], align
// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
// CHECK: store i32 [[OR]], i32* [[A]], align
// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
// CHECK: store i16 [[TRUNC]], i16* [[B]], align
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[CMP]], label

// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
// CHECK: store i32 [[OR]], i32* [[A_IN]], align
// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: br label
//
// CHECK: ret
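//
// There is no single max instruction at this level; the checks above
// match the compare/branch/phi pattern Clang emits for reduction(max:),
// which is equivalent to (illustrative):
//
//   b_orig = ((int)b_orig > (int)b) ? b_orig : b;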

//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
//
// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
//
// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
//
// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
//
// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
//
// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
//
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
// CHECK: ret void

//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
//
// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: ret void

//
// Inter warp copy function
// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret
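//
// Same cycle once more for the i16 element: it is staged through the
// 32 x i32 shared medium via a bitcast of the i32 slot to an i16 pointer
// in the shared address space (addrspace 3 on NVPTX), so only the low
// 16 bits of the slot carry the payload.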

#endif